diff --git a/.ci/pnnx.yml b/.ci/pnnx.yml index 267a0afa289b..3f116a4fa2e4 100644 --- a/.ci/pnnx.yml +++ b/.ci/pnnx.yml @@ -48,6 +48,10 @@ jobs: torchvision-version: 0.15.1 torchvision-cache-key: '0_15_1' + - torch-version: 2.1.0 + torchvision-version: 0.16.0 + torchvision-cache-key: '0_16_0' + runs-on: pool-name: docker container: diff --git a/.github/labeler.yml b/.github/labeler.yml new file mode 100644 index 000000000000..31d03ede341c --- /dev/null +++ b/.github/labeler.yml @@ -0,0 +1,24 @@ +cmake: +- cmake/** +- toolchains/** + +doc: docs/** + +python: python/** + +example: examples/** + +test: tests/** + +tool: tools/** +pnnx: tools/pnnx/** + +core: src/* +layer: src/layer/* + +arm: src/layer/arm/** +loongarch: src/layer/loongarch/** +mips: src/layer/mips/** +riscv: src/layer/riscv/** +vulkan: src/layer/vulkan/** +x86: src/layer/x86/** diff --git a/.github/workflows/code-format.yml b/.github/workflows/code-format.yml index 9ffb978181d1..8051371d3e51 100644 --- a/.github/workflows/code-format.yml +++ b/.github/workflows/code-format.yml @@ -51,7 +51,7 @@ jobs: rm -rf $GITHUB_WORKSPACE/clang-format-install export PATH=~/bin:$PATH sh codeformat.sh - - uses: stefanzweifel/git-auto-commit-action@v4 + - uses: stefanzweifel/git-auto-commit-action@v5 with: commit_message: apply code-format changes diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml new file mode 100644 index 000000000000..889c41b54c97 --- /dev/null +++ b/.github/workflows/labeler.yml @@ -0,0 +1,12 @@ +name: labeler +on: [pull_request_target] + +permissions: + contents: read + pull-requests: write + +jobs: + label: + runs-on: ubuntu-latest + steps: + - uses: actions/labeler@v4 diff --git a/.github/workflows/release-python.yml b/.github/workflows/release-python.yml index 9d99ffdd630d..da75661b0b4c 100644 --- a/.github/workflows/release-python.yml +++ b/.github/workflows/release-python.yml @@ -68,7 +68,7 @@ jobs: brew uninstall --ignore-dependencies libomp - name: Build wheels - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@v2.16.2 env: CIBW_ARCHS_MACOS: ${{ matrix.arch }} CIBW_ARCHS_LINUX: ${{ matrix.arch }} @@ -124,7 +124,7 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.15.0 + uses: pypa/cibuildwheel@v2.16.2 env: CIBW_ARCHS_LINUX: ${{ matrix.arch }} CIBW_BUILD: ${{ matrix.build }} diff --git a/benchmark/README.md b/benchmark/README.md index 83347dec97bc..211480893b8f 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -1631,7 +1631,92 @@ cooling_down = 1 vision_transformer min = 6605.19 max = 6606.66 avg = 6605.73 FastestDet min = 52.11 max = 52.97 avg = 52.61 ``` +### Raspberry Pi 5 Broadcom BCM2712, Cortex-A76 (ARMv8) (2.4GHz x 4) +``` +pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 4 0 -1 1 +loop_count = 10 +num_threads = 4 +powersave = 0 +gpu_device = -1 +cooling_down = 1 + squeezenet min = 8.56 max = 8.65 avg = 8.61 + squeezenet_int8 min = 11.65 max = 12.64 avg = 11.94 + mobilenet min = 11.32 max = 13.46 avg = 11.75 + mobilenet_int8 min = 11.30 max = 11.60 avg = 11.45 + mobilenet_v2 min = 13.57 max = 13.77 avg = 13.63 + mobilenet_v3 min = 9.18 max = 10.52 avg = 9.48 + shufflenet min = 4.56 max = 6.19 avg = 5.98 + shufflenet_v2 min = 5.04 max = 5.13 avg = 5.09 + mnasnet min = 8.27 max = 9.86 avg = 8.65 + proxylessnasnet min = 9.36 max = 11.18 avg = 9.62 + efficientnet_b0 min = 14.77 max = 14.96 avg = 14.87 + efficientnetv2_b0 min = 19.91 max = 20.11 avg = 19.99 + regnety_400m min = 11.91 max = 12.10 avg = 11.96 + blazeface min = 2.26 max = 2.29 avg = 
2.28 + googlenet min = 32.80 max = 33.17 avg = 32.97 + googlenet_int8 min = 32.63 max = 32.99 avg = 32.78 + resnet18 min = 23.95 max = 24.21 avg = 24.12 + resnet18_int8 min = 32.50 max = 32.79 avg = 32.68 + alexnet min = 25.31 max = 25.75 avg = 25.51 + vgg16 min = 162.19 max = 165.08 avg = 163.75 + vgg16_int8 min = 187.46 max = 191.21 avg = 189.09 + resnet50 min = 55.95 max = 56.61 avg = 56.29 + resnet50_int8 min = 73.34 max = 73.97 avg = 73.59 + squeezenet_ssd min = 40.48 max = 41.39 avg = 40.92 + squeezenet_ssd_int8 min = 45.67 max = 46.35 avg = 46.06 + mobilenet_ssd min = 31.15 max = 31.73 avg = 31.48 + mobilenet_ssd_int8 min = 31.09 max = 31.44 avg = 31.27 + mobilenet_yolo min = 71.51 max = 72.38 avg = 71.95 + mobilenetv2_yolov3 min = 47.86 max = 48.41 avg = 48.04 + yolov4-tiny min = 55.95 max = 56.51 avg = 56.19 + nanodet_m min = 14.26 max = 14.68 avg = 14.48 + yolo-fastest-1.1 min = 6.48 max = 8.10 avg = 7.30 + yolo-fastestv2 min = 6.03 max = 7.33 avg = 7.04 + vision_transformer min = 613.62 max = 637.97 avg = 629.51 + FastestDet min = 6.53 max = 6.66 avg = 6.59 +pi@raspberrypi:~/ncnn/benchmark $ ./benchncnn 10 1 0 -1 1 +loop_count = 10 +num_threads = 1 +powersave = 0 +gpu_device = -1 +cooling_down = 1 + squeezenet min = 13.18 max = 13.27 avg = 13.22 + squeezenet_int8 min = 15.69 max = 15.93 avg = 15.78 + mobilenet min = 21.42 max = 21.55 avg = 21.46 + mobilenet_int8 min = 14.92 max = 20.91 avg = 17.34 + mobilenet_v2 min = 18.56 max = 23.06 avg = 19.24 + mobilenet_v3 min = 13.16 max = 13.33 avg = 13.25 + shufflenet min = 7.25 max = 11.14 avg = 8.43 + shufflenet_v2 min = 7.17 max = 11.15 avg = 7.70 + mnasnet min = 13.89 max = 13.94 avg = 13.91 + proxylessnasnet min = 17.01 max = 17.26 avg = 17.07 + efficientnet_b0 min = 26.19 max = 26.30 avg = 26.24 + efficientnetv2_b0 min = 39.69 max = 40.12 avg = 39.97 + regnety_400m min = 17.30 max = 17.44 avg = 17.36 + blazeface min = 4.74 max = 4.78 avg = 4.76 + googlenet min = 57.64 max = 57.84 avg = 57.72 + googlenet_int8 min = 55.80 max = 56.01 avg = 55.93 + resnet18 min = 31.90 max = 32.09 avg = 32.00 + resnet18_int8 min = 56.92 max = 57.16 avg = 57.01 + alexnet min = 39.84 max = 40.12 avg = 39.92 + vgg16 min = 208.33 max = 211.06 avg = 209.64 + vgg16_int8 min = 437.53 max = 440.55 avg = 439.35 + resnet50 min = 95.75 max = 96.68 avg = 96.28 + resnet50_int8 min = 116.80 max = 118.01 avg = 117.57 + squeezenet_ssd min = 47.75 max = 47.97 avg = 47.86 + squeezenet_ssd_int8 min = 61.98 max = 62.90 avg = 62.47 + mobilenet_ssd min = 52.83 max = 53.39 avg = 53.07 + mobilenet_ssd_int8 min = 46.15 max = 46.60 avg = 46.35 + mobilenet_yolo min = 117.68 max = 117.97 avg = 117.81 + mobilenetv2_yolov3 min = 67.37 max = 67.67 avg = 67.48 + yolov4-tiny min = 73.85 max = 74.35 avg = 74.10 + nanodet_m min = 22.78 max = 23.33 avg = 22.96 + yolo-fastest-1.1 min = 8.82 max = 8.91 avg = 8.87 + yolo-fastestv2 min = 8.18 max = 11.42 avg = 8.59 + vision_transformer min = 1267.90 max = 1269.45 avg = 1268.82 + FastestDet min = 7.79 max = 11.14 avg = 9.03 +``` ### Raspberry Pi Zero 2 W Broadcom BCM2710A1, Cortex-A53 (ARMv8) (1.0GHz x 4) ``` @@ -5079,6 +5164,61 @@ cooling_down = 0 mobilenetv2_yolov3 min = 3.69 max = 5.14 avg = 3.91 ``` +### nVIDIA RTX A3000 of Notebook (6GB) +``` +cx@HP-ZBook-Fury-15-6-inch-G8-Mobile-Workstation-PC:~/ncnn/build/benchmark$ ./benchncnn 10 1 0 1 +[0 Intel(R) UHD Graphics (TGL GT1)] queueC=0[1] queueG=0[1] queueT=0[1] +[0 Intel(R) UHD Graphics (TGL GT1)] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[0 Intel(R) UHD Graphics (TGL GT1)] 
fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[0 Intel(R) UHD Graphics (TGL GT1)] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[0 Intel(R) UHD Graphics (TGL GT1)] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 +[1 NVIDIA RTX A3000 Laptop GPU] queueC=2[8] queueG=0[16] queueT=1[2] +[1 NVIDIA RTX A3000 Laptop GPU] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[1 NVIDIA RTX A3000 Laptop GPU] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[1 NVIDIA RTX A3000 Laptop GPU] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[1 NVIDIA RTX A3000 Laptop GPU] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 +loop_count = 10 +num_threads = 1 +powersave = 0 +gpu_device = 1 +cooling_down = 1 + squeezenet min = 1.49 max = 1.94 avg = 1.74 + squeezenet_int8 min = 6.13 max = 6.20 avg = 6.16 + mobilenet min = 4.05 max = 4.82 avg = 4.65 + mobilenet_int8 min = 10.24 max = 10.29 avg = 10.26 + mobilenet_v2 min = 0.98 max = 1.14 avg = 1.03 + mobilenet_v3 min = 1.74 max = 1.82 avg = 1.77 + shufflenet min = 1.43 max = 30.51 avg = 9.51 + shufflenet_v2 min = 3.43 max = 3.89 avg = 3.77 + mnasnet min = 6.50 max = 6.75 avg = 6.62 + proxylessnasnet min = 6.46 max = 7.28 avg = 7.00 + efficientnet_b0 min = 3.14 max = 15.11 avg = 7.29 + efficientnetv2_b0 min = 18.50 max = 20.13 avg = 19.17 + regnety_400m min = 2.16 max = 3.57 avg = 2.70 + blazeface min = 2.52 max = 2.76 avg = 2.65 + googlenet min = 2.67 max = 14.67 avg = 9.85 + googlenet_int8 min = 19.08 max = 19.40 avg = 19.19 + resnet18 min = 5.19 max = 9.44 avg = 8.48 + resnet18_int8 min = 16.57 max = 17.69 avg = 16.96 + alexnet min = 1.98 max = 3.24 avg = 2.23 + vgg16 min = 3.59 max = 12.34 avg = 10.99 + vgg16_int8 min = 110.63 max = 124.31 avg = 118.16 + resnet50 min = 3.01 max = 4.93 avg = 3.77 + resnet50_int8 min = 41.58 max = 44.80 avg = 43.24 + squeezenet_ssd min = 4.08 max = 4.70 avg = 4.32 + squeezenet_ssd_int8 min = 17.32 max = 17.92 avg = 17.46 + mobilenet_ssd min = 2.26 max = 8.23 avg = 5.57 + mobilenet_ssd_int8 min = 20.35 max = 21.89 avg = 20.76 + mobilenet_yolo min = 2.14 max = 16.94 avg = 6.44 + mobilenetv2_yolov3 min = 3.64 max = 5.09 avg = 4.02 + yolov4-tiny min = 10.94 max = 17.46 avg = 13.58 + nanodet_m min = 6.57 max = 13.91 avg = 9.82 + yolo-fastest-1.1 min = 5.40 max = 14.22 avg = 10.78 + yolo-fastestv2 min = 7.49 max = 9.43 avg = 7.99 + vision_transformer min = 76.04 max = 76.96 avg = 76.43 + FastestDet min = 6.31 max = 6.60 avg = 6.43 +``` + ### nVIDIA RTX2080 of Desktop ``` E:\projects\framework\ncnn\benchmark>benchncnn.exe 4096 1 0 0 0 @@ -6508,3 +6648,253 @@ cooling_down = 0 vision_transformer min = 600.83 max = 666.35 avg = 617.33 FastestDet min = 6.05 max = 6.72 avg = 6.23 ``` + +### AMD Ryzen 9 5950X 16-Core of Desktop[2023-10-12] +``` +E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0 +loop_count = 100 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 2.68 max = 3.10 avg = 2.77 + squeezenet_int8 min = 3.57 max = 4.72 avg = 4.04 + mobilenet min = 3.09 max = 5.44 avg = 3.38 + mobilenet_int8 min = 2.36 max = 3.40 avg = 2.74 + mobilenet_v2 min = 4.24 max = 4.81 avg = 4.40 + mobilenet_v3 min = 3.46 max = 3.93 avg = 3.58 + shufflenet min = 3.21 max = 4.54 avg = 4.01 + shufflenet_v2 min = 2.99 max = 4.49 avg = 3.34 + mnasnet min = 3.62 max = 4.31 avg = 3.83 + proxylessnasnet min = 4.06 max = 5.70 avg = 4.23 + efficientnet_b0 min = 5.60 max = 6.55 avg = 5.81 + efficientnetv2_b0 min = 6.83 max = 8.82 avg = 7.12 + regnety_400m min = 8.02 max = 9.75 avg = 8.34 + blazeface min = 1.34 max = 1.77 avg = 1.46 + googlenet min = 11.62 
max = 15.95 avg = 12.70 + googlenet_int8 min = 7.43 max = 10.06 avg = 7.92 + resnet18 min = 8.39 max = 10.39 avg = 9.04 + resnet18_int8 min = 6.23 max = 8.64 avg = 6.75 + alexnet min = 7.78 max = 12.51 avg = 8.51 + vgg16 min = 53.85 max = 63.39 avg = 56.36 + vgg16_int8 min = 35.61 max = 46.94 avg = 38.08 + resnet50 min = 18.55 max = 24.46 avg = 19.81 + resnet50_int8 min = 11.95 max = 23.21 avg = 13.51 + squeezenet_ssd min = 10.01 max = 13.16 avg = 10.69 + squeezenet_ssd_int8 min = 9.29 max = 14.02 avg = 10.47 + mobilenet_ssd min = 6.38 max = 10.26 avg = 7.15 + mobilenet_ssd_int8 min = 4.69 max = 6.98 avg = 5.42 + mobilenet_yolo min = 17.63 max = 22.59 avg = 19.45 + mobilenetv2_yolov3 min = 11.79 max = 15.67 avg = 12.76 + yolov4-tiny min = 21.53 max = 25.79 avg = 22.46 + nanodet_m min = 7.16 max = 9.99 avg = 8.01 + yolo-fastest-1.1 min = 3.66 max = 5.00 avg = 4.38 + yolo-fastestv2 min = 3.52 max = 5.20 avg = 4.60 + vision_transformer min = 67.01 max = 93.71 avg = 78.48 + FastestDet min = 4.44 max = 8.62 avg = 4.69 +``` + +### AMD Radeon RX 6900 XT of Desktop[2023-10-12] +``` +E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0 +[0 AMD Radeon RX 6900 XT] queueC=1[2] queueG=0[1] queueT=2[2] +[0 AMD Radeon RX 6900 XT] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[0 AMD Radeon RX 6900 XT] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[0 AMD Radeon RX 6900 XT] subgroup=64 basic/vote/ballot/shuffle=1/1/1/1 +[0 AMD Radeon RX 6900 XT] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 +loop_count = 100 +num_threads = 16 +powersave = 0 +gpu_device = 0 +cooling_down = 0 + squeezenet min = 2.19 max = 2.70 avg = 2.47 + squeezenet_int8 min = 3.94 max = 4.51 avg = 4.18 + mobilenet min = 2.03 max = 2.63 avg = 2.28 + mobilenet_int8 min = 2.56 max = 3.34 avg = 2.69 + mobilenet_v2 min = 2.29 max = 2.98 avg = 2.62 + mobilenet_v3 min = 2.31 max = 3.10 avg = 2.75 + shufflenet min = 1.89 max = 2.61 avg = 2.30 + shufflenet_v2 min = 2.17 max = 3.04 avg = 2.59 + mnasnet min = 2.19 max = 2.98 avg = 2.69 + proxylessnasnet min = 2.12 max = 4.08 avg = 2.62 + efficientnet_b0 min = 3.62 max = 5.27 avg = 4.21 + efficientnetv2_b0 min = 6.09 max = 7.15 avg = 6.49 + regnety_400m min = 2.55 max = 3.82 avg = 3.00 + blazeface min = 1.93 max = 2.56 avg = 2.28 + googlenet min = 3.35 max = 4.46 avg = 3.75 + googlenet_int8 min = 8.02 max = 12.84 avg = 9.15 + resnet18 min = 2.46 max = 3.14 avg = 2.84 + resnet18_int8 min = 6.37 max = 9.15 avg = 7.30 + alexnet min = 2.31 max = 2.91 avg = 2.69 + vgg16 min = 4.76 max = 5.79 avg = 5.24 + vgg16_int8 min = 35.94 max = 46.27 avg = 39.05 + resnet50 min = 3.25 max = 4.09 avg = 3.75 + resnet50_int8 min = 12.04 max = 20.53 avg = 14.61 + squeezenet_ssd min = 3.03 max = 5.31 avg = 3.66 + squeezenet_ssd_int8 min = 9.74 max = 13.46 avg = 10.42 + mobilenet_ssd min = 2.82 max = 4.75 avg = 3.39 + mobilenet_ssd_int8 min = 4.67 max = 6.76 avg = 5.30 + mobilenet_yolo min = 3.01 max = 3.67 avg = 3.34 + mobilenetv2_yolov3 min = 4.04 max = 6.46 avg = 4.55 + yolov4-tiny min = 5.75 max = 8.05 avg = 6.52 + nanodet_m min = 10.16 max = 14.97 avg = 13.11 + yolo-fastest-1.1 min = 2.36 max = 3.80 avg = 2.88 + yolo-fastestv2 min = 2.24 max = 3.19 avg = 2.80 + vision_transformer min = 20.43 max = 25.06 avg = 21.07 + FastestDet min = 2.49 max = 3.18 avg = 2.93 +``` + +### NVIDIA GeForce RTX 3060 Ti of Desktop[2023-10-12] +``` +E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 0 0 +[0 NVIDIA GeForce RTX 3060 Ti] queueC=2[8] queueG=0[16] queueT=1[2] +[0 NVIDIA GeForce RTX 3060 Ti] bugsbn1=0 
bugbilz=0 bugcopc=0 bugihfa=0 +[0 NVIDIA GeForce RTX 3060 Ti] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[0 NVIDIA GeForce RTX 3060 Ti] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[0 NVIDIA GeForce RTX 3060 Ti] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 +[1 Intel(R) UHD Graphics 770] queueC=0[1] queueG=0[1] queueT=0[1] +[1 Intel(R) UHD Graphics 770] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[1 Intel(R) UHD Graphics 770] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[1 Intel(R) UHD Graphics 770] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[1 Intel(R) UHD Graphics 770] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 +loop_count = 100 +num_threads = 16 +powersave = 0 +gpu_device = 0 +cooling_down = 0 + squeezenet min = 0.80 max = 2.51 avg = 0.89 + squeezenet_int8 min = 2.81 max = 3.51 avg = 2.96 + mobilenet min = 0.70 max = 0.79 avg = 0.71 + mobilenet_int8 min = 2.95 max = 3.44 avg = 3.03 + mobilenet_v2 min = 1.09 max = 1.25 avg = 1.12 + mobilenet_v3 min = 1.33 max = 2.04 avg = 1.56 + shufflenet min = 1.20 max = 1.39 avg = 1.27 + shufflenet_v2 min = 1.50 max = 1.66 avg = 1.57 + mnasnet min = 1.11 max = 1.22 avg = 1.15 + proxylessnasnet min = 1.20 max = 1.63 avg = 1.24 + efficientnet_b0 min = 2.38 max = 3.21 avg = 2.61 + efficientnetv2_b0 min = 9.16 max = 11.35 avg = 9.63 + regnety_400m min = 1.86 max = 2.03 avg = 1.94 + blazeface min = 0.70 max = 1.10 avg = 0.76 + googlenet min = 2.11 max = 2.40 avg = 2.30 + googlenet_int8 min = 6.91 max = 7.88 avg = 7.17 + resnet18 min = 1.14 max = 1.47 avg = 1.19 + resnet18_int8 min = 4.96 max = 6.82 avg = 5.40 + alexnet min = 1.10 max = 1.85 avg = 1.19 + vgg16 min = 2.27 max = 3.97 avg = 2.46 + vgg16_int8 min = 19.02 max = 22.20 avg = 20.28 + resnet50 min = 2.00 max = 2.99 avg = 2.10 + resnet50_int8 min = 10.66 max = 13.30 avg = 11.29 + squeezenet_ssd min = 2.74 max = 3.44 avg = 2.90 + squeezenet_ssd_int8 min = 6.93 max = 7.95 avg = 7.19 + mobilenet_ssd min = 1.86 max = 2.07 avg = 1.96 + mobilenet_ssd_int8 min = 5.92 max = 6.48 avg = 6.09 + mobilenet_yolo min = 1.65 max = 2.58 avg = 1.78 + mobilenetv2_yolov3 min = 3.85 max = 4.11 avg = 3.96 + yolov4-tiny min = 6.54 max = 7.05 avg = 6.69 + nanodet_m min = 2.38 max = 3.28 avg = 2.72 + yolo-fastest-1.1 min = 1.73 max = 2.07 avg = 1.83 + yolo-fastestv2 min = 1.72 max = 1.92 avg = 1.80 + vision_transformer min = 53.91 max = 56.59 avg = 55.27 + FastestDet min = 1.48 max = 1.83 avg = 1.69 +``` + +### Intel(R) UHD Graphics 770 of Desktop[2023-10-12] +``` +E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 1 0 +[0 NVIDIA GeForce RTX 3060 Ti] queueC=2[8] queueG=0[16] queueT=1[2] +[0 NVIDIA GeForce RTX 3060 Ti] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[0 NVIDIA GeForce RTX 3060 Ti] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[0 NVIDIA GeForce RTX 3060 Ti] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[0 NVIDIA GeForce RTX 3060 Ti] fp16-matrix-16_8_8/16_8_16/16_16_16=1/1/1 +[1 Intel(R) UHD Graphics 770] queueC=0[1] queueG=0[1] queueT=0[1] +[1 Intel(R) UHD Graphics 770] bugsbn1=0 bugbilz=0 bugcopc=0 bugihfa=0 +[1 Intel(R) UHD Graphics 770] fp16-p/s/a=1/1/1 int8-p/s/a=1/1/1 +[1 Intel(R) UHD Graphics 770] subgroup=32 basic/vote/ballot/shuffle=1/1/1/1 +[1 Intel(R) UHD Graphics 770] fp16-matrix-16_8_8/16_8_16/16_16_16=0/0/0 +loop_count = 100 +num_threads = 16 +powersave = 0 +gpu_device = 1 +cooling_down = 0 + squeezenet min = 3.11 max = 4.47 avg = 3.45 + squeezenet_int8 min = 1.89 max = 2.84 avg = 2.23 + mobilenet min = 4.98 max = 5.67 avg = 5.18 + mobilenet_int8 min = 2.54 max = 3.17 avg = 2.98 + mobilenet_v2 min = 4.03 max = 
4.89 avg = 4.37 + mobilenet_v3 min = 4.45 max = 5.68 avg = 4.86 + shufflenet min = 3.42 max = 4.42 avg = 3.79 + shufflenet_v2 min = 3.00 max = 4.01 avg = 3.30 + mnasnet min = 4.21 max = 5.12 avg = 4.51 + proxylessnasnet min = 4.62 max = 5.64 avg = 4.90 + efficientnet_b0 min = 7.82 max = 8.63 avg = 8.10 + efficientnetv2_b0 min = 34.52 max = 36.34 avg = 35.29 + regnety_400m min = 6.07 max = 7.31 avg = 6.44 + blazeface min = 1.54 max = 1.67 avg = 1.59 + googlenet min = 11.53 max = 12.64 avg = 11.89 + googlenet_int8 min = 13.71 max = 15.52 avg = 14.38 + resnet18 min = 10.75 max = 12.94 avg = 11.07 + resnet18_int8 min = 9.04 max = 11.05 avg = 9.53 + alexnet min = 13.64 max = 14.37 avg = 13.98 + vgg16 min = 38.53 max = 40.16 avg = 39.22 + vgg16_int8 min = 16.04 max = 21.16 avg = 19.35 + resnet50 min = 25.61 max = 28.22 avg = 26.62 + resnet50_int8 min = 7.72 max = 12.83 avg = 10.29 + squeezenet_ssd min = 10.34 max = 15.88 avg = 14.75 + squeezenet_ssd_int8 min = 4.63 max = 7.13 avg = 5.66 + mobilenet_ssd min = 11.35 max = 13.06 avg = 12.44 + mobilenet_ssd_int8 min = 4.21 max = 6.31 avg = 5.32 + mobilenet_yolo min = 20.14 max = 22.92 avg = 21.94 + mobilenetv2_yolov3 min = 12.58 max = 14.88 avg = 14.21 + yolov4-tiny min = 20.62 max = 25.58 avg = 24.39 + nanodet_m min = 7.75 max = 12.49 avg = 11.42 + yolo-fastest-1.1 min = 3.68 max = 6.49 avg = 5.54 + yolo-fastestv2 min = 4.32 max = 5.39 avg = 4.51 + vision_transformer min = 796.51 max = 805.29 avg = 802.39 + FastestDet min = 2.89 max = 4.83 avg = 3.95 +``` + +### Intel® Core™ i7-13700K of Desktop[2023-10-12] +``` +E:\github\ncnn\build-ncnn-vs2019\benchmark\Release>benchncnn.exe 100 16 0 -1 0 +loop_count = 100 +num_threads = 16 +powersave = 0 +gpu_device = -1 +cooling_down = 0 + squeezenet min = 1.69 max = 2.63 avg = 2.12 + squeezenet_int8 min = 1.83 max = 3.03 avg = 2.26 + mobilenet min = 1.69 max = 2.64 avg = 2.24 + mobilenet_int8 min = 2.47 max = 3.06 avg = 2.84 + mobilenet_v2 min = 1.94 max = 3.47 avg = 2.47 + mobilenet_v3 min = 1.49 max = 2.74 avg = 1.87 + shufflenet min = 1.57 max = 3.00 avg = 1.82 + shufflenet_v2 min = 1.41 max = 1.72 avg = 1.51 + mnasnet min = 1.73 max = 2.94 avg = 2.13 + proxylessnasnet min = 2.08 max = 3.31 avg = 2.69 + efficientnet_b0 min = 3.20 max = 4.99 avg = 3.78 + efficientnetv2_b0 min = 3.51 max = 5.16 avg = 4.08 + regnety_400m min = 4.51 max = 10.29 avg = 6.18 + blazeface min = 0.52 max = 0.92 avg = 0.59 + googlenet min = 5.49 max = 7.48 avg = 6.26 + googlenet_int8 min = 4.83 max = 7.54 avg = 5.90 + resnet18 min = 4.05 max = 6.61 avg = 4.83 + resnet18_int8 min = 3.77 max = 5.70 avg = 4.57 + alexnet min = 3.60 max = 5.09 avg = 4.26 + vgg16 min = 25.19 max = 28.79 avg = 26.81 + vgg16_int8 min = 17.52 max = 21.79 avg = 19.80 + resnet50 min = 9.23 max = 13.15 avg = 11.34 + resnet50_int8 min = 7.77 max = 12.00 avg = 10.18 + squeezenet_ssd min = 4.33 max = 6.73 avg = 4.96 + squeezenet_ssd_int8 min = 4.77 max = 7.62 avg = 5.71 + mobilenet_ssd min = 3.70 max = 6.43 avg = 4.53 + mobilenet_ssd_int8 min = 4.16 max = 6.53 avg = 5.38 + mobilenet_yolo min = 11.27 max = 14.93 avg = 12.90 + mobilenetv2_yolov3 min = 7.41 max = 11.52 avg = 9.11 + yolov4-tiny min = 12.05 max = 18.96 avg = 14.15 + nanodet_m min = 3.39 max = 5.77 avg = 4.07 + yolo-fastest-1.1 min = 1.95 max = 3.85 avg = 2.30 + yolo-fastestv2 min = 1.91 max = 3.52 avg = 2.27 + vision_transformer min = 79.50 max = 99.93 avg = 88.91 + FastestDet min = 1.92 max = 2.72 avg = 2.19 +``` \ No newline at end of file diff --git a/docs/developer-guide/operators.md 
b/docs/developer-guide/operators.md index b0975ee46877..4cabb049340f 100644 --- a/docs/developer-guide/operators.md +++ b/docs/developer-guide/operators.md @@ -914,6 +914,7 @@ This function is often used in conjunction with affine_grid() to build Spatial T | 0 | sample_type | int | 1 | | | 1 | padding_mode | int | 1 | | | 2 | align_corner | int | 0 | | +| 3 | permute_fusion| int | 0 | fuse with permute | Sample type: diff --git a/src/layer/gridsample.cpp b/src/layer/gridsample.cpp index f87a0e93cb5f..abeec6fa5bec 100644 --- a/src/layer/gridsample.cpp +++ b/src/layer/gridsample.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // coord compliance with the License. You may obtain a copy of the License at @@ -27,6 +27,7 @@ int GridSample::load_param(const ParamDict& pd) sample_type = pd.get(0, 1); padding_mode = pd.get(1, 1); align_corner = pd.get(2, 0); + permute_fusion = pd.get(3, 0); if (sample_type < 1 || sample_type > 3) { @@ -57,19 +58,19 @@ static float grid_sample_unormalize(int w, float coordx, int align_corner) return align_corner ? (coordx + 1) / 2.f * (w - 1) : ((coordx + 1) * w - 1) / 2.f; } -static float border_coord(int x, int border) +static float border_coord(float x, float border) { - return std::min(border, std::max(x, 0)); + return std::min(border, std::max(x, 0.0f)); } static float reflect_coord(float x, int high) { - x = abs(x); - x = high - abs(x - high); + x = fabs(x); + x = high - fabs(x - high); return x; } -static int compute_coord(int sx, int w, int padding_mode, int align_corner) +static float compute_coord(float sx, int w, int padding_mode, int align_corner) { if (padding_mode == 2) // border { @@ -83,7 +84,7 @@ static int compute_coord(int sx, int w, int padding_mode, int align_corner) } else { - sx = static_cast(reflect_coord(sx + 0.5f, w) - 0.5f); + sx = reflect_coord(sx + 0.5, w) - 0.5; sx = border_coord(sx, w - 1); } } @@ -108,7 +109,7 @@ static float get_value_bounded(const Mat& image, int x, int y) static float get_value_bounded(const Mat& image, int x, int y, int z) { - return in_bounds(image, x, y, z) ? image.channel(z).row(y)[x] : 0.f; + return in_bounds(image, x, y, z) ? image.depth(z).row(y)[x] : 0.f; } static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, int align_corner) @@ -119,15 +120,6 @@ static float get_value_bounded(const Mat& image, int x, int y, int padding_mode, return get_value_bounded(image, x, y); } -static float get_value_bounded(const Mat& image, int x, int y, int z, int padding_mode, int align_corner) -{ - x = compute_coord(x, image.w, padding_mode, align_corner); - y = compute_coord(y, image.h, padding_mode, align_corner); - z = compute_coord(z, image.c, padding_mode, align_corner); - - return get_value_bounded(image, x, y, z); -} - static inline void interpolate_cubic(float fx, float* coeffs) { const float A = -0.75f; @@ -158,45 +150,102 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (dims == 3) { - int outw = grid.h; - int outh = grid.c; + int outw = permute_fusion == 0 ? grid.h : grid.w; + int outh = permute_fusion == 0 ? 
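+            // permute_fusion == 0: the grid blob stores interleaved (x,y) pairs per row,
+            //                      so the output size is taken from grid.h / grid.c
+            // permute_fusion == 1: a preceding permute has been fused into this layer,
+            //                      x and y live in separate grid channels, so the size comes from grid.w / grid.h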
grid.c : grid.h; top_blob.create(outw, outh, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) + + Mat offset_blob; + offset_blob.create(outw, outh, grid.c, elemsize, opt.workspace_allocator); + + if (top_blob.empty() || offset_blob.empty()) return -100; - if (sample_type == 1) // bilinear + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + if (permute_fusion == 0) + { + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + + for (int y = 0; y < outh; y++) + { + const float* gridptr = grid.channel(y); + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + + gridptr += 2; + offsetptr_x++; + offsetptr_y++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + + gridptr_x++; + gridptr_y++; + offsetptr_x++; + offsetptr_y++; + } + } + } + + if (sample_type == Interpolation_BILINEAR) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; // bilinear interpolate float v; { - int x0 = (int)floor(sample_x); - int y0 = (int)floor(sample_y); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + int x0 = floor(sample_x); + int y0 = floor(sample_y); int x1 = x0 + 1; int y1 = y0 + 1; - float v00 = get_value_bounded(image, x0, y0, padding_mode, align_corner); - float v01 = get_value_bounded(image, x1, y0, padding_mode, align_corner); - float v10 = get_value_bounded(image, x0, y1, padding_mode, align_corner); - float v11 = get_value_bounded(image, x1, y1, padding_mode, align_corner); + float v00 = get_value_bounded(image, x0, y0); + float v01 = get_value_bounded(image, x1, y0); + float v10 = get_value_bounded(image, x0, y1); + float v11 = get_value_bounded(image, x1, y1); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -210,63 +259,61 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } } - else if (sample_type == 2) // nearest + else if (sample_type == Interpolation_NEAREST) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = 
top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); - - int x0 = static_cast(round(sample_x)); - int y0 = static_cast(round(sample_y)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); - float v = get_value_bounded(image, x0, y0, padding_mode, align_corner); + float v = get_value_bounded(image, x0, y0); outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } } - else if (sample_type == 3) // bicubic + else if (sample_type == Interpolation_BICUBIC) // bicubic { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); for (int y = 0; y < outh; y++) { - const float* gridptr = grid.channel(y); - for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; // bicubic interpolate float v; @@ -313,7 +360,8 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 2; + offsetptr_x++; + offsetptr_y++; } } } @@ -322,37 +370,120 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& if (dims == 4) { - int outw = grid.h; - int outh = grid.d; - int outd = grid.c; + int outw = permute_fusion == 0 ? grid.h : grid.w; + int outh = permute_fusion == 0 ? grid.d : grid.h; + int outd = permute_fusion == 0 ? 
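+            // 4-D case: same layout rule as the 3-D branch above; with a fused permute,
+            //           x/y/z arrive in separate grid channels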
grid.c : grid.d; top_blob.create(outw, outh, outd, channels, elemsize, opt.blob_allocator); - if (top_blob.empty()) + + Mat offset_blob; + offset_blob.create(outw, outh, outd, grid.c, elemsize, opt.workspace_allocator); + + if (top_blob.empty() || offset_blob.empty()) return -100; - if (sample_type == 1) // bilinear + //pre-calculate all interpolation offsets for each x y, unpack grid on-the-fly + if (permute_fusion == 0) + { + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + float* offsetptr_z = offset_blob.channel(2); + + for (int z = 0; z < outd; z++) + { + const float* gridptr = grid.channel(z); + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = gridptr[0]; + float sample_y = gridptr[1]; + float sample_z = gridptr[2]; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + sample_z = compute_coord(sample_z, d, padding_mode, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + *offsetptr_z = sample_z; + + gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; + } + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + float* offsetptr_x = offset_blob.channel(0); + float* offsetptr_y = offset_blob.channel(1); + float* offsetptr_z = offset_blob.channel(2); + + for (int z = 0; z < outd; z++) + { + for (int y = 0; y < outh; y++) + { + for (int x = 0; x < outw; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = grid_sample_unormalize(w, sample_x, align_corner); + sample_x = compute_coord(sample_x, w, padding_mode, align_corner); + + sample_y = grid_sample_unormalize(h, sample_y, align_corner); + sample_y = compute_coord(sample_y, h, padding_mode, align_corner); + + sample_z = grid_sample_unormalize(d, sample_z, align_corner); + sample_z = compute_coord(sample_z, d, padding_mode, align_corner); + + *offsetptr_x = sample_x; + *offsetptr_y = sample_y; + *offsetptr_z = sample_z; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; + } + } + } + } + + if (sample_type == Interpolation_BILINEAR) // bilinear { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); + const float* offsetptr_z = offset_blob.channel(2); for (int z = 0; z < outd; z++) { - const float* gridptr = grid.channel(z); - for (int y = 0; y < outh; y++) { for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - float sample_z = gridptr[2]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); - sample_z = grid_sample_unormalize(d, sample_z, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; + float sample_z = *offsetptr_z; // bilinear interpolate float v; @@ -364,14 +495,14 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& int y1 = y0 + 1; int z1 = 
z0 + 1; - float v000 = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); - float v001 = get_value_bounded(image, x1, y0, z0, padding_mode, align_corner); - float v010 = get_value_bounded(image, x0, y1, z0, padding_mode, align_corner); - float v011 = get_value_bounded(image, x1, y1, z0, padding_mode, align_corner); - float v100 = get_value_bounded(image, x0, y0, z1, padding_mode, align_corner); - float v101 = get_value_bounded(image, x1, y0, z1, padding_mode, align_corner); - float v110 = get_value_bounded(image, x0, y1, z1, padding_mode, align_corner); - float v111 = get_value_bounded(image, x1, y1, z1, padding_mode, align_corner); + float v000 = get_value_bounded(image, x0, y0, z0); + float v001 = get_value_bounded(image, x1, y0, z0); + float v010 = get_value_bounded(image, x0, y1, z0); + float v011 = get_value_bounded(image, x1, y1, z0); + float v100 = get_value_bounded(image, x0, y0, z1); + float v101 = get_value_bounded(image, x1, y0, z1); + float v110 = get_value_bounded(image, x0, y1, z1); + float v111 = get_value_bounded(image, x1, y1, z1); float alpha = sample_x - x0; float beta = sample_y - y0; @@ -391,46 +522,47 @@ int GridSample::forward(const std::vector& bottom_blobs, std::vector& outptr[0] = v; outptr += 1; - gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; } } } } } - else if (sample_type == 2) // nearest + else if (sample_type == Interpolation_NEAREST) // nearest { #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < channels; q++) { const Mat image = bottom_blob.channel(q); float* outptr = top_blob.channel(q); + const float* offsetptr_x = offset_blob.channel(0); + const float* offsetptr_y = offset_blob.channel(1); + const float* offsetptr_z = offset_blob.channel(2); for (int z = 0; z < outd; z++) { - const float* gridptr = grid.channel(z); - for (int y = 0; y < outh; y++) { for (int x = 0; x < outw; x++) { - float sample_x = gridptr[0]; - float sample_y = gridptr[1]; - float sample_z = gridptr[2]; - - sample_x = grid_sample_unormalize(w, sample_x, align_corner); - sample_y = grid_sample_unormalize(h, sample_y, align_corner); - sample_z = grid_sample_unormalize(d, sample_z, align_corner); + float sample_x = *offsetptr_x; + float sample_y = *offsetptr_y; + float sample_z = *offsetptr_z; - int x0 = static_cast(round(sample_x)); - int y0 = static_cast(round(sample_y)); - int z0 = static_cast(round(sample_z)); + int x0 = static_cast(floor(sample_x + 0.5f)); + int y0 = static_cast(floor(sample_y + 0.5f)); + int z0 = static_cast(floor(sample_z + 0.5f)); - float v = get_value_bounded(image, x0, y0, z0, padding_mode, align_corner); + float v = get_value_bounded(image, x0, y0, z0); outptr[0] = v; outptr += 1; - gridptr += 3; + offsetptr_x++; + offsetptr_y++; + offsetptr_z++; } } } diff --git a/src/layer/gridsample.h b/src/layer/gridsample.h index 0ea540eb4baf..f6e17c9d2f49 100644 --- a/src/layer/gridsample.h +++ b/src/layer/gridsample.h @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. 
You may obtain a copy of the License at @@ -28,11 +28,27 @@ class GridSample : public Layer virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + enum InterpolationMode // 1=bilinear 2=nearest 3=bicubic + { + Interpolation_BILINEAR = 1, + Interpolation_NEAREST = 2, + Interpolation_BICUBIC = 3 + }; + + enum PaddingMode // 1=zeros 2=border 3=reflection + { + Padding_ZEROS = 1, + Padding_BORDER = 2, + Padding_REFLECTION = 3 + }; + public: // param int sample_type; // 1=bilinear 2=nearest 3=bicubic int padding_mode; // 1=zeros 2=border 3=reflection int align_corner; + + int permute_fusion; }; } // namespace ncnn diff --git a/src/layer/memorydata.cpp b/src/layer/memorydata.cpp index 6cd314d76b97..02a0e0be078c 100644 --- a/src/layer/memorydata.cpp +++ b/src/layer/memorydata.cpp @@ -28,6 +28,7 @@ int MemoryData::load_param(const ParamDict& pd) h = pd.get(1, 0); d = pd.get(11, 0); c = pd.get(2, 0); + load_type = pd.get(21, 1); return 0; } @@ -36,19 +37,19 @@ int MemoryData::load_model(const ModelBin& mb) { if (d != 0) { - data = mb.load(w, h, d, c, 1); + data = mb.load(w, h, d, c, load_type); } else if (c != 0) { - data = mb.load(w, h, c, 1); + data = mb.load(w, h, c, load_type); } else if (h != 0) { - data = mb.load(w, h, 1); + data = mb.load(w, h, load_type); } else if (w != 0) { - data = mb.load(w, 1); + data = mb.load(w, load_type); } else // 0 0 0 { diff --git a/src/layer/memorydata.h b/src/layer/memorydata.h index 4b2c697912f8..d5175ad0dd8a 100644 --- a/src/layer/memorydata.h +++ b/src/layer/memorydata.h @@ -35,6 +35,7 @@ class MemoryData : public Layer int h; int d; int c; + int load_type; Mat data; }; diff --git a/src/layer/riscv/gemm_riscv.cpp b/src/layer/riscv/gemm_riscv.cpp new file mode 100644 index 000000000000..ec5a5cdac413 --- /dev/null +++ b/src/layer/riscv/gemm_riscv.cpp @@ -0,0 +1,4282 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "gemm_riscv.h" + +#if __riscv_vector +#include +#endif // __riscv_vector + +#include "riscv_usability.h" + +#include "cpu.h" + +namespace ncnn { + +Gemm_riscv::Gemm_riscv() +{ +#if __riscv_vector + support_packing = true; +#endif // __riscv_vector + one_blob_only = false; + support_inplace = false; + + nT = 0; +#if __riscv_vector + // When processing float data, + // even if the current hardware provides vector registers of more than 128 bits, + // vl=4 is still used, even though this will waste the width of the vector register. + vl = vsetvlmax_e32m1(); + vl = vl >= 4 ? 4 : vl; +#else + vl = 0; +#endif // __riscv_vector +} + +static void pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, size_t vl) +{ + const int elempack = A.elempack; + const int A_hstep = A.dims == 3 ? 
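+    // A_hstep: element stride between consecutive rows of A,
+    //          i.e. the channel step when A is a 3-D mat and the plain row width otherwise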
(int)A.cstep : A.w; + + float* pp = AT; + + int ii = 0; +#if __riscv_vector + for (; ii + 7 < max_ii; ii += 8) + { + if (elempack == 4) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4; + const float* p1 = (const float*)A + (i + ii + 4) * A_hstep + k * 4; + + for (int kk = 0; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); + pp += 8; + p0 += 4; + p1 += 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k; + const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k; + const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k; + const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k; + const float* p4 = (const float*)A + (i + ii + 4) * A_hstep + k; + const float* p5 = (const float*)A + (i + ii + 5) * A_hstep + k; + const float* p6 = (const float*)A + (i + ii + 6) * A_hstep + k; + const float* p7 = (const float*)A + (i + ii + 7) * A_hstep + k; + + int kk = 0; + for (; kk + 7 < max_kk; kk += 8) + { + vfloat32m1_t _r0l = vle32_v_f32m1(p0, vl); + vfloat32m1_t _r0h = vle32_v_f32m1(p0 + 4, vl); + vfloat32m1_t _r1l = vle32_v_f32m1(p1, vl); + vfloat32m1_t _r1h = vle32_v_f32m1(p1 + 4, vl); + vfloat32m1_t _r2l = vle32_v_f32m1(p2, vl); + vfloat32m1_t _r2h = vle32_v_f32m1(p2 + 4, vl); + vfloat32m1_t _r3l = vle32_v_f32m1(p3, vl); + vfloat32m1_t _r3h = vle32_v_f32m1(p3 + 4, vl); + vfloat32m1_t _r4l = vle32_v_f32m1(p4, vl); + vfloat32m1_t _r4h = vle32_v_f32m1(p4 + 4, vl); + vfloat32m1_t _r5l = vle32_v_f32m1(p5, vl); + vfloat32m1_t _r5h = vle32_v_f32m1(p5 + 4, vl); + vfloat32m1_t _r6l = vle32_v_f32m1(p6, vl); + vfloat32m1_t _r6h = vle32_v_f32m1(p6 + 4, vl); + vfloat32m1_t _r7l = vle32_v_f32m1(p7, vl); + vfloat32m1_t _r7h = vle32_v_f32m1(p7 + 4, vl); + transpose8x8_ps(_r0l, _r0h, _r1l, _r1h, _r2l, _r2h, _r3l, _r3h, _r4l, _r4h, _r5l, _r5h, _r6l, _r6h, _r7l, _r7h, vl); + vse32_v_f32m1(pp, _r0l, vl); + vse32_v_f32m1(pp + 4, _r0h, vl); + vse32_v_f32m1(pp + 8, _r1l, vl); + vse32_v_f32m1(pp + 12, _r1h, vl); + vse32_v_f32m1(pp + 8 * 2, _r2l, vl); + vse32_v_f32m1(pp + 8 * 2 + 4, _r2h, vl); + vse32_v_f32m1(pp + 8 * 3, _r3l, vl); + vse32_v_f32m1(pp + 8 * 3 + 4, _r3h, vl); + vse32_v_f32m1(pp + 8 * 4, _r4l, vl); + vse32_v_f32m1(pp + 8 * 4 + 4, _r4h, vl); + vse32_v_f32m1(pp + 8 * 5, _r5l, vl); + vse32_v_f32m1(pp + 8 * 5 + 4, _r5h, vl); + vse32_v_f32m1(pp + 8 * 6, _r6l, vl); + vse32_v_f32m1(pp + 8 * 6 + 4, _r6h, vl); + vse32_v_f32m1(pp + 8 * 7, _r7l, vl); + vse32_v_f32m1(pp + 8 * 7 + 4, _r7h, vl); + pp += 64; + p0 += 8; + p1 += 8; + p2 += 8; + p3 += 8; + p4 += 8; + p5 += 8; + p6 += 8; + p7 += 8; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp[2] = p2[0]; + pp[3] = p3[0]; + pp[4] = p4[0]; + pp[5] = p5[0]; + pp[6] = p6[0]; + pp[7] = p7[0]; + pp += 8; + p0++; + p1++; + p2++; + p3++; + p4++; + p5++; + p6++; + p7++; + } + } + } + for (; ii + 3 < max_ii; ii += 4) + { + if (elempack == 4) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k * 4; + + for (int kk = 0; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k; + const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k; + const float* p2 = (const float*)A + (i + ii + 2) * A_hstep + k; + const float* p3 = (const float*)A + (i + ii + 3) * A_hstep + k; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, 
vl); + vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); + vfloat32m1_t v2 = vle32_v_f32m1(p2, vl); + vfloat32m1_t v3 = vle32_v_f32m1(p3, vl); + store_float_v4(v0, v1, v2, v3, pp, vl); + pp += 16; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp[2] = p2[0]; + pp[3] = p3[0]; + pp += 4; + p0++; + p1++; + p2++; + p3++; + } + } + } +#endif // __riscv_vector + for (; ii + 1 < max_ii; ii += 2) + { + // if (elempack == 1) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k; + const float* p1 = (const float*)A + (i + ii + 1) * A_hstep + k; + + int kk = 0; +#if __riscv_vector + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); + store_float_v2(v0, v1, pp, vl); + pp += 8; + p0 += 4; + p1 += 4; + } +#endif // __riscv_vector + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp += 2; + p0++; + p1++; + } + } + } + for (; ii < max_ii; ii += 1) + { + // if (elempack == 1) + { + const float* p0 = (const float*)A + (i + ii) * A_hstep + k; + + int kk = 0; +#if __riscv_vector + for (; kk + 3 < max_kk; kk += 4) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += 4; + } +#endif // __riscv_vector + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0++; + } + } + } +} + +static void transpose_pack_A_tile(const Mat& A, Mat& AT, int i, int max_ii, int k, int max_kk, size_t vl) +{ + const int elempack = A.elempack; + const int A_hstep = A.dims == 3 ? (int)A.cstep : A.w; + + float* pp = AT; + + int ii = 0; +#if __riscv_vector + for (; ii + 7 < max_ii; ii += 8) + { + if (elempack == 4) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0; + vfloat32m1_t _r1; + vfloat32m1_t _r2; + vfloat32m1_t _r3; + vfloat32m1_t _r4; + vfloat32m1_t _r5; + vfloat32m1_t _r6; + vfloat32m1_t _r7; + vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); + vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r4, vl); + vse32_v_f32m1(pp + 4 * 2, _r1, vl); + vse32_v_f32m1(pp + 4 * 3, _r5, vl); + vse32_v_f32m1(pp + 4 * 4, _r2, vl); + vse32_v_f32m1(pp + 4 * 5, _r6, vl); + vse32_v_f32m1(pp + 4 * 6, _r3, vl); + vse32_v_f32m1(pp + 4 * 7, _r7, vl); + pp += 32; + p0 += A_hstep * 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii); + + int kk = 0; + for (; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); + pp += 8; + p0 += A_hstep; + } + } + } + for (; ii + 3 < max_ii; ii += 4) + { + if (elempack == 4) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0; + vfloat32m1_t _r1; + vfloat32m1_t _r2; + vfloat32m1_t _r3; + vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r1, vl); + vse32_v_f32m1(pp + 4 * 2, _r2, vl); + vse32_v_f32m1(pp + 4 * 3, _r3, vl); + pp += 16; + p0 += A_hstep * 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii); + + int kk = 0; + for (; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += A_hstep; + } + } + } +#endif // __riscv_vector + for (; ii + 1 < max_ii; ii += 2) + { +#if __riscv_vector + if (elempack == 4) + { + const float* p0 = (const float*)A + 
k * A_hstep + (i + ii) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl); + store_float_v2(v0, v1, pp, vl); + pp += 8; + p0 += A_hstep * 4; + } + } +#endif // __riscv_vector + if (elempack == 1) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii); + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp += 2; + p0 += A_hstep; + } + } + } + for (; ii < max_ii; ii += 1) + { +#if __riscv_vector + if (elempack == 4) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += A_hstep * 4; + } + } +#endif // __riscv_vector + if (elempack == 1) + { + const float* p0 = (const float*)A + k * A_hstep + (i + ii); + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0 += A_hstep; + } + } + } +} + +static void pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, size_t vl) +{ + const int elempack = B.elempack; + const int B_hstep = B.dims == 3 ? (int)B.cstep : B.w; + + float* pp = BT; + + int jj = 0; +#if __riscv_vector + for (; jj + 11 < max_jj; jj += 12) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; + const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4; + const float* p2 = (const float*)B + (j + jj + 8) * B_hstep + k * 4; + + for (int kk = 0; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); + vse32_v_f32m1(pp + 8, vle32_v_f32m1(p2, vl), vl); + pp += 12; + p0 += 4; + p1 += 4; + p2 += 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k; + const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; + const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; + const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; + const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k; + const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k; + const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k; + const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k; + const float* p8 = (const float*)B + (j + jj + 8) * B_hstep + k; + const float* p9 = (const float*)B + (j + jj + 9) * B_hstep + k; + const float* pa = (const float*)B + (j + jj + 10) * B_hstep + k; + const float* pb = (const float*)B + (j + jj + 11) * B_hstep + k; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t _r1 = vle32_v_f32m1(p1, vl); + vfloat32m1_t _r2 = vle32_v_f32m1(p2, vl); + vfloat32m1_t _r3 = vle32_v_f32m1(p3, vl); + vfloat32m1_t _r4 = vle32_v_f32m1(p4, vl); + vfloat32m1_t _r5 = vle32_v_f32m1(p5, vl); + vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl); + vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl); + vfloat32m1_t _r8 = vle32_v_f32m1(p8, vl); + vfloat32m1_t _r9 = vle32_v_f32m1(p9, vl); + vfloat32m1_t _ra = vle32_v_f32m1(pa, vl); + vfloat32m1_t _rb = vle32_v_f32m1(pb, vl); + + transpose4x4_ps(_r0, _r1, _r2, _r3, vl); + transpose4x4_ps(_r4, _r5, _r6, _r7, vl); + transpose4x4_ps(_r8, _r9, _ra, _rb, vl); + + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r4, vl); + vse32_v_f32m1(pp + 4 * 2, _r8, vl); + vse32_v_f32m1(pp + 4 * 3, _r1, vl); + vse32_v_f32m1(pp + 4 * 4, _r5, vl); + vse32_v_f32m1(pp + 4 * 5, _r9, vl); + vse32_v_f32m1(pp + 4 * 6, _r2, vl); + 
vse32_v_f32m1(pp + 4 * 7, _r6, vl); + vse32_v_f32m1(pp + 4 * 8, _ra, vl); + vse32_v_f32m1(pp + 4 * 9, _r3, vl); + vse32_v_f32m1(pp + 4 * 10, _r7, vl); + vse32_v_f32m1(pp + 4 * 11, _rb, vl); + pp += 48; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + p4 += 4; + p5 += 4; + p6 += 4; + p7 += 4; + p8 += 4; + p9 += 4; + pa += 4; + pb += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp[2] = p2[0]; + pp[3] = p3[0]; + pp[4] = p4[0]; + pp[5] = p5[0]; + pp[6] = p6[0]; + pp[7] = p7[0]; + pp[8] = p8[0]; + pp[9] = p9[0]; + pp[10] = pa[0]; + pp[11] = pb[0]; + pp += 12; + p0++; + p1++; + p2++; + p3++; + p4++; + p5++; + p6++; + p7++; + p8++; + p9++; + pa++; + pb++; + } + } + } + for (; jj + 7 < max_jj; jj += 8) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; + const float* p1 = (const float*)B + (j + jj + 4) * B_hstep + k * 4; + + for (int kk = 0; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p1, vl), vl); + pp += 8; + p0 += 4; + p1 += 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k; + const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; + const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; + const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; + const float* p4 = (const float*)B + (j + jj + 4) * B_hstep + k; + const float* p5 = (const float*)B + (j + jj + 5) * B_hstep + k; + const float* p6 = (const float*)B + (j + jj + 6) * B_hstep + k; + const float* p7 = (const float*)B + (j + jj + 7) * B_hstep + k; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t _r1 = vle32_v_f32m1(p1, vl); + vfloat32m1_t _r2 = vle32_v_f32m1(p2, vl); + vfloat32m1_t _r3 = vle32_v_f32m1(p3, vl); + vfloat32m1_t _r4 = vle32_v_f32m1(p4, vl); + vfloat32m1_t _r5 = vle32_v_f32m1(p5, vl); + vfloat32m1_t _r6 = vle32_v_f32m1(p6, vl); + vfloat32m1_t _r7 = vle32_v_f32m1(p7, vl); + + transpose4x4_ps(_r0, _r1, _r2, _r3, vl); + transpose4x4_ps(_r4, _r5, _r6, _r7, vl); + + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r4, vl); + vse32_v_f32m1(pp + 4 * 2, _r1, vl); + vse32_v_f32m1(pp + 4 * 3, _r5, vl); + vse32_v_f32m1(pp + 4 * 4, _r2, vl); + vse32_v_f32m1(pp + 4 * 5, _r6, vl); + vse32_v_f32m1(pp + 4 * 6, _r3, vl); + vse32_v_f32m1(pp + 4 * 7, _r7, vl); + pp += 32; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + p4 += 4; + p5 += 4; + p6 += 4; + p7 += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp[2] = p2[0]; + pp[3] = p3[0]; + pp[4] = p4[0]; + pp[5] = p5[0]; + pp[6] = p6[0]; + pp[7] = p7[0]; + pp += 8; + p0++; + p1++; + p2++; + p3++; + p4++; + p5++; + p6++; + p7++; + } + } + } + for (; jj + 3 < max_jj; jj += 4) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k * 4; + + for (int kk = 0; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k; + const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; + const float* p2 = (const float*)B + (j + jj + 2) * B_hstep + k; + const float* p3 = (const float*)B + (j + jj + 3) * B_hstep + k; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); + vfloat32m1_t v2 = vle32_v_f32m1(p2, vl); + vfloat32m1_t v3 = vle32_v_f32m1(p3, vl); + 
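+                // v0..v3 each hold four consecutive k values of one row of B;
+                // store_float_v4 (presumably a 4x4 interleaving store helper from riscv_usability.h)
+                // writes them so that the four column values for each k end up contiguous in BT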
store_float_v4(v0, v1, v2, v3, pp, vl); + pp += 16; + p0 += 4; + p1 += 4; + p2 += 4; + p3 += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp[2] = p2[0]; + pp[3] = p3[0]; + pp += 4; + p0++; + p1++; + p2++; + p3++; + } + } + } +#endif // __riscv_vector + for (; jj + 1 < max_jj; jj += 2) + { + // if (elempack == 1) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k; + const float* p1 = (const float*)B + (j + jj + 1) * B_hstep + k; + + int kk = 0; +#if __riscv_vector + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = vle32_v_f32m1(p1, vl); + store_float_v2(v0, v1, pp, vl); + pp += 8; + p0 += 4; + p1 += 4; + } +#endif // __riscv_vector + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p1[0]; + pp += 2; + p0++; + p1++; + } + } + } + for (; jj < max_jj; jj += 1) + { + // if (elempack == 1) + { + const float* p0 = (const float*)B + (j + jj) * B_hstep + k; + + int kk = 0; +#if __riscv_vector + for (; kk + 3 < max_kk; kk += 4) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += 4; + } +#endif // __riscv_vector + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0++; + } + } + } +} + +static void transpose_pack_B_tile(const Mat& B, Mat& BT, int j, int max_jj, int k, int max_kk, size_t vl) +{ + const int elempack = B.elempack; + const int B_hstep = B.dims == 3 ? (int)B.cstep : B.w; + + float* pp = BT; + + int jj = 0; +#if __riscv_vector + for (; jj + 11 < max_jj; jj += 12) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0; + vfloat32m1_t _r1; + vfloat32m1_t _r2; + vfloat32m1_t _r3; + vfloat32m1_t _r4; + vfloat32m1_t _r5; + vfloat32m1_t _r6; + vfloat32m1_t _r7; + vfloat32m1_t _r8; + vfloat32m1_t _r9; + vfloat32m1_t _ra; + vfloat32m1_t _rb; + vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); + vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); + vlseg4e32_v_f32m1(&_r8, &_r9, &_ra, &_rb, p0 + 32, vl); + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r4, vl); + vse32_v_f32m1(pp + 4 * 2, _r8, vl); + vse32_v_f32m1(pp + 4 * 3, _r1, vl); + vse32_v_f32m1(pp + 4 * 4, _r5, vl); + vse32_v_f32m1(pp + 4 * 5, _r9, vl); + vse32_v_f32m1(pp + 4 * 6, _r2, vl); + vse32_v_f32m1(pp + 4 * 7, _r6, vl); + vse32_v_f32m1(pp + 4 * 8, _ra, vl); + vse32_v_f32m1(pp + 4 * 9, _r3, vl); + vse32_v_f32m1(pp + 4 * 10, _r7, vl); + vse32_v_f32m1(pp + 4 * 11, _rb, vl); + pp += 48; + p0 += B_hstep * 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj); + + int kk = 0; + for (; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); + vse32_v_f32m1(pp + 8, vle32_v_f32m1(p0 + 8, vl), vl); + pp += 12; + p0 += B_hstep; + } + } + } + for (; jj + 7 < max_jj; jj += 8) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0; + vfloat32m1_t _r1; + vfloat32m1_t _r2; + vfloat32m1_t _r3; + vfloat32m1_t _r4; + vfloat32m1_t _r5; + vfloat32m1_t _r6; + vfloat32m1_t _r7; + vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); + vlseg4e32_v_f32m1(&_r4, &_r5, &_r6, &_r7, p0 + 16, vl); + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r4, vl); + vse32_v_f32m1(pp + 4 * 2, _r1, vl); + vse32_v_f32m1(pp + 4 * 3, _r5, vl); + vse32_v_f32m1(pp + 4 * 4, _r2, vl); + vse32_v_f32m1(pp + 
4 * 5, _r6, vl); + vse32_v_f32m1(pp + 4 * 6, _r3, vl); + vse32_v_f32m1(pp + 4 * 7, _r7, vl); + pp += 32; + p0 += B_hstep * 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj); + + int kk = 0; + for (; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + vse32_v_f32m1(pp + 4, vle32_v_f32m1(p0 + 4, vl), vl); + pp += 8; + p0 += B_hstep; + } + } + } + for (; jj + 3 < max_jj; jj += 4) + { + if (elempack == 4) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _r0; + vfloat32m1_t _r1; + vfloat32m1_t _r2; + vfloat32m1_t _r3; + vlseg4e32_v_f32m1(&_r0, &_r1, &_r2, &_r3, p0, vl); + vse32_v_f32m1(pp, _r0, vl); + vse32_v_f32m1(pp + 4, _r1, vl); + vse32_v_f32m1(pp + 4 * 2, _r2, vl); + vse32_v_f32m1(pp + 4 * 3, _r3, vl); + pp += 16; + p0 += B_hstep * 4; + } + } + if (elempack == 1) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj); + + int kk = 0; + for (; kk < max_kk; kk++) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += B_hstep; + } + } + } +#endif // __riscv_vector + for (; jj + 1 < max_jj; jj += 2) + { +#if __riscv_vector + if (elempack == 4) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(p0, vl); + vfloat32m1_t v1 = vle32_v_f32m1(p0 + 4, vl); + store_float_v2(v0, v1, pp, vl); + pp += 8; + p0 += B_hstep * 4; + } + } +#endif // __riscv_vector + if (elempack == 1) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj); + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[1]; + pp += 2; + p0 += B_hstep; + } + } + } + for (; jj < max_jj; jj += 1) + { +#if __riscv_vector + if (elempack == 4) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj) * 4; + + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vse32_v_f32m1(pp, vle32_v_f32m1(p0, vl), vl); + pp += 4; + p0 += B_hstep * 4; + } + } +#endif // __riscv_vector + if (elempack == 1) + { + const float* p0 = (const float*)B + k * B_hstep + (j + jj); + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp += 1; + p0 += B_hstep; + } + } + } +} + +static void transpose_unpack_output_tile(const Mat& topT, Mat& top_blob, int i, int max_ii, int j, int max_jj, size_t vl) +{ + const int out_elempack = top_blob.elempack; + const int out_hstep = top_blob.dims == 3 ? 
(int)top_blob.cstep : top_blob.w; + + const float* pp = topT; + + int ii = 0; +#if __riscv_vector + for (; ii + 7 < max_ii; ii += 8) + { + if (out_elempack == 4) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + + for (int jj = 0; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(pp, vl); + vfloat32m1_t v1 = vle32_v_f32m1(pp + 8, vl); + vfloat32m1_t v2 = vle32_v_f32m1(pp + 16, vl); + vfloat32m1_t v3 = vle32_v_f32m1(pp + 24, vl); + store_float_v4(v0, v1, v2, v3, p0, vl); + v0 = vle32_v_f32m1(pp + 4, vl); + v1 = vle32_v_f32m1(pp + 12, vl); + v2 = vle32_v_f32m1(pp + 20, vl); + v3 = vle32_v_f32m1(pp + 28, vl); + store_float_v4(v0, v1, v2, v3, p0 + 16, vl); + pp += 32; + p0 += out_hstep * 4; + } + } + if (out_elempack == 1) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii); + + for (int jj = 0; jj < max_jj; jj += 1) + { + vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); + vfloat32m1_t _r1 = vle32_v_f32m1(pp + 4, vl); + vse32_v_f32m1(p0, _r0, vl); + vse32_v_f32m1(p0 + 4, _r1, vl); + pp += 8; + p0 += out_hstep; + } + } + } + for (; ii + 3 < max_ii; ii += 4) + { + if (out_elempack == 4) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + + for (int jj = 0; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t v0 = vle32_v_f32m1(pp, vl); + vfloat32m1_t v1 = vle32_v_f32m1(pp + 4, vl); + vfloat32m1_t v2 = vle32_v_f32m1(pp + 8, vl); + vfloat32m1_t v3 = vle32_v_f32m1(pp + 12, vl); + store_float_v4(v0, v1, v2, v3, p0, vl); + pp += 16; + p0 += out_hstep * 4; + } + } + if (out_elempack == 1) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii); + + for (int jj = 0; jj < max_jj; jj += 1) + { + vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); + vse32_v_f32m1(p0, _r0, vl); + pp += 4; + p0 += out_hstep; + } + } + } +#endif // __riscv_vector + for (; ii + 1 < max_ii; ii += 2) + { +#if __riscv_vector + if (out_elempack == 4) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + + for (int jj = 0; jj + 3 < max_jj; jj += 4) + { + p0[0] = pp[0]; + p0[1] = pp[2]; + p0[2] = pp[4]; + p0[3] = pp[6]; + p0[4] = pp[1]; + p0[5] = pp[3]; + p0[6] = pp[5]; + p0[7] = pp[7]; + pp += 8; + p0 += out_hstep * 4; + } + } +#endif // __riscv_vector + if (out_elempack == 1) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii); + + for (int jj = 0; jj < max_jj; jj += 1) + { + p0[0] = pp[0]; + p0[1] = pp[1]; + pp += 2; + p0 += out_hstep; + } + } + } + for (; ii < max_ii; ii += 1) + { +#if __riscv_vector + if (out_elempack == 4) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii) * 4; + + for (int jj = 0; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t _r0 = vle32_v_f32m1(pp, vl); + vse32_v_f32m1(p0, _r0, vl); + pp += 4; + p0 += out_hstep * 4; + } + } +#endif // __riscv_vector + if (out_elempack == 1) + { + float* p0 = (float*)top_blob + j * out_hstep + (i + ii); + + for (int jj = 0; jj < max_jj; jj += 1) + { + p0[0] = pp[0]; + pp += 1; + p0 += out_hstep; + } + } + } +} + +static void gemm_transB_packed_tile(const Mat& AT_tile, const Mat& BT_tile, const Mat& CT_tile, Mat& topT_tile, Mat& top_blob, int broadcast_type_C, int i, int max_ii, int j, int max_jj, int k, int max_kk, bool k_end, size_t vl) +{ + const int out_elempack = top_blob.elempack; + const int out_hstep = top_blob.dims == 3 ? 
(int)top_blob.cstep : top_blob.w; + + const float* pAT = AT_tile; + const float* pBT = BT_tile; + const float* pC = CT_tile; + + float* outptr = topT_tile; + + int ii = 0; +#if __riscv_vector + for (; ii + 7 < max_ii; ii += 8) + { + float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; + + const float* pB = pBT; + + if (pC) + { + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + pC = (const float*)CT_tile + i + ii; + } + if (broadcast_type_C == 4) + { + pC = (const float*)CT_tile + j; + } + } + + int jj = 0; + for (; jj + 11 < max_jj; jj += 12) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + vfloat32m1_t _sum20; + vfloat32m1_t _sum21; + vfloat32m1_t _sum30; + vfloat32m1_t _sum31; + vfloat32m1_t _sum40; + vfloat32m1_t _sum41; + vfloat32m1_t _sum50; + vfloat32m1_t _sum51; + vfloat32m1_t _sum60; + vfloat32m1_t _sum61; + vfloat32m1_t _sum70; + vfloat32m1_t _sum71; + vfloat32m1_t _sum80; + vfloat32m1_t _sum81; + vfloat32m1_t _sum90; + vfloat32m1_t _sum91; + vfloat32m1_t _suma0; + vfloat32m1_t _suma1; + vfloat32m1_t _sumb0; + vfloat32m1_t _sumb1; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + _sum20 = vfmv_v_f_f32m1(0.f, vl); + _sum21 = vfmv_v_f_f32m1(0.f, vl); + _sum30 = vfmv_v_f_f32m1(0.f, vl); + _sum31 = vfmv_v_f_f32m1(0.f, vl); + _sum40 = vfmv_v_f_f32m1(0.f, vl); + _sum41 = vfmv_v_f_f32m1(0.f, vl); + _sum50 = vfmv_v_f_f32m1(0.f, vl); + _sum51 = vfmv_v_f_f32m1(0.f, vl); + _sum60 = vfmv_v_f_f32m1(0.f, vl); + _sum61 = vfmv_v_f_f32m1(0.f, vl); + _sum70 = vfmv_v_f_f32m1(0.f, vl); + _sum71 = vfmv_v_f_f32m1(0.f, vl); + _sum80 = vfmv_v_f_f32m1(0.f, vl); + _sum81 = vfmv_v_f_f32m1(0.f, vl); + _sum90 = vfmv_v_f_f32m1(0.f, vl); + _sum91 = vfmv_v_f_f32m1(0.f, vl); + _suma0 = vfmv_v_f_f32m1(0.f, vl); + _suma1 = vfmv_v_f_f32m1(0.f, vl); + _sumb0 = vfmv_v_f_f32m1(0.f, vl); + _sumb1 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + _sum20 = vfmv_v_f_f32m1(pC[0], vl); + _sum21 = vfmv_v_f_f32m1(pC[0], vl); + _sum30 = vfmv_v_f_f32m1(pC[0], vl); + _sum31 = vfmv_v_f_f32m1(pC[0], vl); + _sum40 = vfmv_v_f_f32m1(pC[0], vl); + _sum41 = vfmv_v_f_f32m1(pC[0], vl); + _sum50 = vfmv_v_f_f32m1(pC[0], vl); + _sum51 = vfmv_v_f_f32m1(pC[0], vl); + _sum60 = vfmv_v_f_f32m1(pC[0], vl); + _sum61 = vfmv_v_f_f32m1(pC[0], vl); + _sum70 = vfmv_v_f_f32m1(pC[0], vl); + _sum71 = vfmv_v_f_f32m1(pC[0], vl); + _sum80 = vfmv_v_f_f32m1(pC[0], vl); + _sum81 = vfmv_v_f_f32m1(pC[0], vl); + _sum90 = vfmv_v_f_f32m1(pC[0], vl); + _sum91 = vfmv_v_f_f32m1(pC[0], vl); + _suma0 = vfmv_v_f_f32m1(pC[0], vl); + _suma1 = vfmv_v_f_f32m1(pC[0], vl); + _sumb0 = vfmv_v_f_f32m1(pC[0], vl); + _sumb1 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum10 = _sum00; + _sum11 = _sum01; + _sum20 = _sum00; + _sum21 = _sum01; + _sum30 = _sum00; + _sum31 = _sum01; + _sum40 = _sum00; + _sum41 = _sum01; + _sum50 = _sum00; + _sum51 = _sum01; + _sum60 = _sum00; + _sum61 = _sum01; + _sum70 = _sum00; + _sum71 = _sum01; + _sum80 = _sum00; + _sum81 = _sum01; + _sum90 = _sum00; + _sum91 = _sum01; + _suma0 = _sum00; + _suma1 = _sum01; + _sumb0 = _sum00; + _sumb1 = _sum01; + } + if 
(broadcast_type_C == 3) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); + _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); + _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); + _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); + _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); + _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); + _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); + _sum40 = vle32_v_f32m1(pC + 4 * 8, vl); + _sum41 = vle32_v_f32m1(pC + 4 * 9, vl); + _sum50 = vle32_v_f32m1(pC + 4 * 10, vl); + _sum51 = vle32_v_f32m1(pC + 4 * 11, vl); + _sum60 = vle32_v_f32m1(pC + 4 * 12, vl); + _sum61 = vle32_v_f32m1(pC + 4 * 13, vl); + _sum70 = vle32_v_f32m1(pC + 4 * 14, vl); + _sum71 = vle32_v_f32m1(pC + 4 * 15, vl); + _sum80 = vle32_v_f32m1(pC + 4 * 16, vl); + _sum81 = vle32_v_f32m1(pC + 4 * 17, vl); + _sum90 = vle32_v_f32m1(pC + 4 * 18, vl); + _sum91 = vle32_v_f32m1(pC + 4 * 19, vl); + _suma0 = vle32_v_f32m1(pC + 4 * 20, vl); + _suma1 = vle32_v_f32m1(pC + 4 * 21, vl); + _sumb0 = vle32_v_f32m1(pC + 4 * 22, vl); + _sumb1 = vle32_v_f32m1(pC + 4 * 23, vl); + pC += 96; + } + if (broadcast_type_C == 4) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum20 = vfmv_v_f_f32m1(pC[2], vl); + _sum30 = vfmv_v_f_f32m1(pC[3], vl); + _sum40 = vfmv_v_f_f32m1(pC[4], vl); + _sum50 = vfmv_v_f_f32m1(pC[5], vl); + _sum60 = vfmv_v_f_f32m1(pC[6], vl); + _sum70 = vfmv_v_f_f32m1(pC[7], vl); + _sum80 = vfmv_v_f_f32m1(pC[8], vl); + _sum90 = vfmv_v_f_f32m1(pC[9], vl); + _suma0 = vfmv_v_f_f32m1(pC[10], vl); + _sumb0 = vfmv_v_f_f32m1(pC[11], vl); + _sum01 = _sum00; + _sum11 = _sum10; + _sum21 = _sum20; + _sum31 = _sum30; + _sum41 = _sum40; + _sum51 = _sum50; + _sum61 = _sum60; + _sum71 = _sum70; + _sum81 = _sum80; + _sum91 = _sum90; + _suma1 = _suma0; + _sumb1 = _sumb0; + pC += 12; + } + } + } + else + { + _sum00 = vle32_v_f32m1(outptr, vl); + _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); + _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); + _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); + _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); + _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); + _sum40 = vle32_v_f32m1(outptr + 4 * 8, vl); + _sum41 = vle32_v_f32m1(outptr + 4 * 9, vl); + _sum50 = vle32_v_f32m1(outptr + 4 * 10, vl); + _sum51 = vle32_v_f32m1(outptr + 4 * 11, vl); + _sum60 = vle32_v_f32m1(outptr + 4 * 12, vl); + _sum61 = vle32_v_f32m1(outptr + 4 * 13, vl); + _sum70 = vle32_v_f32m1(outptr + 4 * 14, vl); + _sum71 = vle32_v_f32m1(outptr + 4 * 15, vl); + _sum80 = vle32_v_f32m1(outptr + 4 * 16, vl); + _sum81 = vle32_v_f32m1(outptr + 4 * 17, vl); + _sum90 = vle32_v_f32m1(outptr + 4 * 18, vl); + _sum91 = vle32_v_f32m1(outptr + 4 * 19, vl); + _suma0 = vle32_v_f32m1(outptr + 4 * 20, vl); + _suma1 = vle32_v_f32m1(outptr + 4 * 21, vl); + _sumb0 = vle32_v_f32m1(outptr + 4 * 22, vl); + _sumb1 = vle32_v_f32m1(outptr + 4 * 23, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk + 3 < max_kk; kk += 4) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = 
vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); + _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); + _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); + _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); + _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); + _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); + _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); + _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); + + pA += 8; + pB += 12; + + _pA0 = vle32_v_f32m1(pA, vl); + _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); + _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); + _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); + _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); + _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); + _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); + _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); + _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); + + pA += 8; + pB += 12; + + _pA0 = vle32_v_f32m1(pA, vl); + _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); + _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); + _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); + _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); + _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); + _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); + _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); + _sumb1 = 
vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); + + pA += 8; + pB += 12; + + _pA0 = vle32_v_f32m1(pA, vl); + _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); + _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); + _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); + _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); + _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); + _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); + _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); + _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); + + pA += 8; + pB += 12; + } + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + _sum80 = vfmadd_vf_f32m1(_pA0, pB[8], _sum80, vl); + _sum81 = vfmadd_vf_f32m1(_pA1, pB[8], _sum81, vl); + _sum90 = vfmadd_vf_f32m1(_pA0, pB[9], _sum90, vl); + _sum91 = vfmadd_vf_f32m1(_pA1, pB[9], _sum91, vl); + _suma0 = vfmadd_vf_f32m1(_pA0, pB[10], _suma0, vl); + _suma1 = vfmadd_vf_f32m1(_pA1, pB[10], _suma1, vl); + _sumb0 = vfmadd_vf_f32m1(_pA0, pB[11], _sumb0, vl); + _sumb1 = vfmadd_vf_f32m1(_pA1, pB[11], _sumb1, vl); + + pA += 8; + pB += 12; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum10, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum30, vl); + vse32_v_f32m1(outptr0 + 4 * 4, _sum40, vl); + vse32_v_f32m1(outptr0 + 4 * 5, _sum50, vl); + vse32_v_f32m1(outptr0 + 4 * 6, _sum60, vl); + vse32_v_f32m1(outptr0 + 4 * 7, _sum70, vl); + vse32_v_f32m1(outptr0 + 4 * 8, _sum80, vl); + vse32_v_f32m1(outptr0 + 4 * 9, _sum90, vl); + vse32_v_f32m1(outptr0 + 4 * 10, _suma0, vl); + vse32_v_f32m1(outptr0 + 4 * 11, _sumb0, vl); + + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); + vse32_v_f32m1(outptr0 + 
out_hstep * 4 + 4, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 4, _sum41, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 5, _sum51, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 6, _sum61, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 7, _sum71, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 8, _sum81, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 9, _sum91, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 10, _suma1, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 11, _sumb1, vl); + + outptr0 += 48; + } + if (out_elempack == 1) + { + transpose8x12_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, _sum80, _sum81, _sum90, _sum91, _suma0, _suma1, _sumb0, _sumb1, vl); + + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum01, vl); + vse32_v_f32m1(outptr0 + 8, _sum10, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum20, vl); + vse32_v_f32m1(outptr0 + out_hstep + 8, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum30, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum31, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 8, _sum40, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum41, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum50, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 8, _sum51, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum60, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum61, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 8, _sum70, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5, _sum71, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5 + 4, _sum80, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5 + 8, _sum81, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6, _sum90, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6 + 4, _sum91, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6 + 8, _suma0, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7, _suma1, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7 + 4, _sumb0, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7 + 8, _sumb1, vl); + + outptr0 += 12; + } + } + else + { + vse32_v_f32m1(outptr, _sum00, vl); + vse32_v_f32m1(outptr + 4, _sum01, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); + vse32_v_f32m1(outptr + 4 * 4, _sum20, vl); + vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); + vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); + vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); + vse32_v_f32m1(outptr + 4 * 8, _sum40, vl); + vse32_v_f32m1(outptr + 4 * 9, _sum41, vl); + vse32_v_f32m1(outptr + 4 * 10, _sum50, vl); + vse32_v_f32m1(outptr + 4 * 11, _sum51, vl); + vse32_v_f32m1(outptr + 4 * 12, _sum60, vl); + vse32_v_f32m1(outptr + 4 * 13, _sum61, vl); + vse32_v_f32m1(outptr + 4 * 14, _sum70, vl); + vse32_v_f32m1(outptr + 4 * 15, _sum71, vl); + vse32_v_f32m1(outptr + 4 * 16, _sum80, vl); + vse32_v_f32m1(outptr + 4 * 17, _sum81, vl); + vse32_v_f32m1(outptr + 4 * 18, _sum90, vl); + vse32_v_f32m1(outptr + 4 * 19, _sum91, vl); + vse32_v_f32m1(outptr + 4 * 20, _suma0, vl); + vse32_v_f32m1(outptr + 4 * 21, _suma1, vl); + vse32_v_f32m1(outptr + 4 * 22, _sumb0, vl); + vse32_v_f32m1(outptr + 4 * 23, _sumb1, vl); + } + + outptr += 96; + } + for (; jj + 7 < max_jj; jj += 8) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + vfloat32m1_t _sum20; + vfloat32m1_t _sum21; + 
vfloat32m1_t _sum30; + vfloat32m1_t _sum31; + vfloat32m1_t _sum40; + vfloat32m1_t _sum41; + vfloat32m1_t _sum50; + vfloat32m1_t _sum51; + vfloat32m1_t _sum60; + vfloat32m1_t _sum61; + vfloat32m1_t _sum70; + vfloat32m1_t _sum71; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + _sum20 = vfmv_v_f_f32m1(0.f, vl); + _sum21 = vfmv_v_f_f32m1(0.f, vl); + _sum30 = vfmv_v_f_f32m1(0.f, vl); + _sum31 = vfmv_v_f_f32m1(0.f, vl); + _sum40 = vfmv_v_f_f32m1(0.f, vl); + _sum41 = vfmv_v_f_f32m1(0.f, vl); + _sum50 = vfmv_v_f_f32m1(0.f, vl); + _sum51 = vfmv_v_f_f32m1(0.f, vl); + _sum60 = vfmv_v_f_f32m1(0.f, vl); + _sum61 = vfmv_v_f_f32m1(0.f, vl); + _sum70 = vfmv_v_f_f32m1(0.f, vl); + _sum71 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + _sum20 = vfmv_v_f_f32m1(pC[0], vl); + _sum21 = vfmv_v_f_f32m1(pC[0], vl); + _sum30 = vfmv_v_f_f32m1(pC[0], vl); + _sum31 = vfmv_v_f_f32m1(pC[0], vl); + _sum40 = vfmv_v_f_f32m1(pC[0], vl); + _sum41 = vfmv_v_f_f32m1(pC[0], vl); + _sum50 = vfmv_v_f_f32m1(pC[0], vl); + _sum51 = vfmv_v_f_f32m1(pC[0], vl); + _sum60 = vfmv_v_f_f32m1(pC[0], vl); + _sum61 = vfmv_v_f_f32m1(pC[0], vl); + _sum70 = vfmv_v_f_f32m1(pC[0], vl); + _sum71 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum10 = _sum00; + _sum11 = _sum01; + _sum20 = _sum00; + _sum21 = _sum01; + _sum30 = _sum00; + _sum31 = _sum01; + _sum40 = _sum00; + _sum41 = _sum01; + _sum50 = _sum00; + _sum51 = _sum01; + _sum60 = _sum00; + _sum61 = _sum01; + _sum70 = _sum00; + _sum71 = _sum01; + } + if (broadcast_type_C == 3) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); + _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); + _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); + _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); + _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); + _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); + _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); + _sum40 = vle32_v_f32m1(pC + 4 * 8, vl); + _sum41 = vle32_v_f32m1(pC + 4 * 9, vl); + _sum50 = vle32_v_f32m1(pC + 4 * 10, vl); + _sum51 = vle32_v_f32m1(pC + 4 * 11, vl); + _sum60 = vle32_v_f32m1(pC + 4 * 12, vl); + _sum61 = vle32_v_f32m1(pC + 4 * 13, vl); + _sum70 = vle32_v_f32m1(pC + 4 * 14, vl); + _sum71 = vle32_v_f32m1(pC + 4 * 15, vl); + pC += 64; + } + if (broadcast_type_C == 4) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum20 = vfmv_v_f_f32m1(pC[2], vl); + _sum30 = vfmv_v_f_f32m1(pC[3], vl); + _sum40 = vfmv_v_f_f32m1(pC[4], vl); + _sum50 = vfmv_v_f_f32m1(pC[5], vl); + _sum60 = vfmv_v_f_f32m1(pC[6], vl); + _sum70 = vfmv_v_f_f32m1(pC[7], vl); + _sum01 = _sum00; + _sum11 = _sum10; + _sum21 = _sum20; + _sum31 = _sum30; + _sum41 = _sum40; + _sum51 = _sum50; + _sum61 = _sum60; + _sum71 = _sum70; + pC += 8; + } + } + } + else + { + _sum00 = vle32_v_f32m1(outptr, vl); + _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); + _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); + _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); + _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); + _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); + _sum40 = vle32_v_f32m1(outptr + 4 * 8, vl); + _sum41 = 
vle32_v_f32m1(outptr + 4 * 9, vl); + _sum50 = vle32_v_f32m1(outptr + 4 * 10, vl); + _sum51 = vle32_v_f32m1(outptr + 4 * 11, vl); + _sum60 = vle32_v_f32m1(outptr + 4 * 12, vl); + _sum61 = vle32_v_f32m1(outptr + 4 * 13, vl); + _sum70 = vle32_v_f32m1(outptr + 4 * 14, vl); + _sum71 = vle32_v_f32m1(outptr + 4 * 15, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + _sum40 = vfmadd_vf_f32m1(_pA0, pB[4], _sum40, vl); + _sum41 = vfmadd_vf_f32m1(_pA1, pB[4], _sum41, vl); + _sum50 = vfmadd_vf_f32m1(_pA0, pB[5], _sum50, vl); + _sum51 = vfmadd_vf_f32m1(_pA1, pB[5], _sum51, vl); + _sum60 = vfmadd_vf_f32m1(_pA0, pB[6], _sum60, vl); + _sum61 = vfmadd_vf_f32m1(_pA1, pB[6], _sum61, vl); + _sum70 = vfmadd_vf_f32m1(_pA0, pB[7], _sum70, vl); + _sum71 = vfmadd_vf_f32m1(_pA1, pB[7], _sum71, vl); + + pA += 8; + pB += 8; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum10, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum30, vl); + vse32_v_f32m1(outptr0 + 4 * 4, _sum40, vl); + vse32_v_f32m1(outptr0 + 4 * 5, _sum50, vl); + vse32_v_f32m1(outptr0 + 4 * 6, _sum60, vl); + vse32_v_f32m1(outptr0 + 4 * 7, _sum70, vl); + + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 4, _sum41, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 5, _sum51, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 6, _sum61, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 7, _sum71, vl); + + outptr0 += 32; + } + if (out_elempack == 1) + { + transpose8x8_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, _sum40, _sum41, _sum50, _sum51, _sum60, _sum61, _sum70, _sum71, vl); + + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum20, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum30, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum31, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum40, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum41, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5, _sum50, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5 + 4, _sum51, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6, _sum60, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6 + 4, _sum61, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7, _sum70, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7 + 4, _sum71, vl); + + outptr0 += 8; + } + } + else + { + vse32_v_f32m1(outptr, _sum00, vl); + vse32_v_f32m1(outptr + 4, _sum01, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); + vse32_v_f32m1(outptr + 4 * 
4, _sum20, vl); + vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); + vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); + vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); + vse32_v_f32m1(outptr + 4 * 8, _sum40, vl); + vse32_v_f32m1(outptr + 4 * 9, _sum41, vl); + vse32_v_f32m1(outptr + 4 * 10, _sum50, vl); + vse32_v_f32m1(outptr + 4 * 11, _sum51, vl); + vse32_v_f32m1(outptr + 4 * 12, _sum60, vl); + vse32_v_f32m1(outptr + 4 * 13, _sum61, vl); + vse32_v_f32m1(outptr + 4 * 14, _sum70, vl); + vse32_v_f32m1(outptr + 4 * 15, _sum71, vl); + } + + outptr += 64; + } + for (; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + vfloat32m1_t _sum20; + vfloat32m1_t _sum21; + vfloat32m1_t _sum30; + vfloat32m1_t _sum31; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + _sum20 = vfmv_v_f_f32m1(0.f, vl); + _sum21 = vfmv_v_f_f32m1(0.f, vl); + _sum30 = vfmv_v_f_f32m1(0.f, vl); + _sum31 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + _sum20 = vfmv_v_f_f32m1(pC[0], vl); + _sum21 = vfmv_v_f_f32m1(pC[0], vl); + _sum30 = vfmv_v_f_f32m1(pC[0], vl); + _sum31 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum10 = _sum00; + _sum11 = _sum01; + _sum20 = _sum00; + _sum21 = _sum01; + _sum30 = _sum00; + _sum31 = _sum01; + } + if (broadcast_type_C == 3) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); + _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); + _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); + _sum20 = vle32_v_f32m1(pC + 4 * 4, vl); + _sum21 = vle32_v_f32m1(pC + 4 * 5, vl); + _sum30 = vle32_v_f32m1(pC + 4 * 6, vl); + _sum31 = vle32_v_f32m1(pC + 4 * 7, vl); + pC += 32; + } + if (broadcast_type_C == 4) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum20 = vfmv_v_f_f32m1(pC[2], vl); + _sum30 = vfmv_v_f_f32m1(pC[3], vl); + _sum01 = _sum00; + _sum11 = _sum10; + _sum21 = _sum20; + _sum31 = _sum30; + pC += 4; + } + } + } + else + { + _sum00 = vle32_v_f32m1(outptr, vl); + _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); + _sum20 = vle32_v_f32m1(outptr + 4 * 4, vl); + _sum21 = vle32_v_f32m1(outptr + 4 * 5, vl); + _sum30 = vle32_v_f32m1(outptr + 4 * 6, vl); + _sum31 = vle32_v_f32m1(outptr + 4 * 7, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + _sum20 = vfmadd_vf_f32m1(_pA0, pB[2], _sum20, vl); + _sum21 = vfmadd_vf_f32m1(_pA1, pB[2], _sum21, vl); + _sum30 = vfmadd_vf_f32m1(_pA0, pB[3], _sum30, vl); + _sum31 = vfmadd_vf_f32m1(_pA1, pB[3], _sum31, vl); + + pA += 8; + pB += 4; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum10, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum20, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum30, 
vl); + + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 2, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4 * 3, _sum31, vl); + + outptr0 += 16; + } + if (out_elempack == 1) + { + transpose8x4_ps(_sum00, _sum01, _sum10, _sum11, _sum20, _sum21, _sum30, _sum31, vl); + + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + out_hstep * 1, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum10, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum20, vl); + vse32_v_f32m1(outptr0 + out_hstep * 5, _sum21, vl); + vse32_v_f32m1(outptr0 + out_hstep * 6, _sum30, vl); + vse32_v_f32m1(outptr0 + out_hstep * 7, _sum31, vl); + + outptr0 += 4; + } + } + else + { + vse32_v_f32m1(outptr, _sum00, vl); + vse32_v_f32m1(outptr + 4, _sum01, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); + vse32_v_f32m1(outptr + 4 * 4, _sum20, vl); + vse32_v_f32m1(outptr + 4 * 5, _sum21, vl); + vse32_v_f32m1(outptr + 4 * 6, _sum30, vl); + vse32_v_f32m1(outptr + 4 * 7, _sum31, vl); + } + + outptr += 32; + } + for (; jj + 1 < max_jj; jj += 2) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum10 = _sum00; + _sum11 = _sum01; + } + if (broadcast_type_C == 3) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4 * 1, vl); + _sum10 = vle32_v_f32m1(pC + 4 * 2, vl); + _sum11 = vle32_v_f32m1(pC + 4 * 3, vl); + pC += 16; + } + if (broadcast_type_C == 4) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum01 = _sum00; + _sum11 = _sum10; + pC += 2; + } + } + } + else + { + _sum00 = vle32_v_f32m1(outptr, vl); + _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum10 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum11 = vle32_v_f32m1(outptr + 4 * 3, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pA0, pB[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pA1, pB[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pA0, pB[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pA1, pB[1], _sum11, vl); + + pA += 8; + pB += 2; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum10, vl); + + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4 + 4, _sum11, vl); + outptr0 += 8; + } + if (out_elempack == 1) + { + float sum0[8]; + float sum1[8]; + vse32_v_f32m1(sum0, _sum00, vl); + vse32_v_f32m1(sum0 + 4, _sum01, vl); + vse32_v_f32m1(sum1, _sum10, vl); + vse32_v_f32m1(sum1 + 4, _sum11, vl); + + outptr0[0] = sum0[0]; + outptr0[out_hstep] = sum0[1]; + outptr0[out_hstep * 2] = sum0[2]; + outptr0[out_hstep * 3] = sum0[3]; + outptr0[out_hstep * 4] = sum0[4]; + outptr0[out_hstep * 5] = sum0[5]; + 
outptr0[out_hstep * 6] = sum0[6]; + outptr0[out_hstep * 7] = sum0[7]; + + outptr0[1] = sum1[0]; + outptr0[out_hstep + 1] = sum1[1]; + outptr0[out_hstep * 2 + 1] = sum1[2]; + outptr0[out_hstep * 3 + 1] = sum1[3]; + outptr0[out_hstep * 4 + 1] = sum1[4]; + outptr0[out_hstep * 5 + 1] = sum1[5]; + outptr0[out_hstep * 6 + 1] = sum1[6]; + outptr0[out_hstep * 7 + 1] = sum1[7]; + outptr0 += 2; + } + } + else + { + vse32_v_f32m1(outptr, _sum00, vl); + vse32_v_f32m1(outptr + 4, _sum01, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum10, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum11, vl); + } + + outptr += 16; + } + for (; jj < max_jj; jj += 1) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + } + if (broadcast_type_C == 3) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + pC += 8; + } + if (broadcast_type_C == 4) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = _sum00; + pC += 1; + } + } + } + else + { + _sum00 = vle32_v_f32m1(outptr, vl); + _sum01 = vle32_v_f32m1(outptr + 4 * 1, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA0 = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pA1 = vle32_v_f32m1(pA + 4, vl); + + vfloat32m1_t _pB = vfmv_v_f_f32m1(pB[0], vl); + + _sum00 = vfmadd_vv_f32m1(_pA0, _pB, _sum00, vl); + _sum01 = vfmadd_vv_f32m1(_pA1, _pB, _sum01, vl); + + pA += 8; + pB += 1; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + out_hstep * 4, _sum01, vl); + outptr0 += 4; + } + if (out_elempack == 1) + { + float sum0[8]; + vse32_v_f32m1(sum0, _sum00, vl); + vse32_v_f32m1(sum0 + 4, _sum01, vl); + + outptr0[0] = sum0[0]; + outptr0[out_hstep * 1] = sum0[1]; + outptr0[out_hstep * 2] = sum0[2]; + outptr0[out_hstep * 3] = sum0[3]; + outptr0[out_hstep * 4] = sum0[4]; + outptr0[out_hstep * 5] = sum0[5]; + outptr0[out_hstep * 6] = sum0[6]; + outptr0[out_hstep * 7] = sum0[7]; + outptr0++; + } + } + else + { + vse32_v_f32m1(outptr, _sum00, vl); + vse32_v_f32m1(outptr + 4, _sum01, vl); + } + + outptr += 8; + } + + pAT += max_kk * 8; + } + for (; ii + 3 < max_ii; ii += 4) + { + float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j * out_elempack; + + const float* pB = pBT; + + if (pC) + { + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + pC = (const float*)CT_tile + i + ii; + } + if (broadcast_type_C == 4) + { + pC = (const float*)CT_tile + j; + } + } + + int jj = 0; + for (; jj + 11 < max_jj; jj += 12) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + vfloat32m1_t _sum3; + vfloat32m1_t _sum4; + vfloat32m1_t _sum5; + vfloat32m1_t _sum6; + vfloat32m1_t _sum7; + vfloat32m1_t _sum8; + vfloat32m1_t _sum9; + vfloat32m1_t _suma; + vfloat32m1_t _sumb; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum2 = vfmv_v_f_f32m1(0.f, vl); + _sum3 = vfmv_v_f_f32m1(0.f, vl); + _sum4 = vfmv_v_f_f32m1(0.f, vl); + _sum5 = vfmv_v_f_f32m1(0.f, vl); + _sum6 = vfmv_v_f_f32m1(0.f, vl); + _sum7 = vfmv_v_f_f32m1(0.f, vl); + _sum8 = vfmv_v_f_f32m1(0.f, vl); + _sum9 = vfmv_v_f_f32m1(0.f, vl); + _suma = vfmv_v_f_f32m1(0.f, vl); + _sumb = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { 
+ if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum2 = vfmv_v_f_f32m1(pC[0], vl); + _sum3 = vfmv_v_f_f32m1(pC[0], vl); + _sum4 = vfmv_v_f_f32m1(pC[0], vl); + _sum5 = vfmv_v_f_f32m1(pC[0], vl); + _sum6 = vfmv_v_f_f32m1(pC[0], vl); + _sum7 = vfmv_v_f_f32m1(pC[0], vl); + _sum8 = vfmv_v_f_f32m1(pC[0], vl); + _sum9 = vfmv_v_f_f32m1(pC[0], vl); + _suma = vfmv_v_f_f32m1(pC[0], vl); + _sumb = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + _sum4 = _sum0; + _sum5 = _sum0; + _sum6 = _sum0; + _sum7 = _sum0; + _sum8 = _sum0; + _sum9 = _sum0; + _suma = _sum0; + _sumb = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + _sum2 = vle32_v_f32m1(pC + 8, vl); + _sum3 = vle32_v_f32m1(pC + 12, vl); + _sum4 = vle32_v_f32m1(pC + 16, vl); + _sum5 = vle32_v_f32m1(pC + 20, vl); + _sum6 = vle32_v_f32m1(pC + 24, vl); + _sum7 = vle32_v_f32m1(pC + 28, vl); + _sum8 = vle32_v_f32m1(pC + 32, vl); + _sum9 = vle32_v_f32m1(pC + 36, vl); + _suma = vle32_v_f32m1(pC + 40, vl); + _sumb = vle32_v_f32m1(pC + 44, vl); + pC += 48; + } + if (broadcast_type_C == 4) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[1], vl); + _sum2 = vfmv_v_f_f32m1(pC[2], vl); + _sum3 = vfmv_v_f_f32m1(pC[3], vl); + _sum4 = vfmv_v_f_f32m1(pC[4], vl); + _sum5 = vfmv_v_f_f32m1(pC[5], vl); + _sum6 = vfmv_v_f_f32m1(pC[6], vl); + _sum7 = vfmv_v_f_f32m1(pC[7], vl); + _sum8 = vfmv_v_f_f32m1(pC[8], vl); + _sum9 = vfmv_v_f_f32m1(pC[9], vl); + _suma = vfmv_v_f_f32m1(pC[10], vl); + _sumb = vfmv_v_f_f32m1(pC[11], vl); + pC += 12; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); + _sum4 = vle32_v_f32m1(outptr + 4 * 4, vl); + _sum5 = vle32_v_f32m1(outptr + 4 * 5, vl); + _sum6 = vle32_v_f32m1(outptr + 4 * 6, vl); + _sum7 = vle32_v_f32m1(outptr + 4 * 7, vl); + _sum8 = vle32_v_f32m1(outptr + 4 * 8, vl); + _sum9 = vle32_v_f32m1(outptr + 4 * 9, vl); + _suma = vle32_v_f32m1(outptr + 4 * 10, vl); + _sumb = vle32_v_f32m1(outptr + 4 * 11, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + + _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); + _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); + _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); + _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); + _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); + _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); + _sum8 = vfmadd_vf_f32m1(_pA, pB[8], _sum8, vl); + _sum9 = vfmadd_vf_f32m1(_pA, pB[9], _sum9, vl); + _suma = vfmadd_vf_f32m1(_pA, pB[10], _suma, vl); + _sumb = vfmadd_vf_f32m1(_pA, pB[11], _sumb, vl); + + pA += 4; + pB += 12; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); + vse32_v_f32m1(outptr0 + 4 * 4, _sum4, vl); + vse32_v_f32m1(outptr0 + 4 * 5, _sum5, vl); + vse32_v_f32m1(outptr0 + 4 * 6, _sum6, vl); + vse32_v_f32m1(outptr0 + 4 * 7, _sum7, vl); + vse32_v_f32m1(outptr0 + 4 * 8, _sum8, vl); + vse32_v_f32m1(outptr0 + 4 * 9, _sum9, vl); + 
vse32_v_f32m1(outptr0 + 4 * 10, _suma, vl); + vse32_v_f32m1(outptr0 + 4 * 11, _sumb, vl); + outptr0 += 48; + } + if (out_elempack == 1) + { + transpose4x12_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, _sum8, _sum9, _suma, _sumb, vl); + + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + 8, _sum2, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum3, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum4, vl); + vse32_v_f32m1(outptr0 + out_hstep + 8, _sum5, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum6, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum7, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 8, _sum8, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum9, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _suma, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 8, _sumb, vl); + outptr0 += 12; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); + vse32_v_f32m1(outptr + 4 * 4, _sum4, vl); + vse32_v_f32m1(outptr + 4 * 5, _sum5, vl); + vse32_v_f32m1(outptr + 4 * 6, _sum6, vl); + vse32_v_f32m1(outptr + 4 * 7, _sum7, vl); + vse32_v_f32m1(outptr + 4 * 8, _sum8, vl); + vse32_v_f32m1(outptr + 4 * 9, _sum9, vl); + vse32_v_f32m1(outptr + 4 * 10, _suma, vl); + vse32_v_f32m1(outptr + 4 * 11, _sumb, vl); + } + + outptr += 48; + } + for (; jj + 7 < max_jj; jj += 8) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + vfloat32m1_t _sum3; + vfloat32m1_t _sum4; + vfloat32m1_t _sum5; + vfloat32m1_t _sum6; + vfloat32m1_t _sum7; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum2 = vfmv_v_f_f32m1(0.f, vl); + _sum3 = vfmv_v_f_f32m1(0.f, vl); + _sum4 = vfmv_v_f_f32m1(0.f, vl); + _sum5 = vfmv_v_f_f32m1(0.f, vl); + _sum6 = vfmv_v_f_f32m1(0.f, vl); + _sum7 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum2 = vfmv_v_f_f32m1(pC[0], vl); + _sum3 = vfmv_v_f_f32m1(pC[0], vl); + _sum4 = vfmv_v_f_f32m1(pC[0], vl); + _sum5 = vfmv_v_f_f32m1(pC[0], vl); + _sum6 = vfmv_v_f_f32m1(pC[0], vl); + _sum7 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + _sum4 = _sum0; + _sum5 = _sum0; + _sum6 = _sum0; + _sum7 = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + _sum2 = vle32_v_f32m1(pC + 8, vl); + _sum3 = vle32_v_f32m1(pC + 12, vl); + _sum4 = vle32_v_f32m1(pC + 16, vl); + _sum5 = vle32_v_f32m1(pC + 20, vl); + _sum6 = vle32_v_f32m1(pC + 24, vl); + _sum7 = vle32_v_f32m1(pC + 28, vl); + pC += 32; + } + if (broadcast_type_C == 4) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[1], vl); + _sum2 = vfmv_v_f_f32m1(pC[2], vl); + _sum3 = vfmv_v_f_f32m1(pC[3], vl); + _sum4 = vfmv_v_f_f32m1(pC[4], vl); + _sum5 = vfmv_v_f_f32m1(pC[5], vl); + _sum6 = vfmv_v_f_f32m1(pC[6], vl); + _sum7 = vfmv_v_f_f32m1(pC[7], vl); + pC += 8; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); + _sum4 = vle32_v_f32m1(outptr + 4 * 4, vl); + _sum5 = vle32_v_f32m1(outptr + 4 * 5, vl); + _sum6 = vle32_v_f32m1(outptr + 4 * 6, vl); + _sum7 = 
vle32_v_f32m1(outptr + 4 * 7, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + + _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); + _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); + _sum4 = vfmadd_vf_f32m1(_pA, pB[4], _sum4, vl); + _sum5 = vfmadd_vf_f32m1(_pA, pB[5], _sum5, vl); + _sum6 = vfmadd_vf_f32m1(_pA, pB[6], _sum6, vl); + _sum7 = vfmadd_vf_f32m1(_pA, pB[7], _sum7, vl); + + pA += 4; + pB += 8; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); + vse32_v_f32m1(outptr0 + 4 * 4, _sum4, vl); + vse32_v_f32m1(outptr0 + 4 * 5, _sum5, vl); + vse32_v_f32m1(outptr0 + 4 * 6, _sum6, vl); + vse32_v_f32m1(outptr0 + 4 * 7, _sum7, vl); + outptr0 += 32; + } + if (out_elempack == 1) + { + transpose4x8_ps(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); + + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum2, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum3, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum4, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2 + 4, _sum5, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum6, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3 + 4, _sum7, vl); + outptr0 += 8; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); + vse32_v_f32m1(outptr + 4 * 4, _sum4, vl); + vse32_v_f32m1(outptr + 4 * 5, _sum5, vl); + vse32_v_f32m1(outptr + 4 * 6, _sum6, vl); + vse32_v_f32m1(outptr + 4 * 7, _sum7, vl); + } + + outptr += 32; + } + for (; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + vfloat32m1_t _sum3; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum2 = vfmv_v_f_f32m1(0.f, vl); + _sum3 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum2 = vfmv_v_f_f32m1(pC[0], vl); + _sum3 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + _sum2 = _sum0; + _sum3 = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + _sum2 = vle32_v_f32m1(pC + 8, vl); + _sum3 = vle32_v_f32m1(pC + 12, vl); + pC += 16; + } + if (broadcast_type_C == 4) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[1], vl); + _sum2 = vfmv_v_f_f32m1(pC[2], vl); + _sum3 = vfmv_v_f_f32m1(pC[3], vl); + pC += 4; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4 * 1, vl); + _sum2 = vle32_v_f32m1(outptr + 4 * 2, vl); + _sum3 = vle32_v_f32m1(outptr + 4 * 3, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + + _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + _sum2 = vfmadd_vf_f32m1(_pA, pB[2], _sum2, vl); + _sum3 = vfmadd_vf_f32m1(_pA, pB[3], _sum3, vl); + pA += 4; + pB += 4; + } + + if (k_end) + { + if (out_elempack == 4) + { + 
vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr0 + 4 * 3, _sum3, vl); + outptr0 += 16; + } + if (out_elempack == 1) + { + transpose4x4_ps(_sum0, _sum1, _sum2, _sum3, vl); + + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + out_hstep * 1, _sum1, vl); + vse32_v_f32m1(outptr0 + out_hstep * 2, _sum2, vl); + vse32_v_f32m1(outptr0 + out_hstep * 3, _sum3, vl); + outptr0 += 4; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + vse32_v_f32m1(outptr + 4 * 2, _sum2, vl); + vse32_v_f32m1(outptr + 4 * 3, _sum3, vl); + } + + outptr += 16; + } + for (; jj + 1 < max_jj; jj += 2) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + } + if (broadcast_type_C == 3) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + pC += 8; + } + if (broadcast_type_C == 4) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[1], vl); + pC += 2; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + + _sum0 = vfmadd_vf_f32m1(_pA, pB[0], _sum0, vl); + _sum1 = vfmadd_vf_f32m1(_pA, pB[1], _sum1, vl); + + pA += 4; + pB += 2; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + outptr0 += 8; + } + if (out_elempack == 1) + { + float sum0[4]; + float sum1[4]; + vse32_v_f32m1(sum0, _sum0, vl); + vse32_v_f32m1(sum1, _sum1, vl); + + outptr0[0] = sum0[0]; + outptr0[out_hstep] = sum0[1]; + outptr0[out_hstep * 2] = sum0[2]; + outptr0[out_hstep * 3] = sum0[3]; + outptr0[1] = sum1[0]; + outptr0[out_hstep + 1] = sum1[1]; + outptr0[out_hstep * 2 + 1] = sum1[2]; + outptr0[out_hstep * 3 + 1] = sum1[3]; + outptr0 += 2; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + } + + outptr += 8; + } + for (; jj < max_jj; jj += 1) + { + vfloat32m1_t _sum0; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vle32_v_f32m1(pC, vl); + } + if (broadcast_type_C == 3) + { + _sum0 = vle32_v_f32m1(pC, vl); + pC += 4; + } + if (broadcast_type_C == 4) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + pC += 1; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pA = vle32_v_f32m1(pA, vl); + vfloat32m1_t _pB = vfmv_v_f_f32m1(pB[0], vl); + + _sum0 = vfmadd_vv_f32m1(_pA, _pB, _sum0, vl); + + pA += 4; + pB += 1; + } + + if (k_end) + { + if (out_elempack == 4) + { + vse32_v_f32m1(outptr0, _sum0, vl); + outptr0 += 4; + } + if (out_elempack == 1) + { + float sum0[4]; + vse32_v_f32m1(sum0, _sum0, vl); + + outptr0[0] = sum0[0]; + outptr0[out_hstep] = sum0[1]; + outptr0[out_hstep * 2] = sum0[2]; + outptr0[out_hstep * 3] = sum0[3]; + outptr0++; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + } + + outptr += 4; + } + 
+ pAT += max_kk * 4; + } +#endif // __riscv_vector + for (; ii + 1 < max_ii; ii += 2) + { + float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j; + + const float* pB = pBT; + + if (pC) + { + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + pC = (const float*)CT_tile + i + ii; + } + if (broadcast_type_C == 4) + { + pC = (const float*)CT_tile + j; + } + } + + int jj = 0; +#if __riscv_vector + for (; jj + 11 < max_jj; jj += 12) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum02; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + vfloat32m1_t _sum12; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum02 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + _sum12 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum02 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + _sum12 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum02 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum11 = vfmv_v_f_f32m1(pC[1], vl); + _sum12 = vfmv_v_f_f32m1(pC[1], vl); + } + if (broadcast_type_C == 3) + { + vlseg2e32_v_f32m1(&_sum00, &_sum10, pC, vl); + vlseg2e32_v_f32m1(&_sum01, &_sum11, pC + 8, vl); + vlseg2e32_v_f32m1(&_sum02, &_sum12, pC + 16, vl); + pC += 24; + } + if (broadcast_type_C == 4) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum02 = vle32_v_f32m1(pC + 8, vl); + _sum10 = _sum00; + _sum11 = _sum01; + _sum12 = _sum02; + pC += 12; + } + } + } + else + { + vlseg2e32_v_f32m1(&_sum00, &_sum10, outptr, vl); + vlseg2e32_v_f32m1(&_sum01, &_sum11, outptr + 8, vl); + vlseg2e32_v_f32m1(&_sum02, &_sum12, outptr + 16, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); + vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); + + _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); + _sum02 = vfmadd_vf_f32m1(_pB2, pA[0], _sum02, vl); + _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); + _sum12 = vfmadd_vf_f32m1(_pB2, pA[1], _sum12, vl); + + pA += 2; + pB += 12; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum01, vl); + vse32_v_f32m1(outptr0 + 8, _sum02, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); + vse32_v_f32m1(outptr0 + out_hstep + 8, _sum12, vl); + outptr0 += 12; + } + } + else + { + store_float_v2(_sum00, _sum10, outptr, vl); + store_float_v2(_sum01, _sum11, outptr + 8, vl); + store_float_v2(_sum02, _sum12, outptr + 16, vl); + } + + outptr += 24; + } + for (; jj + 7 < max_jj; jj += 8) + { + vfloat32m1_t _sum00; + vfloat32m1_t _sum01; + vfloat32m1_t _sum10; + vfloat32m1_t _sum11; + + if (k == 0) + { + _sum00 = vfmv_v_f_f32m1(0.f, vl); + _sum01 = vfmv_v_f_f32m1(0.f, vl); + _sum10 = vfmv_v_f_f32m1(0.f, vl); + _sum11 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = 
vfmv_v_f_f32m1(pC[0], vl); + _sum11 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum00 = vfmv_v_f_f32m1(pC[0], vl); + _sum01 = vfmv_v_f_f32m1(pC[0], vl); + _sum10 = vfmv_v_f_f32m1(pC[1], vl); + _sum11 = vfmv_v_f_f32m1(pC[1], vl); + } + if (broadcast_type_C == 3) + { + vlseg2e32_v_f32m1(&_sum00, &_sum10, pC, vl); + vlseg2e32_v_f32m1(&_sum01, &_sum11, pC + 8, vl); + pC += 16; + } + if (broadcast_type_C == 4) + { + _sum00 = vle32_v_f32m1(pC, vl); + _sum01 = vle32_v_f32m1(pC + 4, vl); + _sum10 = _sum00; + _sum11 = _sum01; + pC += 8; + } + } + } + else + { + vlseg2e32_v_f32m1(&_sum00, &_sum10, outptr, vl); + vlseg2e32_v_f32m1(&_sum01, &_sum11, outptr + 8, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); + + _sum00 = vfmadd_vf_f32m1(_pB0, pA[0], _sum00, vl); + _sum01 = vfmadd_vf_f32m1(_pB1, pA[0], _sum01, vl); + _sum10 = vfmadd_vf_f32m1(_pB0, pA[1], _sum10, vl); + _sum11 = vfmadd_vf_f32m1(_pB1, pA[1], _sum11, vl); + pA += 2; + pB += 8; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum00, vl); + vse32_v_f32m1(outptr0 + 4, _sum01, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum10, vl); + vse32_v_f32m1(outptr0 + out_hstep + 4, _sum11, vl); + outptr0 += 8; + } + } + else + { + store_float_v2(_sum00, _sum10, outptr, vl); + store_float_v2(_sum01, _sum11, outptr + 8, vl); + } + + outptr += 16; + } + for (; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[1], vl); + } + if (broadcast_type_C == 3) + { + vlseg2e32_v_f32m1(&_sum0, &_sum1, pC, vl); + pC += 8; + } + if (broadcast_type_C == 4) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = _sum0; + pC += 4; + } + } + } + else + { + vfloat32m1_t _tmp0; + vfloat32m1_t _tmp1; + vlseg2e32_v_f32m1(&_tmp0, &_tmp1, outptr, vl); + _sum0 = _tmp0; + _sum1 = _tmp1; + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); + + _sum0 = vfmadd_vf_f32m1(_pB, pA[0], _sum0, vl); + _sum1 = vfmadd_vf_f32m1(_pB, pA[1], _sum1, vl); + + pA += 2; + pB += 4; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + out_hstep, _sum1, vl); + outptr0 += 4; + } + } + else + { + store_float_v2(_sum0, _sum1, outptr, vl); + } + + outptr += 8; + } +#endif // __riscv_vector + for (; jj + 1 < max_jj; jj += 2) + { + float sum00; + float sum01; + float sum10; + float sum11; + + if (k == 0) + { + sum00 = 0.f; + sum01 = 0.f; + sum10 = 0.f; + sum11 = 0.f; + + if (pC) + { + if (broadcast_type_C == 0) + { + sum00 = pC[0]; + sum01 = pC[0]; + sum10 = pC[0]; + sum11 = pC[0]; + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + sum00 = pC[0]; + sum01 = pC[1]; + sum10 = pC[0]; + sum11 = pC[1]; + } + if (broadcast_type_C == 3) + { + sum00 = pC[0]; + sum01 = pC[1]; + sum10 = pC[2]; + sum11 = pC[3]; + pC += 4; + } + if (broadcast_type_C == 4) + { + sum00 = pC[0]; + sum01 = pC[0]; + sum10 = pC[1]; + sum11 = pC[1]; + pC += 2; + } + } + } + else + { + sum00 = outptr[0]; + sum01 = outptr[1]; + sum10 = 
outptr[2]; + sum11 = outptr[3]; + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + sum00 += pA[0] * pB[0]; + sum01 += pA[1] * pB[0]; + sum10 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + + pA += 2; + pB += 2; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum00; + outptr0[1] = sum10; + outptr0[out_hstep] = sum01; + outptr0[out_hstep + 1] = sum11; + outptr0 += 2; + } + } + else + { + outptr[0] = sum00; + outptr[1] = sum01; + outptr[2] = sum10; + outptr[3] = sum11; + } + + outptr += 4; + } + for (; jj < max_jj; jj += 1) + { + float sum0; + float sum1; + + if (k == 0) + { + sum0 = 0.f; + sum1 = 0.f; + + if (pC) + { + if (broadcast_type_C == 0) + { + sum0 = pC[0]; + sum1 = pC[0]; + } + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + sum0 = pC[0]; + sum1 = pC[1]; + } + if (broadcast_type_C == 3) + { + sum0 = pC[0]; + sum1 = pC[1]; + pC += 2; + } + if (broadcast_type_C == 4) + { + sum0 = pC[0]; + sum1 = pC[0]; + pC += 1; + } + } + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum0; + outptr0[out_hstep] = sum1; + outptr0++; + } + } + else + { + outptr[0] = sum0; + outptr[1] = sum1; + } + + outptr += 2; + } + + pAT += max_kk * 2; + } + for (; ii < max_ii; ii += 1) + { + float* outptr0 = (float*)top_blob + (i + ii) * out_hstep + j; + + const float* pB = pBT; + + if (pC) + { + if (broadcast_type_C == 1 || broadcast_type_C == 2) + { + pC = (const float*)CT_tile + i + ii; + } + if (broadcast_type_C == 4) + { + pC = (const float*)CT_tile + j; + } + } + + int jj = 0; +#if __riscv_vector + for (; jj + 11 < max_jj; jj += 12) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + vfloat32m1_t _sum2; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + _sum2 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + _sum2 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 3 || broadcast_type_C == 4) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + _sum2 = vle32_v_f32m1(pC + 8, vl); + pC += 12; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4, vl); + _sum2 = vle32_v_f32m1(outptr + 8, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); + vfloat32m1_t _pB2 = vle32_v_f32m1(pB + 8, vl); + + vfloat32m1_t _pA0 = vfmv_v_f_f32m1(pA[0], vl); + + _sum0 = vfmadd_vv_f32m1(_pA0, _pB0, _sum0, vl); + _sum1 = vfmadd_vv_f32m1(_pA0, _pB1, _sum1, vl); + _sum2 = vfmadd_vv_f32m1(_pA0, _pB2, _sum2, vl); + + pA += 1; + pB += 12; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + vse32_v_f32m1(outptr0 + 8, _sum2, vl); + outptr0 += 12; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + vse32_v_f32m1(outptr + 8, _sum2, vl); + } + + outptr += 12; + } + for (; jj + 7 < max_jj; jj += 8) + { + vfloat32m1_t _sum0; + vfloat32m1_t _sum1; + + if (k == 0) + { + _sum0 = vfmv_v_f_f32m1(0.f, vl); + _sum1 = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if 
(broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum0 = vfmv_v_f_f32m1(pC[0], vl); + _sum1 = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 3 || broadcast_type_C == 4) + { + _sum0 = vle32_v_f32m1(pC, vl); + _sum1 = vle32_v_f32m1(pC + 4, vl); + pC += 8; + } + } + } + else + { + _sum0 = vle32_v_f32m1(outptr, vl); + _sum1 = vle32_v_f32m1(outptr + 4, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB0 = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pB1 = vle32_v_f32m1(pB + 4, vl); + + vfloat32m1_t _pA0 = vfmv_v_f_f32m1(pA[0], vl); + _sum0 = vfmadd_vv_f32m1(_pA0, _pB0, _sum0, vl); + _sum1 = vfmadd_vv_f32m1(_pA0, _pB1, _sum1, vl); + + pA += 1; + pB += 8; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum0, vl); + vse32_v_f32m1(outptr0 + 4, _sum1, vl); + outptr0 += 8; + } + } + else + { + vse32_v_f32m1(outptr, _sum0, vl); + vse32_v_f32m1(outptr + 4, _sum1, vl); + } + + outptr += 8; + } + for (; jj + 3 < max_jj; jj += 4) + { + vfloat32m1_t _sum; + + if (k == 0) + { + _sum = vfmv_v_f_f32m1(0.f, vl); + + if (pC) + { + if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) + { + _sum = vfmv_v_f_f32m1(pC[0], vl); + } + if (broadcast_type_C == 3 || broadcast_type_C == 4) + { + _sum = vle32_v_f32m1(pC, vl); + pC += 4; + } + } + } + else + { + _sum = vle32_v_f32m1(outptr, vl); + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + vfloat32m1_t _pB = vle32_v_f32m1(pB, vl); + vfloat32m1_t _pA = vfmv_v_f_f32m1(pA[0], vl); + + _sum = vfmadd_vv_f32m1(_pA, _pB, _sum, vl); + + pA += 1; + pB += 4; + } + + if (k_end) + { + // if (out_elempack == 1) + { + vse32_v_f32m1(outptr0, _sum, vl); + outptr0 += 4; + } + } + else + { + vse32_v_f32m1(outptr, _sum, vl); + } + + outptr += 4; + } +#endif // __riscv_vector + for (; jj + 1 < max_jj; jj += 2) + { + float sum0; + float sum1; + + if (k == 0) + { + sum0 = 0.f; + sum1 = 0.f; + + if (pC) + { + if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) + { + sum0 = pC[0]; + sum1 = pC[0]; + } + if (broadcast_type_C == 3 || broadcast_type_C == 4) + { + sum0 = pC[0]; + sum1 = pC[1]; + pC += 2; + } + } + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[0] * pB[1]; + + pA += 1; + pB += 2; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0 += 2; + } + } + else + { + outptr[0] = sum0; + outptr[1] = sum1; + } + + outptr += 2; + } + for (; jj < max_jj; jj += 1) + { + float sum; + + if (k == 0) + { + sum = 0.f; + + if (pC) + { + if (broadcast_type_C == 0 || broadcast_type_C == 1 || broadcast_type_C == 2) + { + sum = pC[0]; + } + if (broadcast_type_C == 3 || broadcast_type_C == 4) + { + sum = pC[0]; + pC += 1; + } + } + } + else + { + sum = outptr[0]; + } + + const float* pA = pAT; + int kk = 0; + for (; kk < max_kk; kk += 1) + { + sum += pA[0] * pB[0]; + pA += 1; + pB += 1; + } + + if (k_end) + { + // if (out_elempack == 1) + { + outptr0[0] = sum; + outptr0++; + } + } + else + { + outptr[0] = sum; + } + + outptr += 1; + } + + pAT += max_kk; + } +} + +static void get_optimal_tile_mnk(int M, int N, int K, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +{ + // resolve optimal tile size from cache size + const size_t l2_cache_size = get_cpu_level2_cache_size(); + 
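For intuition about the tile-size heuristic that continues below: tile_size is picked so that roughly three float tiles fit in L2 at once, then rounded down to the pack granularity. A tiny standalone sketch with hypothetical cache numbers (not measured on any particular core):

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    // Hypothetical L2 size; get_cpu_level2_cache_size() supplies the real value in ncnn.
    const size_t l2_cache_size = 512 * 1024;

    // Budget roughly three float tiles (A, B, C) inside L2 at the same time.
    int tile_size = (int)std::sqrt((double)l2_cache_size / 3 / sizeof(float)); // ~209

    int TILE_M = std::max(8, tile_size / 8 * 8); // round down to multiples of 8 -> 208
    int TILE_N = std::max(4, tile_size / 4 * 4); // round down to multiples of 4 -> 208
    int TILE_K = std::max(8, tile_size / 8 * 8); // round down to multiples of 8 -> 208

    printf("TILE_M=%d TILE_N=%d TILE_K=%d\n", TILE_M, TILE_N, TILE_K);
    return 0;
}
```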
+ if (nT == 0) + nT = get_physical_big_cpu_count(); + + int tile_size = (int)sqrtf((float)l2_cache_size / 3 / sizeof(float)); + + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + TILE_K = std::max(8, tile_size / 8 * 8); + + if (K > 0) + { + int nn_K = (K + TILE_K - 1) / TILE_K; + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); + + if (nn_K == 1) + { + tile_size = (int)((float)l2_cache_size / 2 / sizeof(float) / TILE_K); + TILE_M = std::max(8, tile_size / 8 * 8); + TILE_N = std::max(4, tile_size / 4 * 4); + } + } + + TILE_M *= std::min(nT, get_physical_cpu_count()); + + if (M > 0) + { + int nn_M = (M + TILE_M - 1) / TILE_M; + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 7) / 8 * 8); + } + + if (N > 0) + { + int nn_N = (N + TILE_N - 1) / TILE_N; + TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4); + } + + if (nT > 1) + { + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 7) / 8 * 8); + } + + // always take constant TILE_M/N/K value when provided + if (constant_TILE_M > 0) + { + TILE_M = (constant_TILE_M + 7) / 8 * 8; + } + + if (constant_TILE_N > 0) + { + TILE_N = (constant_TILE_N + 3) / 4 * 4; + } + + if (constant_TILE_K > 0) + { + TILE_K = (constant_TILE_K + 7) / 8 * 8; + } +} + +static int gemm_riscv(const Mat& A, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int transA, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +{ + const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w; + const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w; + + // NCNN_LOGE("M/N/K = %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + + get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT); + // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K); + + int nn_M = (M + TILE_M - 1) / TILE_M; + int nn_N = (N + TILE_N - 1) / TILE_N; + int nn_K = (K + TILE_K - 1) / TILE_K; + + Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); + Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + // pack B +#if TIME_TEST + gettimeofday(&start_time, NULL); +#endif + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + if (transB) + { + pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + } + else + { + transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + } + } + + Mat topT; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppi = 0; ppi < nn_M; ppi++) + { + const int i = ppi * TILE_M; + + // shadowed variable for less openmp task args + const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + const int K = transA ? (A.dims == 3 ? 
A.c : A.h) * A.elempack : A.w; + + const int max_ii = std::min((M - i), TILE_M); + + Mat topT_tile; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT_tile = topT.channel(get_omp_thread_num()); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + if (broadcast_type_C == 3) + { + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + } + + const Mat& CT_tile = broadcast_type_C == 3 ? topT_tile : C; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk); + + Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + if (j == 0) + { + if (transA) + { + transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + } + else + { + pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + } + } + + bool k_end = !output_transpose && k + TILE_K >= K; + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + } + + if (output_transpose) + { + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + } + } + } + + return 0; +} + +static int gemm_AT_riscv(const Mat& AT, const Mat& B, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int K, int transB, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +{ + const int N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w; + + // NCNN_LOGE("M/N/K = %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT); + + // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K); + + int nn_M = (M + TILE_M - 1) / TILE_M; + int nn_N = (N + TILE_N - 1) / TILE_N; + int nn_K = (K + TILE_K - 1) / TILE_K; + + Mat BT(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + // pack B + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + if (transB) + { + pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + } + else + { + transpose_pack_B_tile(B, BT_tile, j, max_jj, k, max_kk, vl); + } + } + + Mat topT; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppi = 0; ppi < nn_M; ppi++) + { + const int i = ppi * TILE_M; + + const int max_ii = std::min((M - i), TILE_M); + + Mat topT_tile; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT_tile = topT.channel(get_omp_thread_num()); + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + if (broadcast_type_C == 3) + { + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + } + + const Mat& CT_tile = broadcast_type_C == 3 ? 
topT_tile : C; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk); + Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + bool k_end = !output_transpose && k + TILE_K >= K; + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + } + + if (output_transpose) + { + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + } + } + } + + return 0; +} + +static int gemm_BT_riscv(const Mat& A, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int N, int K, int transA, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +{ + const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + + // NCNN_LOGE("M/N/K = %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT); + + // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K); + + int nn_M = (M + TILE_M - 1) / TILE_M; + // int nn_N = (N + TILE_N - 1) / TILE_N; + + Mat ATX(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, nT, 4u, opt.workspace_allocator); + + Mat topT; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppi = 0; ppi < nn_M; ppi++) + { + const int i = ppi * TILE_M; + + // shadowed variable for less openmp task args + const int M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + const int K = transA ? (A.dims == 3 ? A.c : A.h) * A.elempack : A.w; + + const int max_ii = std::min((M - i), TILE_M); + + Mat topT_tile; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT_tile = topT.channel(get_omp_thread_num()); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + if (broadcast_type_C == 3) + { + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + } + + const Mat& CT_tile = broadcast_type_C == 3 ? 
topT_tile : C; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk); + + Mat AT_tile = ATX.channel(get_omp_thread_num()).row_range(k / TILE_K, 1); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + if (j == 0) + { + if (transA) + { + transpose_pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + } + else + { + pack_A_tile(A, AT_tile, i, max_ii, k, max_kk, vl); + } + } + + bool k_end = !output_transpose && k + TILE_K >= K; + + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + } + + if (output_transpose) + { + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + } + } + } + + return 0; +} + +static int gemm_AT_BT_riscv(const Mat& AT, const Mat& BT, const Mat& C, Mat& top_blob, int broadcast_type_C, int M, int N, int K, int output_transpose, int constant_TILE_M, int constant_TILE_N, int constant_TILE_K, int nT, size_t vl, const Option& opt) +{ + // NCNN_LOGE("M/N/K = %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk(M, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, nT); + + // NCNN_LOGE("TILE M/N/K = %d %d %d", TILE_M, TILE_N, TILE_K); + + int nn_M = (M + TILE_M - 1) / TILE_M; + // int nn_N = (N + TILE_N - 1) / TILE_N; + + Mat topT; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT.create(TILE_N * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppi = 0; ppi < nn_M; ppi++) + { + const int i = ppi * TILE_M; + + const int max_ii = std::min((M - i), TILE_M); + + Mat topT_tile; + if (K > TILE_K || broadcast_type_C == 3 || output_transpose) + topT_tile = topT.channel(get_omp_thread_num()); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + if (broadcast_type_C == 3) + { + pack_A_tile(C, topT_tile, i, max_ii, j, max_jj, vl); + } + + const Mat& CT_tile = broadcast_type_C == 3 ? 
topT_tile : C; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + // NCNN_LOGE("max_ii/jj/kk = %d %d %d", max_ii, max_jj, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).row_range(k / TILE_K, 1); + + Mat BT_tile = BT.channel(j / TILE_N).row_range(k / TILE_K, 1); + + bool k_end = !output_transpose && k + TILE_K >= K; + + gemm_transB_packed_tile(AT_tile, BT_tile, CT_tile, topT_tile, top_blob, broadcast_type_C, i, max_ii, j, max_jj, k, max_kk, k_end, vl); + } + + if (output_transpose) + { + transpose_unpack_output_tile(topT_tile, top_blob, i, max_ii, j, max_jj, vl); + } + } + } + + return 0; +} + +int Gemm_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + std::vector bottom_blobs(1, bottom_blob); + std::vector top_blobs(1, top_blob); + int ret = forward(bottom_blobs, top_blobs, opt); + top_blob = top_blobs[0]; + return ret; +} + +int Gemm_riscv::create_pipeline(const Option& opt) +{ + if (constantA) + { + const int M = constantM; + const int K = constantK; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk(M, 0, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + AT_data.create(TILE_K * TILE_M, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0); + if (AT_data.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + Mat AT_tile = AT_data.channel(i / TILE_M).row_range(k / TILE_K, 1); + + if (transA) + { + transpose_pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk, vl); + } + else + { + pack_A_tile(A_data, AT_tile, i, max_ii, k, max_kk, vl); + } + } + } + + if (opt.lightmode) + { + A_data.release(); + } + } + + if (constantB) + { + const int N = constantN; + const int K = constantK; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk(0, N, K, constant_TILE_M, constant_TILE_N, constant_TILE_K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_N = (N + TILE_N - 1) / TILE_N; + + BT_data.create(TILE_K * TILE_N, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, (Allocator*)0); + if (BT_data.empty()) + return -100; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_N; ppj++) + { + const int j = ppj * TILE_N; + + for (int k = 0; k < K; k += TILE_K) + { + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat BT_tile = BT_data.channel(j / TILE_N).row_range(k / TILE_K, 1); + + if (transB) + { + pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk, vl); + } + else + { + transpose_pack_B_tile(B_data, BT_tile, j, max_jj, k, max_kk, vl); + } + } + } + + if (opt.lightmode) + { + B_data.release(); + } + } + + if (constantC && constant_broadcast_type_C != -1) + { + CT_data = C_data; + +#if __riscv_vector + if (constant_broadcast_type_C == 3 && opt.use_packing_layout) + { + int C_elempack = constantM % 4 == 0 ? 
4 : 1; + convert_packing(C_data, CT_data, C_elempack, opt); + } +#endif // __riscv_vector + + // pre-multiply C with beta + if (beta != 1.f) + { + Mat C2; + C2.create_like(CT_data); + + const int size = CT_data.total() * CT_data.elempack; + for (int i = 0; i < size; i++) + { + C2[i] = CT_data[i] * beta; + } + + CT_data = C2; + } + + if (opt.lightmode) + { + C_data.release(); + } + } + + if (constantA || constantB || constantC) + { + nT = opt.num_threads; + } + + return 0; +} + +int Gemm_riscv::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + int M; + int N; + if (constantA && constantB) + { + M = constantM; + N = constantN; + } + else if (constantA) + { + const Mat& B = bottom_blobs[0]; + M = constantM; + N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w; + } + else if (constantB) + { + const Mat& A = bottom_blobs[0]; + M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + N = constantN; + } + else + { + const Mat& A = bottom_blobs[0]; + const Mat& B = bottom_blobs[1]; + M = transA ? A.w : (A.dims == 3 ? A.c : A.h) * A.elempack; + N = transB ? (B.dims == 3 ? B.c : B.h) * B.elempack : B.w; + } + + Mat C; + int broadcast_type_C = 0; + if (constantC) + { + C = CT_data; + broadcast_type_C = constant_broadcast_type_C; + } + else + { + if (constantA && constantB) + { + C = bottom_blobs.size() == 1 ? bottom_blobs[0] : Mat(); + } + else if (constantA) + { + C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat(); + } + else if (constantB) + { + C = bottom_blobs.size() == 2 ? bottom_blobs[1] : Mat(); + } + else + { + C = bottom_blobs.size() == 3 ? bottom_blobs[2] : Mat(); + } + + if (!C.empty()) + { + if (C.dims == 1 && C.w == 1) + { + // scalar + broadcast_type_C = 0; + } + if (C.dims == 1 && C.w * C.elempack == M) + { + // M + // auto broadcast from h to w is the ncnn-style convention + broadcast_type_C = 1; + } + if (C.dims == 1 && C.w * C.elempack == N) + { + // N + broadcast_type_C = 4; + } + if (C.dims == 2 && C.w == 1 && C.h * C.elempack == M) + { + // Mx1 + broadcast_type_C = 2; + } + if (C.dims == 2 && C.w == N && C.h * C.elempack == M) + { + // MxN + broadcast_type_C = 3; + } + if (C.dims == 2 && C.w == N && C.h * C.elempack == 1) + { + // 1xN + broadcast_type_C = 4; + } + + // pre-multiply C with beta + if (beta != 1.f) + { + Mat CT_data; + CT_data.create_like(C, opt.workspace_allocator); + + const int size = C.total() * C.elempack; + for (int i = 0; i < size; i++) + { + CT_data[i] = C[i] * beta; + } + + C = CT_data; + } + } + } + + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + int outh = output_transpose ? N : M; + out_elempack = outh % 4 == 0 ? 4 : 1; + } +#endif // __riscv_vector + if (output_elempack) + out_elempack = output_elempack; + size_t out_elemsize = 4u * out_elempack; + + Mat& top_blob = top_blobs[0]; + if (output_transpose) + { + if (output_N1M) + top_blob.create(M, 1, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create(M, N / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + else + { + if (output_N1M) + top_blob.create(N, 1, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + else + top_blob.create(N, M / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + } + if (top_blob.empty()) + return -100; + + int _nT = nT ? 
nT : opt.num_threads; + if (nT != 0 && opt.num_threads != nT) + { + // force num_threads the same as in create_pipeline + // so we could use pre-packed A/B from the same tile config + NCNN_LOGE("opt.num_threads %d changed, gemm will use load-time value %d", opt.num_threads, nT); + } + + int ret = 0; + if (constantA && constantB) + { + ret = gemm_AT_BT_riscv(AT_data, BT_data, C, top_blob, broadcast_type_C, constantM, constantN, constantK, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + } + else if (constantA) + { + const Mat& B = bottom_blobs[0]; + ret = gemm_AT_riscv(AT_data, B, C, top_blob, broadcast_type_C, constantM, constantK, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + } + else if (constantB) + { + const Mat& A = bottom_blobs[0]; + ret = gemm_BT_riscv(A, BT_data, C, top_blob, broadcast_type_C, constantN, constantK, transA, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + } + else + { + const Mat& A = bottom_blobs[0]; + const Mat& B = bottom_blobs[1]; + ret = gemm_riscv(A, B, C, top_blob, broadcast_type_C, transA, transB, output_transpose, constant_TILE_M, constant_TILE_N, constant_TILE_K, _nT, vl, opt); + } + if (ret != 0) + return ret; + + // multiply top_blob with alpha + if (alpha != 1.f) + { + const int size = top_blob.total() * out_elempack; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < size; i++) + { + top_blob[i] *= alpha; + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/riscv/gemm_riscv.h b/src/layer/riscv/gemm_riscv.h new file mode 100644 index 000000000000..b92add638915 --- /dev/null +++ b/src/layer/riscv/gemm_riscv.h @@ -0,0 +1,43 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
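Stepping back from the RVV details, gemm_riscv, gemm_AT_riscv, gemm_BT_riscv and gemm_AT_BT_riscv above all share one blocked traversal: B tiles are packed up front per (N,K) block, A tiles are packed lazily on the first column tile (j == 0), and partial sums stay in a per-thread scratch tile until the final K step writes into top_blob. A condensed single-threaded C++ sketch of that traversal — plain row-major buffers, no packing, threading, broadcast C or output transpose, purely illustrative:

```cpp
#include <algorithm>
#include <vector>

// Illustrative only: same loop structure as the drivers above, on plain row-major buffers.
static void gemm_blocked(const float* A, const float* B, float* top, int M, int N, int K,
                         int TILE_M, int TILE_N, int TILE_K)
{
    std::vector<float> scratch(TILE_M * TILE_N); // stands in for the per-thread topT tile

    for (int i = 0; i < M; i += TILE_M)
    {
        const int max_ii = std::min(M - i, TILE_M);
        for (int j = 0; j < N; j += TILE_N)
        {
            const int max_jj = std::min(N - j, TILE_N);
            std::fill(scratch.begin(), scratch.end(), 0.f);

            for (int k = 0; k < K; k += TILE_K)
            {
                const int max_kk = std::min(K - k, TILE_K);
                const bool k_end = (k + TILE_K >= K); // last K step -> write to the output

                for (int ii = 0; ii < max_ii; ii++)
                {
                    for (int jj = 0; jj < max_jj; jj++)
                    {
                        float sum = scratch[ii * TILE_N + jj];
                        for (int kk = 0; kk < max_kk; kk++)
                            sum += A[(i + ii) * K + (k + kk)] * B[(k + kk) * N + (j + jj)];

                        if (k_end)
                            top[(i + ii) * N + (j + jj)] = sum;
                        else
                            scratch[ii * TILE_N + jj] = sum;
                    }
                }
            }
        }
    }
}
```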
+ +#ifndef LAYER_GEMM_RISCV_H +#define LAYER_GEMM_RISCV_H + +#include "gemm.h" + +namespace ncnn { + +class Gemm_riscv : virtual public Gemm +{ +public: + Gemm_riscv(); + + virtual int create_pipeline(const Option& opt); + + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; + + // public: + int nT; + size_t vl; + Mat AT_data; + Mat BT_data; + Mat CT_data; +}; + +} // namespace ncnn + +#endif // LAYER_GEMM_RISCV_H diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index 596bf4435c64..938d3ce39985 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -86,6 +86,282 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) return vloxei32_v_f32m8(ptr, bindex, vl); } +static inline void transpose8x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, + vfloat32m1_t& _r1l, vfloat32m1_t& _r1h, + vfloat32m1_t& _r2l, vfloat32m1_t& _r2h, + vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, + vfloat32m1_t& _r4l, vfloat32m1_t& _r4h, + vfloat32m1_t& _r5l, vfloat32m1_t& _r5h, + vfloat32m1_t& _r6l, vfloat32m1_t& _r6h, + vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, size_t vl) +{ + float tmp[8][8]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl); + vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0h, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl); + vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1h, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl); + vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2h, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl); + vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3h, vl); + vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl); + vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4h, vl); + vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl); + vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5h, vl); + vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl); + vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6h, vl); + vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl); + vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7h, vl); + float* ptr = (float*)tmp; + _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); + _r4l = vle32_v_f32m1(ptr + 8 * 4, vl); + _r4h = vle32_v_f32m1(ptr + 9 * 4, vl); + _r5l = vle32_v_f32m1(ptr + 10 * 4, vl); + _r5h = vle32_v_f32m1(ptr + 11 * 4, vl); + _r6l = vle32_v_f32m1(ptr + 12 * 4, vl); + _r6h = vle32_v_f32m1(ptr + 13 * 4, vl); + _r7l = vle32_v_f32m1(ptr + 14 * 4, vl); + _r7h = vle32_v_f32m1(ptr + 15 * 4, vl); +} + +static inline void transpose4x4_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, size_t vl) +{ + float tmp[4][4]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3, vl); + float* ptr = (float*)tmp; + _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = vle32_v_f32m1(ptr + 3 * 4, vl); +} + +static inline void transpose8x12_ps(vfloat32m1_t& 
_r0l, vfloat32m1_t& _r0h, + vfloat32m1_t& _r1l, vfloat32m1_t& _r1h, + vfloat32m1_t& _r2l, vfloat32m1_t& _r2h, + vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, + vfloat32m1_t& _r4l, vfloat32m1_t& _r4h, + vfloat32m1_t& _r5l, vfloat32m1_t& _r5h, + vfloat32m1_t& _r6l, vfloat32m1_t& _r6h, + vfloat32m1_t& _r7l, vfloat32m1_t& _r7h, + vfloat32m1_t& _r8l, vfloat32m1_t& _r8h, + vfloat32m1_t& _r9l, vfloat32m1_t& _r9h, + vfloat32m1_t& _ral, vfloat32m1_t& _rah, + vfloat32m1_t& _rbl, vfloat32m1_t& _rbh, size_t vl) +{ + float tmp[8][12]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0l, vl); + vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 12, _r0h, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1l, vl); + vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 12, _r1h, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2l, vl); + vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 12, _r2h, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3l, vl); + vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 12, _r3h, vl); + vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4l, vl); + vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 12, _r4h, vl); + vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5l, vl); + vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 12, _r5h, vl); + vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6l, vl); + vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 12, _r6h, vl); + vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7l, vl); + vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 12, _r7h, vl); + vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8l, vl); + vsse32_v_f32m1(&tmp[4][8], sizeof(float) * 12, _r8h, vl); + vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9l, vl); + vsse32_v_f32m1(&tmp[4][9], sizeof(float) * 12, _r9h, vl); + vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ral, vl); + vsse32_v_f32m1(&tmp[4][10], sizeof(float) * 12, _rah, vl); + vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rbl, vl); + vsse32_v_f32m1(&tmp[4][11], sizeof(float) * 12, _rbh, vl); + float* ptr = (float*)tmp; + _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); + _r4l = vle32_v_f32m1(ptr + 8 * 4, vl); + _r4h = vle32_v_f32m1(ptr + 9 * 4, vl); + _r5l = vle32_v_f32m1(ptr + 10 * 4, vl); + _r5h = vle32_v_f32m1(ptr + 11 * 4, vl); + _r6l = vle32_v_f32m1(ptr + 12 * 4, vl); + _r6h = vle32_v_f32m1(ptr + 13 * 4, vl); + _r7l = vle32_v_f32m1(ptr + 14 * 4, vl); + _r7h = vle32_v_f32m1(ptr + 15 * 4, vl); + _r8l = vle32_v_f32m1(ptr + 16 * 4, vl); + _r8h = vle32_v_f32m1(ptr + 17 * 4, vl); + _r9l = vle32_v_f32m1(ptr + 18 * 4, vl); + _r9h = vle32_v_f32m1(ptr + 19 * 4, vl); + _ral = vle32_v_f32m1(ptr + 20 * 4, vl); + _rah = vle32_v_f32m1(ptr + 21 * 4, vl); + _rbl = vle32_v_f32m1(ptr + 22 * 4, vl); + _rbh = vle32_v_f32m1(ptr + 23 * 4, vl); +} + +static inline void transpose12x8_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0m, vfloat32m1_t& _r0h, + vfloat32m1_t& _r1l, vfloat32m1_t& _r1m, vfloat32m1_t& _r1h, + vfloat32m1_t& _r2l, vfloat32m1_t& _r2m, vfloat32m1_t& _r2h, + vfloat32m1_t& _r3l, vfloat32m1_t& _r3m, vfloat32m1_t& _r3h, + vfloat32m1_t& _r4l, vfloat32m1_t& _r4m, vfloat32m1_t& _r4h, + vfloat32m1_t& _r5l, vfloat32m1_t& _r5m, vfloat32m1_t& _r5h, + vfloat32m1_t& _r6l, vfloat32m1_t& _r6m, vfloat32m1_t& _r6h, + vfloat32m1_t& _r7l, vfloat32m1_t& _r7m, vfloat32m1_t& _r7h, size_t vl) +{ + float 
tmp[12][8]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0l, vl); + vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 8, _r0m, vl); + vsse32_v_f32m1(&tmp[8][0], sizeof(float) * 8, _r0h, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1l, vl); + vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 8, _r1m, vl); + vsse32_v_f32m1(&tmp[8][1], sizeof(float) * 8, _r1h, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2l, vl); + vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 8, _r2m, vl); + vsse32_v_f32m1(&tmp[8][2], sizeof(float) * 8, _r2h, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3l, vl); + vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 8, _r3m, vl); + vsse32_v_f32m1(&tmp[8][3], sizeof(float) * 8, _r3h, vl); + vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4l, vl); + vsse32_v_f32m1(&tmp[4][4], sizeof(float) * 8, _r4m, vl); + vsse32_v_f32m1(&tmp[8][4], sizeof(float) * 8, _r4h, vl); + vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5l, vl); + vsse32_v_f32m1(&tmp[4][5], sizeof(float) * 8, _r5m, vl); + vsse32_v_f32m1(&tmp[8][5], sizeof(float) * 8, _r5h, vl); + vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6l, vl); + vsse32_v_f32m1(&tmp[4][6], sizeof(float) * 8, _r6m, vl); + vsse32_v_f32m1(&tmp[8][6], sizeof(float) * 8, _r6h, vl); + vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7l, vl); + vsse32_v_f32m1(&tmp[4][7], sizeof(float) * 8, _r7m, vl); + vsse32_v_f32m1(&tmp[8][7], sizeof(float) * 8, _r7h, vl); + float* ptr = (float*)tmp; + _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); + _r0m = vle32_v_f32m1(ptr + 1 * 4, vl); + _r0h = vle32_v_f32m1(ptr + 2 * 4, vl); + _r1l = vle32_v_f32m1(ptr + 3 * 4, vl); + _r1m = vle32_v_f32m1(ptr + 4 * 4, vl); + _r1h = vle32_v_f32m1(ptr + 5 * 4, vl); + _r2l = vle32_v_f32m1(ptr + 6 * 4, vl); + _r2m = vle32_v_f32m1(ptr + 7 * 4, vl); + _r2h = vle32_v_f32m1(ptr + 8 * 4, vl); + _r3l = vle32_v_f32m1(ptr + 9 * 4, vl); + _r3m = vle32_v_f32m1(ptr + 10 * 4, vl); + _r3h = vle32_v_f32m1(ptr + 11 * 4, vl); + _r4l = vle32_v_f32m1(ptr + 12 * 4, vl); + _r4m = vle32_v_f32m1(ptr + 13 * 4, vl); + _r4h = vle32_v_f32m1(ptr + 14 * 4, vl); + _r5l = vle32_v_f32m1(ptr + 15 * 4, vl); + _r5m = vle32_v_f32m1(ptr + 16 * 4, vl); + _r5h = vle32_v_f32m1(ptr + 17 * 4, vl); + _r6l = vle32_v_f32m1(ptr + 18 * 4, vl); + _r6m = vle32_v_f32m1(ptr + 19 * 4, vl); + _r6h = vle32_v_f32m1(ptr + 20 * 4, vl); + _r7l = vle32_v_f32m1(ptr + 21 * 4, vl); + _r7m = vle32_v_f32m1(ptr + 22 * 4, vl); + _r7h = vle32_v_f32m1(ptr + 23 * 4, vl); +} + +static inline void transpose4x8_ps(vfloat32m1_t& _r0, vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, size_t vl) +{ + float tmp[4][8]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 8, _r0, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 8, _r1, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 8, _r2, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 8, _r3, vl); + vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 8, _r4, vl); + vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 8, _r5, vl); + vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 8, _r6, vl); + vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 8, _r7, vl); + float* ptr = (float*)tmp; + _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = vle32_v_f32m1(ptr + 3 * 4, vl); + _r4 = vle32_v_f32m1(ptr + 4 * 4, vl); + _r5 = vle32_v_f32m1(ptr + 5 * 4, vl); + _r6 = vle32_v_f32m1(ptr + 6 * 4, vl); + _r7 = vle32_v_f32m1(ptr + 7 * 4, vl); +} + +static inline void transpose4x12_ps(vfloat32m1_t& _r0,
vfloat32m1_t& _r1, vfloat32m1_t& _r2, vfloat32m1_t& _r3, vfloat32m1_t& _r4, vfloat32m1_t& _r5, vfloat32m1_t& _r6, vfloat32m1_t& _r7, vfloat32m1_t& _r8, vfloat32m1_t& _r9, vfloat32m1_t& _ra, vfloat32m1_t& _rb, size_t vl) +{ + float tmp[4][12]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 12, _r0, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 12, _r1, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 12, _r2, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 12, _r3, vl); + vsse32_v_f32m1(&tmp[0][4], sizeof(float) * 12, _r4, vl); + vsse32_v_f32m1(&tmp[0][5], sizeof(float) * 12, _r5, vl); + vsse32_v_f32m1(&tmp[0][6], sizeof(float) * 12, _r6, vl); + vsse32_v_f32m1(&tmp[0][7], sizeof(float) * 12, _r7, vl); + vsse32_v_f32m1(&tmp[0][8], sizeof(float) * 12, _r8, vl); + vsse32_v_f32m1(&tmp[0][9], sizeof(float) * 12, _r9, vl); + vsse32_v_f32m1(&tmp[0][10], sizeof(float) * 12, _ra, vl); + vsse32_v_f32m1(&tmp[0][11], sizeof(float) * 12, _rb, vl); + float* ptr = (float*)tmp; + _r0 = vle32_v_f32m1(ptr + 0 * 4, vl); + _r1 = vle32_v_f32m1(ptr + 1 * 4, vl); + _r2 = vle32_v_f32m1(ptr + 2 * 4, vl); + _r3 = vle32_v_f32m1(ptr + 3 * 4, vl); + _r4 = vle32_v_f32m1(ptr + 4 * 4, vl); + _r5 = vle32_v_f32m1(ptr + 5 * 4, vl); + _r6 = vle32_v_f32m1(ptr + 6 * 4, vl); + _r7 = vle32_v_f32m1(ptr + 7 * 4, vl); + _r8 = vle32_v_f32m1(ptr + 8 * 4, vl); + _r9 = vle32_v_f32m1(ptr + 9 * 4, vl); + _ra = vle32_v_f32m1(ptr + 10 * 4, vl); + _rb = vle32_v_f32m1(ptr + 11 * 4, vl); +} + +static inline void transpose8x4_ps(vfloat32m1_t& _r0l, vfloat32m1_t& _r0h, + vfloat32m1_t& _r1l, vfloat32m1_t& _r1h, + vfloat32m1_t& _r2l, vfloat32m1_t& _r2h, + vfloat32m1_t& _r3l, vfloat32m1_t& _r3h, size_t vl) +{ + float tmp[8][4]; + vsse32_v_f32m1(&tmp[0][0], sizeof(float) * 4, _r0l, vl); + vsse32_v_f32m1(&tmp[4][0], sizeof(float) * 4, _r0h, vl); + vsse32_v_f32m1(&tmp[0][1], sizeof(float) * 4, _r1l, vl); + vsse32_v_f32m1(&tmp[4][1], sizeof(float) * 4, _r1h, vl); + vsse32_v_f32m1(&tmp[0][2], sizeof(float) * 4, _r2l, vl); + vsse32_v_f32m1(&tmp[4][2], sizeof(float) * 4, _r2h, vl); + vsse32_v_f32m1(&tmp[0][3], sizeof(float) * 4, _r3l, vl); + vsse32_v_f32m1(&tmp[4][3], sizeof(float) * 4, _r3h, vl); + float* ptr = (float*)tmp; + _r0l = vle32_v_f32m1(ptr + 0 * 4, vl); + _r0h = vle32_v_f32m1(ptr + 1 * 4, vl); + _r1l = vle32_v_f32m1(ptr + 2 * 4, vl); + _r1h = vle32_v_f32m1(ptr + 3 * 4, vl); + _r2l = vle32_v_f32m1(ptr + 4 * 4, vl); + _r2h = vle32_v_f32m1(ptr + 5 * 4, vl); + _r3l = vle32_v_f32m1(ptr + 6 * 4, vl); + _r3h = vle32_v_f32m1(ptr + 7 * 4, vl); +} + +static inline void store_float_v2(vfloat32m1_t& vector1, vfloat32m1_t& vector2, float* buf, size_t vl) +{ + vsse32_v_f32m1(buf + 0, sizeof(float) * 2, vector1, vl); + vsse32_v_f32m1(buf + 1, sizeof(float) * 2, vector2, vl); +} + +static inline void store_float_v4(vfloat32m1_t& vector1, vfloat32m1_t& vector2, vfloat32m1_t& vector3, vfloat32m1_t& vector4, float* buf, size_t vl) +{ + vsse32_v_f32m1(buf + 0, sizeof(float) * 4, vector1, vl); + vsse32_v_f32m1(buf + 1, sizeof(float) * 4, vector2, vl); + vsse32_v_f32m1(buf + 2, sizeof(float) * 4, vector3, vl); + vsse32_v_f32m1(buf + 3, sizeof(float) * 4, vector4, vl); +} + #if __riscv_zfh static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) { diff --git a/src/layer/vulkan/convolution1d_vulkan.cpp b/src/layer/vulkan/convolution1d_vulkan.cpp new file mode 100644 index 000000000000..f9d9135ceb90 --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.cpp @@ -0,0 +1,423 @@ +// Tencent is pleased to support the open source community by making ncnn 
available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "convolution1d_vulkan.h" + +#include "layer_shader_type.h" +#include "layer_type.h" + +namespace ncnn { + +Convolution1D_vulkan::Convolution1D_vulkan() +{ + support_vulkan = true; + support_image_storage = true; + + padding = 0; + + pipeline_convolution1d = 0; +} + +int Convolution1D_vulkan::create_pipeline(const Option& _opt) +{ + if (dynamic_weight) + { + support_vulkan = false; + support_image_storage = false; + return 0; + } + + Option opt = _opt; + + const int maxk = kernel_w; + int num_input = weight_data_size / maxk / num_output; + + int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + { + padding = ncnn::create_layer(ncnn::LayerType::Padding); + padding->vkdev = vkdev; + + ncnn::ParamDict pd; + pd.set(0, 0); + pd.set(1, 0); + pd.set(2, pad_left); + pd.set(3, pad_right); + pd.set(4, 0); + pd.set(5, pad_value); + + padding->load_param(pd); + + padding->create_pipeline(opt); + } + + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.channel(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + const Mat k0 = weight_data_r2.channel(q + i); + + for (int j = 0; j < elempack; j++) + { + const float* k00 = k0.row(p + j); + g00[0] = k00[k]; + g00++; + } + } + } + } + } + } + + if (bias_term) + { + convert_packing(bias_data, bias_data_packed, out_elempack, opt); + } + + { + std::vector specializations(7 + 10); + specializations[0].i = kernel_w; + specializations[1].i = dilation_w; + specializations[2].i = stride_w; + specializations[3].i = bias_term; + specializations[4].i = activation_type; + specializations[5].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[6].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; + specializations[7 + 0].i = 0; + specializations[7 + 1].i = 0; + specializations[7 + 2].i = 0; + specializations[7 + 3].i = 0; + specializations[7 + 4].i = 0; + specializations[7 + 5].i = 0; + specializations[7 + 6].i = 0; + specializations[7 + 7].i = 0; + specializations[7 + 8].i = 0; + specializations[7 + 9].i = 0; + + int shader_type_index = -1; + if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack4; + if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack1to4; + if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack4to1; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack8; + if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack1to8; + if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution1d_pack8to1; + if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution1d_pack4to8; + if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution1d_pack8to4; + + pipeline_convolution1d = new Pipeline(vkdev); + pipeline_convolution1d->set_optimal_local_size_xyz(1, 1, 1); + pipeline_convolution1d->create(shader_type_index, opt, specializations); + } + + return 0; +} + +int Convolution1D_vulkan::destroy_pipeline(const Option& opt) +{ + if (padding) + { + padding->destroy_pipeline(opt); + delete padding; + padding = 0; + } + + delete pipeline_convolution1d; + pipeline_convolution1d = 0; + + return 0; +} + +int Convolution1D_vulkan::upload_model(VkTransfer& cmd, const Option& opt) +{ + if (padding) + { + padding->upload_model(cmd, opt); + } + + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(weight_data_packed, weight_data_gpu_image, opt); + } + else + { + cmd.record_upload(weight_data_packed, weight_data_gpu, opt); + } + + weight_data_packed.release(); + + if (bias_term) + { + if (support_image_storage && opt.use_image_storage) + { + cmd.record_upload(bias_data_packed, bias_data_gpu_image, opt); + } + else + { + cmd.record_upload(bias_data_packed, bias_data_gpu, opt); + } + + bias_data_packed.release(); + } + + return 0; +} + +int Convolution1D_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + VkMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + std::vector padding_inputs(2); + 
padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +int Convolution1D_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + + VkImageMat bottom_blob_bordered = bottom_blob; + if (pad_left > 0 || pad_right > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + padding->forward(bottom_blob, bottom_blob_bordered, cmd, opt_pad); + } + else if (pad_left == -233 && pad_right == -233) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad / 2; + padding_params[3] = wpad - wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + 
std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + else if (pad_left == -234 && pad_right == -234) + { + int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w; + if (wpad > 0) + { + Option opt_pad = opt; + opt_pad.blob_vkallocator = opt.workspace_vkallocator; + + VkImageMat padding_param_blob(6, (size_t)4u, 1, opt.staging_vkallocator); + int* padding_params = padding_param_blob.mapped(); + + padding_params[0] = 0; + padding_params[1] = 0; + padding_params[2] = wpad - wpad / 2; + padding_params[3] = wpad / 2; + padding_params[4] = 0; + padding_params[5] = 0; + + std::vector padding_inputs(2); + padding_inputs[0] = bottom_blob; + padding_inputs[1] = padding_param_blob; + + std::vector padding_outputs(1); + padding->forward(padding_inputs, padding_outputs, cmd, opt_pad); + bottom_blob_bordered = padding_outputs[0]; + } + } + + int outw = (bottom_blob_bordered.w - kernel_extent_w) / stride_w + 1; + + int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + + size_t out_elemsize = elemsize / elempack * out_elempack; + + if (opt.use_fp16_packed && !opt.use_fp16_storage) + { + if (out_elempack == 8) out_elemsize = 8 * 2u; + if (out_elempack == 4) out_elemsize = 4 * 2u; + if (out_elempack == 1) out_elemsize = 4u; + } + + top_blob.create(outw, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image; + + std::vector constants(10); + constants[0].i = bottom_blob_bordered.dims; + constants[1].i = bottom_blob_bordered.w; + constants[2].i = bottom_blob_bordered.h; + constants[3].i = bottom_blob_bordered.c; + constants[4].i = 0; //bottom_blob_bordered.cstep; + constants[5].i = top_blob.dims; + constants[6].i = top_blob.w; + constants[7].i = top_blob.h; + constants[8].i = top_blob.c; + constants[9].i = 0; //top_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; + + cmd.record_pipeline(pipeline_convolution1d, bindings, constants, dispatcher); + + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/vulkan/convolution1d_vulkan.h b/src/layer/vulkan/convolution1d_vulkan.h new file mode 100644 index 000000000000..4fb22040daa2 --- /dev/null +++ b/src/layer/vulkan/convolution1d_vulkan.h @@ -0,0 +1,53 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
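Both forward() overloads above repeat the same output-packing decision: out_elempack is 8/4/1 depending on num_output divisibility (8 only with use_shader_pack8), and the element size drops to 2 bytes per lane when fp16 packing is enabled without fp16 storage. A standalone sketch of that decision — the helper name and struct below are illustrative, not ncnn API:

```cpp
#include <cstddef>
#include <cstdio>

struct PackChoice
{
    int out_elempack;
    size_t out_elemsize;
};

// Illustrative helper mirroring the selection done inline in forward() above.
static PackChoice choose_output_packing(int num_output, size_t elemsize, int elempack,
                                        bool use_shader_pack8, bool use_fp16_packed, bool use_fp16_storage)
{
    PackChoice c;
    c.out_elempack = (use_shader_pack8 && num_output % 8 == 0) ? 8 : (num_output % 4 == 0) ? 4 : 1;
    c.out_elemsize = elemsize / elempack * c.out_elempack;

    if (use_fp16_packed && !use_fp16_storage)
    {
        // packed fp16 halves the per-lane size; the unpacked case stays fp32
        if (c.out_elempack == 8) c.out_elemsize = 8 * 2u;
        if (c.out_elempack == 4) c.out_elemsize = 4 * 2u;
        if (c.out_elempack == 1) c.out_elemsize = 4u;
    }
    return c;
}

int main()
{
    PackChoice c = choose_output_packing(/*num_output*/ 24, /*elemsize*/ 4u, /*elempack*/ 1,
                                         /*pack8*/ false, /*fp16_packed*/ true, /*fp16_storage*/ false);
    printf("out_elempack=%d out_elemsize=%zu\n", c.out_elempack, c.out_elemsize); // 4 and 8
    return 0;
}
```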
+
+#ifndef LAYER_CONVOLUTION1D_VULKAN_H
+#define LAYER_CONVOLUTION1D_VULKAN_H
+
+#include "convolution1d.h"
+
+namespace ncnn {
+
+class Convolution1D_vulkan : virtual public Convolution1D
+{
+public:
+    Convolution1D_vulkan();
+
+    virtual int create_pipeline(const Option& opt);
+    virtual int destroy_pipeline(const Option& opt);
+
+    virtual int upload_model(VkTransfer& cmd, const Option& opt);
+
+    using Convolution1D::forward;
+    virtual int forward(const VkMat& bottom_blob, VkMat& top_blob, VkCompute& cmd, const Option& opt) const;
+    virtual int forward(const VkImageMat& bottom_blob, VkImageMat& top_blob, VkCompute& cmd, const Option& opt) const;
+
+public:
+    ncnn::Layer* padding;
+
+    Mat weight_data_packed;
+    Mat bias_data_packed;
+
+    VkMat weight_data_gpu;
+    VkMat bias_data_gpu;
+
+    VkImageMat weight_data_gpu_image;
+    VkImageMat bias_data_gpu_image;
+
+    Pipeline* pipeline_convolution1d;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_CONVOLUTION1D_VULKAN_H
diff --git a/src/layer/vulkan/shader/convolution1d.comp b/src/layer/vulkan/shader/convolution1d.comp
new file mode 100644
index 000000000000..3403a50f1939
--- /dev/null
+++ b/src/layer/vulkan/shader/convolution1d.comp
@@ -0,0 +1,177 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
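+//
+// convolution1d.comp: pack1 input, pack1 output. Each invocation computes a 2x2 output tile
+// (two output positions x two output channels), accumulating over all input channels and kernel_w taps.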
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afp k0 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.x)); + afp k1 = image3d_ld1(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = 
buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afp k0 = buffer_ld1(weight_data, w_offset.x + x); + afp k1 = buffer_ld1(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to4.comp b/src/layer/vulkan/shader/convolution1d_pack1to4.comp new file mode 100644 index 000000000000..98e6fadd3c18 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to4.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
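+//
+// convolution1d_pack1to4.comp: pack1 input, pack4 output. Each scalar input sample is multiplied
+// by a vec4 weight per tap, updating four output channels at once.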
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; 
x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack1to8.comp b/src/layer/vulkan/shader/convolution1d_pack1to8.comp new file mode 100644 index 000000000000..c32bc2114e58 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack1to8.comp @@ -0,0 +1,186 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
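+//
+// convolution1d_pack1to8.comp: pack1 input, pack8 output. Each weight tap holds two vec4 halves,
+// so one scalar input sample updates eight output channels at a time.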
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afp v0 = image3d_ld1(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afp v1 = image3d_ld1(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + 
sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afp v0 = buffer_ld1(bottom_blob_data, v_offset.x + x * dilation_w); + afp v1 = buffer_ld1(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0[0] += v0 * k0[0]; + sum0[1] += v0 * k0[1]; + sum1[0] += v1 * k0[0]; + sum1[1] += v1 * k0[1]; + sum2[0] += v0 * k1[0]; + sum2[1] += v0 * k1[1]; + sum3[0] += v1 * k1[0]; + sum3[1] += v1 * k1[1]; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4.comp b/src/layer/vulkan/shader/convolution1d_pack4.comp new file mode 100644 index 000000000000..f1e125867852 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4.comp @@ -0,0 +1,208 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
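+//
+// convolution1d_pack4.comp: pack4 input, pack4 output. Each weight tap is a 4x4 matrix and the
+// packed input vector is multiplied by it, producing four output channels per accumulator.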
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else +layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpmat4 k0 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)), + 
image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)) + ); + afpmat4 k1 = afpmat4( + image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)), + image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)) + ); + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + +#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k0 = afpmat4( + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.x + x) * 4 + 3) + ); + afpmat4 k1 = afpmat4( + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 0), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 1), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 2), + buffer_ld4(weight_data, (w_offset.y + x) * 4 + 3) + ); +#else + afpmat4 k0 = sfp2afpmat4(weight_data[w_offset.x + x]); + afpmat4 k1 = sfp2afpmat4(weight_data[w_offset.y + x]); +#endif + + sum0 += v0 * k0; + sum1 += v1 * k0; + sum2 += v0 * k1; + sum3 += v1 * k1; + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to1.comp b/src/layer/vulkan/shader/convolution1d_pack4to1.comp new file mode 100644 index 000000000000..1f5c87e1835b --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to1.comp @@ -0,0 +1,177 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 
w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset.x + x); + afpvec4 k1 = buffer_ld4(weight_data, w_offset.y + x); + + sum0 += dot(v0, k0); + sum1 += dot(v1, k0); + sum2 += dot(v0, k1); + sum3 += dot(v1, k1); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack4to8.comp b/src/layer/vulkan/shader/convolution1d_pack4to8.comp new file mode 100644 index 000000000000..1133b097ac09 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack4to8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
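+//
+// convolution1d_pack4to8.comp: pack4 input, pack8 output. Each tap stores eight vec4 weight rows,
+// reduced against the packed input with dot products.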
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = image3d_ld4(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec4 v1 = image3d_ld4(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec4 k3 = 
image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec4 k8 = image3d_ld4(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec4 k9 = image3d_ld4(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec4 ka = image3d_ld4(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec4 kb = image3d_ld4(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec4 kc = image3d_ld4(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec4 kd = image3d_ld4(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec4 ke = image3d_ld4(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec4 kf = image3d_ld4(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec4 v1 = buffer_ld4(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec4 k0 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 0); + afpvec4 k1 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 1); + afpvec4 k2 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 2); + afpvec4 k3 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 3); + afpvec4 k4 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 4); + afpvec4 k5 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 5); + afpvec4 k6 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 6); + afpvec4 k7 = buffer_ld4(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec4 k8 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 0); + afpvec4 k9 = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 1); + afpvec4 ka = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 2); + afpvec4 kb = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 3); + afpvec4 kc = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 4); + afpvec4 kd = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 5); + afpvec4 ke = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 6); + afpvec4 kf = buffer_ld4(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + 
sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v0, k8); + sum2[0].g += dot(v0, k9); + sum2[0].b += dot(v0, ka); + sum2[0].a += dot(v0, kb); + sum2[1].r += dot(v0, kc); + sum2[1].g += dot(v0, kd); + sum2[1].b += dot(v0, ke); + sum2[1].a += dot(v0, kf); + + sum3[0].r += dot(v1, k8); + sum3[0].g += dot(v1, k9); + sum3[0].b += dot(v1, ka); + sum3[0].a += dot(v1, kb); + sum3[1].r += dot(v1, kc); + sum3[1].g += dot(v1, kd); + sum3[1].b += dot(v1, ke); + sum3[1].a += dot(v1, kf); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8.comp b/src/layer/vulkan/shader/convolution1d_pack8.comp new file mode 100644 index 000000000000..fff72ade8290 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8.comp @@ -0,0 +1,270 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
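+//
+// convolution1d_pack8.comp: pack8 input, pack8 output. Each tap stores eight vec8 weight rows,
+// reduced with a pair of dot products per output channel.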
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec8 sum0 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum1 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum2 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + afpvec8 sum3 = afpvec8(afpvec4(0.0f), afpvec4(0.0f)); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld8(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld8(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld8(bias_data, gy2.x); + sum2 = buffer_ld8(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = 
image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.x)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.x)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.x)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.x)); + + afpvec8 k8 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.y)); + afpvec8 k9 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 ka = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 kb = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + afpvec8 kc = image3d_ld8(weight_blob, ivec3(wx + 4, y, gy2.y)); + afpvec8 kd = image3d_ld8(weight_blob, ivec3(wx + 5, y, gy2.y)); + afpvec8 ke = image3d_ld8(weight_blob, ivec3(wx + 6, y, gy2.y)); + afpvec8 kf = image3d_ld8(weight_blob, ivec3(wx + 7, y, gy2.y)); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + + wx += 8; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 4); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 5); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 6); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.x + x) * 8 + 7); + + afpvec8 k8 = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 0); + afpvec8 k9 = 
buffer_ld8(weight_data, (w_offset.y + x) * 8 + 1); + afpvec8 ka = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 2); + afpvec8 kb = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 3); + afpvec8 kc = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 4); + afpvec8 kd = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 5); + afpvec8 ke = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 6); + afpvec8 kf = buffer_ld8(weight_data, (w_offset.y + x) * 8 + 7); + + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v0[0], k8[0]) + dot(v0[1], k8[1]); + sum2[0].g += dot(v0[0], k9[0]) + dot(v0[1], k9[1]); + sum2[0].b += dot(v0[0], ka[0]) + dot(v0[1], ka[1]); + sum2[0].a += dot(v0[0], kb[0]) + dot(v0[1], kb[1]); + sum2[1].r += dot(v0[0], kc[0]) + dot(v0[1], kc[1]); + sum2[1].g += dot(v0[0], kd[0]) + dot(v0[1], kd[1]); + sum2[1].b += dot(v0[0], ke[0]) + dot(v0[1], ke[1]); + sum2[1].a += dot(v0[0], kf[0]) + dot(v0[1], kf[1]); + + sum3[0].r += dot(v1[0], k8[0]) + dot(v1[1], k8[1]); + sum3[0].g += dot(v1[0], k9[0]) + dot(v1[1], k9[1]); + sum3[0].b += dot(v1[0], ka[0]) + dot(v1[1], ka[1]); + sum3[0].a += dot(v1[0], kb[0]) + dot(v1[1], kb[1]); + sum3[1].r += dot(v1[0], kc[0]) + dot(v1[1], kc[1]); + sum3[1].g += dot(v1[0], kd[0]) + dot(v1[1], kd[1]); + sum3[1].b += dot(v1[0], ke[0]) + dot(v1[1], ke[1]); + sum3[1].a += dot(v1[0], kf[0]) + dot(v1[1], kf[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st8(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st8(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st8(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st8(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st8(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st8(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st8(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to1.comp b/src/layer/vulkan/shader/convolution1d_pack8to1.comp new file mode 100644 index 000000000000..9d08d3b11af7 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to1.comp @@ -0,0 +1,178 @@ +// Tencent is pleased to support the open source 
community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc1) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afp sum0 = afp(0.0f); + afp sum1 = afp(0.0f); + afp sum2 = afp(0.0f); + afp sum3 = afp(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld1(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld1(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld1(bias_data, gy2.x); + sum2 = buffer_ld1(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if 
NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx, y, gy2.y)); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + + wx += 1; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset.x + x); + afpvec8 k1 = buffer_ld8(weight_data, w_offset.y + x); + + sum0 += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum1 += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum2 += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum3 += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st1(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st1(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st1(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st1(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st1(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st1(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st1(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/vulkan/shader/convolution1d_pack8to4.comp b/src/layer/vulkan/shader/convolution1d_pack8to4.comp new file mode 100644 index 000000000000..86ca696d5840 --- /dev/null +++ b/src/layer/vulkan/shader/convolution1d_pack8to4.comp @@ -0,0 +1,220 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
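+//
+// convolution1d pack8to4: each invocation computes a 2x2 output tile,
+// covering two adjacent output positions (gx, gx+1) and two adjacent
+// pack4 output channel groups (gy, gy+1), from pack8 input and weights.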
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#extension GL_GOOGLE_include_directive: enable +#include "vulkan_activation.comp" + +layout (constant_id = 0) const int kernel_w = 1; +layout (constant_id = 1) const int dilation_w = 1; +layout (constant_id = 2) const int stride_w = 1; +layout (constant_id = 3) const int bias_term = 0; +layout (constant_id = 4) const int activation_type = 0; +layout (constant_id = 5) const float activation_param_0 = 0; +layout (constant_id = 6) const float activation_param_1 = 0; + +#define shape_constant_id_offset 7 +layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; +layout (constant_id = shape_constant_id_offset + 1) const int w = 0; +layout (constant_id = shape_constant_id_offset + 2) const int h = 0; +layout (constant_id = shape_constant_id_offset + 3) const int c = 0; +layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_blob; +layout (binding = 2) uniform unfp sampler3D weight_blob; +layout (binding = 3) uniform unfp sampler3D bias_blob; +#else +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; +layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 2; + int gy = int(gl_GlobalInvocationID.y) * 2; + + if (gx >= psc(outw) || gy >= psc(outh)) + return; + + const ivec2 gx2 = gx + ivec2(0, 1); + const ivec2 gy2 = gy + ivec2(0, 1); + + afpvec4 sum0 = afpvec4(0.0f); + afpvec4 sum1 = afpvec4(0.0f); + afpvec4 sum2 = afpvec4(0.0f); + afpvec4 sum3 = afpvec4(0.0f); + + if (bias_term == 1) + { +#if NCNN_image_shader + sum0 = image3d_ld4(bias_blob, ivec3(gy2.x, 0, 0)); + sum2 = image3d_ld4(bias_blob, ivec3(gy2.y, 0, 0)); +#else + sum0 = buffer_ld4(bias_data, gy2.x); + sum2 = buffer_ld4(bias_data, gy2.y); +#endif + sum1 = sum0; + sum3 = sum2; + } + +#if NCNN_image_shader + + ivec2 v_offset = gx2 * stride_w; + + for (int y = 0; y < psc(h); y++) + { + int wx = 0; + + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = image3d_ld8(bottom_blob, ivec3(v_offset.x + x * dilation_w, y, 0)); + afpvec8 v1 = image3d_ld8(bottom_blob, ivec3(v_offset.y + x * dilation_w, y, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(wx + 0, y, gy2.x)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.x)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.x)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.x)); + afpvec8 k4 = image3d_ld8(weight_blob, 
ivec3(wx + 0, y, gy2.y)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(wx + 1, y, gy2.y)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(wx + 2, y, gy2.y)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(wx + 3, y, gy2.y)); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + wx += 4; + } + } + +#else + + ivec2 v_offset = gx2 * stride_w; + ivec2 w_offset = gy2 * psc(h) * kernel_w; + + for (int y = 0; y < psc(h); y++) + { + for (int x = 0; x < kernel_w; x++) + { + afpvec8 v0 = buffer_ld8(bottom_blob_data, v_offset.x + x * dilation_w); + afpvec8 v1 = buffer_ld8(bottom_blob_data, v_offset.y + x * dilation_w); + + afpvec8 k0 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 0); + afpvec8 k1 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 1); + afpvec8 k2 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 2); + afpvec8 k3 = buffer_ld8(weight_data, (w_offset.x + x) * 4 + 3); + afpvec8 k4 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 0); + afpvec8 k5 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 1); + afpvec8 k6 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 2); + afpvec8 k7 = buffer_ld8(weight_data, (w_offset.y + x) * 4 + 3); + + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum2.g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum2.b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum2.a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum3.r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum3.g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum3.b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum3.a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + } + v_offset += psc(w); + w_offset += kernel_w; + } + +#endif + + sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); + sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); + sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1); + sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1); + +#if NCNN_image_shader + + image3d_st4(top_blob, ivec3(gx2.x, gy2.x, 0), sum0); + image3d_st4(top_blob, ivec3(gx2.y, gy2.x, 0), sum1); + image3d_st4(top_blob, ivec3(gx2.x, gy2.y, 0), sum2); + image3d_st4(top_blob, ivec3(gx2.y, gy2.y, 0), sum3); + +#else + + const int gi = gy * psc(outw) + gx; + + buffer_st4(top_blob_data, gi, sum0); + if (gx + 1 < psc(outw)) 
buffer_st4(top_blob_data, gi + 1, sum1); + if (gy + 1 < psc(outh)) buffer_st4(top_blob_data, gi + psc(outw), sum2); + if (gy + 1 < psc(outh) && gx + 1 < psc(outw)) buffer_st4(top_blob_data, gi + psc(outw) + 1, sum3); + +#endif +} \ No newline at end of file diff --git a/src/layer/x86/convolution_3x3_int8.h b/src/layer/x86/convolution_3x3_int8.h index a5c5dfe4e71d..ceaf75b92e1f 100644 --- a/src/layer/x86/convolution_3x3_int8.h +++ b/src/layer/x86/convolution_3x3_int8.h @@ -78,833 +78,6 @@ static void conv3x3s1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& } } -static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) -{ - kernel_tm.create(4 * 4, inch, outch, (size_t)2u); - - // G - const short ktm[4][3] = { - {2, 0, 0}, - {1, 1, 1}, - {1, -1, 1}, - {0, 0, 2} - }; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - for (int q = 0; q < inch; q++) - { - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; - short* kernel_tm0 = kernel_tm.channel(p).row(q); - - // transform kernel - const signed char* k0 = kernel0; - const signed char* k1 = kernel0 + 3; - const signed char* k2 = kernel0 + 6; - - // h - short tmp[4][3]; - for (int i = 0; i < 4; i++) - { - tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; - } - - // U - for (int j = 0; j < 4; j++) - { - short* tmpp = &tmp[j][0]; - - for (int i = 0; i < 4; i++) - { - kernel_tm0[j * 4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } -} - -static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int inch = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - // pad to 2n+2, winograd F(2,3) - Mat bottom_blob_bordered = bottom_blob; - - outw = (outw + 1) / 2 * 2; - outh = (outh + 1) / 2 * 2; - - w = outw + 2; - h = outh + 2; - Option opt_b = opt; - opt_b.blob_allocator = opt.workspace_allocator; - copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); - - // BEGIN transform input - Mat bottom_blob_tm; - { - int w_tm = outw / 2 * 4; - int h_tm = outh / 2 * 4; - - int nColBlocks = h_tm / 4; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 4; - - const int tiles = nColBlocks * nRowBlocks; - - bottom_blob_tm.create(4 * 4, tiles, inch, 2u, opt.workspace_allocator); - - // BT - // const float itm[4][4] = { - // {1.0f, 0.0f, -1.0f, 0.0f}, - // {0.0f, 1.0f, 1.00f, 0.0f}, - // {0.0f, -1.0f, 1.00f, 0.0f}, - // {0.0f, -1.0f, 0.00f, 1.0f} - // }; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < inch; q++) - { - const signed char* img = bottom_blob_bordered.channel(q); - short* out_tm0 = bottom_blob_tm.channel(q); - - for (int j = 0; j < nColBlocks; j++) - { - const signed char* r0 = img + w * j * 2; - const signed char* r1 = r0 + w; - const signed char* r2 = r1 + w; - const signed char* r3 = r2 + w; - - for (int i = 0; i < nRowBlocks; i++) - { - short d0[4], d1[4], d2[4], d3[4]; - short w0[4], w1[4], w2[4], w3[4]; - short t0[4], t1[4], t2[4], t3[4]; - // load - for (int n = 0; n < 4; n++) - { - d0[n] = r0[n]; - d1[n] = 
r1[n]; - d2[n] = r2[n]; - d3[n] = r3[n]; - } - // w = B_t * d - for (int n = 0; n < 4; n++) - { - w0[n] = d0[n] - d2[n]; - w1[n] = d1[n] + d2[n]; - w2[n] = d2[n] - d1[n]; - w3[n] = d3[n] - d1[n]; - } - // transpose d to d_t - { - t0[0] = w0[0]; - t1[0] = w0[1]; - t2[0] = w0[2]; - t3[0] = w0[3]; - t0[1] = w1[0]; - t1[1] = w1[1]; - t2[1] = w1[2]; - t3[1] = w1[3]; - t0[2] = w2[0]; - t1[2] = w2[1]; - t2[2] = w2[2]; - t3[2] = w2[3]; - t0[3] = w3[0]; - t1[3] = w3[1]; - t2[3] = w3[2]; - t3[3] = w3[3]; - } - // U = B_t * d_t - for (int n = 0; n < 4; n++) - { - d0[n] = t0[n] - t2[n]; - d1[n] = t1[n] + t2[n]; - d2[n] = t2[n] - t1[n]; - d3[n] = t3[n] - t1[n]; - } - // save to out_tm - for (int n = 0; n < 4; n++) - { - out_tm0[n] = d0[n]; - out_tm0[n + 4] = d1[n]; - out_tm0[n + 8] = d2[n]; - out_tm0[n + 12] = d3[n]; - } - - r0 += 2; - r1 += 2; - r2 += 2; - r3 += 2; - - out_tm0 += 16; - } - } - } - } - bottom_blob_bordered = Mat(); - - // BEGIN dot - Mat top_blob_tm; - { - int w_tm = outw / 2 * 4; - int h_tm = outh / 2 * 4; - - int nColBlocks = h_tm / 4; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 4; - - const int tiles = nColBlocks * nRowBlocks; - - top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator); - - int nn_outch = outch >> 2; - int remain_outch_start = nn_outch << 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * 4; - - Mat out0_tm = top_blob_tm.channel(p); - Mat out1_tm = top_blob_tm.channel(p + 1); - Mat out2_tm = top_blob_tm.channel(p + 2); - Mat out3_tm = top_blob_tm.channel(p + 3); - - const Mat kernel0_tm = kernel_tm.channel(p); - const Mat kernel1_tm = kernel_tm.channel(p + 1); - const Mat kernel2_tm = kernel_tm.channel(p + 2); - const Mat kernel3_tm = kernel_tm.channel(p + 3); - - for (int i = 0; i < tiles; i++) - { - int* output0_tm = out0_tm.row(i); - int* output1_tm = out1_tm.row(i); - int* output2_tm = out2_tm.row(i); - int* output3_tm = out3_tm.row(i); - - int sum0[16] = {0}; - int sum1[16] = {0}; - int sum2[16] = {0}; - int sum3[16] = {0}; - - int q = 0; - for (; q + 3 < inch; q += 4) - { - const short* r0 = bottom_blob_tm.channel(q).row(i); - const short* r1 = bottom_blob_tm.channel(q + 1).row(i); - const short* r2 = bottom_blob_tm.channel(q + 2).row(i); - const short* r3 = bottom_blob_tm.channel(q + 3).row(i); - - const short* k0 = kernel0_tm.row(q); - const short* k1 = kernel1_tm.row(q); - const short* k2 = kernel2_tm.row(q); - const short* k3 = kernel3_tm.row(q); - - for (int n = 0; n < 16; n++) - { - sum0[n] += (int)r0[n] * k0[n]; - k0 += 16; - sum0[n] += (int)r1[n] * k0[n]; - k0 += 16; - sum0[n] += (int)r2[n] * k0[n]; - k0 += 16; - sum0[n] += (int)r3[n] * k0[n]; - k0 -= 16 * 3; - - sum1[n] += (int)r0[n] * k1[n]; - k1 += 16; - sum1[n] += (int)r1[n] * k1[n]; - k1 += 16; - sum1[n] += (int)r2[n] * k1[n]; - k1 += 16; - sum1[n] += (int)r3[n] * k1[n]; - k1 -= 16 * 3; - - sum2[n] += (int)r0[n] * k2[n]; - k2 += 16; - sum2[n] += (int)r1[n] * k2[n]; - k2 += 16; - sum2[n] += (int)r2[n] * k2[n]; - k2 += 16; - sum2[n] += (int)r3[n] * k2[n]; - k2 -= 16 * 3; - - sum3[n] += (int)r0[n] * k3[n]; - k3 += 16; - sum3[n] += (int)r1[n] * k3[n]; - k3 += 16; - sum3[n] += (int)r2[n] * k3[n]; - k3 += 16; - sum3[n] += (int)r3[n] * k3[n]; - k3 -= 16 * 3; - } - } - - for (; q < inch; q++) - { - const short* r0 = bottom_blob_tm.channel(q).row(i); - - const short* k0 = kernel0_tm.row(q); - const short* k1 = kernel1_tm.row(q); - const short* k2 = kernel2_tm.row(q); - const short* k3 = kernel3_tm.row(q); - - 
for (int n = 0; n < 16; n++) - { - sum0[n] += (int)r0[n] * k0[n]; - sum1[n] += (int)r0[n] * k1[n]; - sum2[n] += (int)r0[n] * k2[n]; - sum3[n] += (int)r0[n] * k3[n]; - } - } - - for (int n = 0; n < 16; n++) - { - output0_tm[n] = sum0[n]; - output1_tm[n] = sum1[n]; - output2_tm[n] = sum2[n]; - output3_tm[n] = sum3[n]; - } - } - } - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_outch_start; p < outch; p++) - { - Mat out0_tm = top_blob_tm.channel(p); - const Mat kernel0_tm = kernel_tm.channel(p); - - for (int i = 0; i < tiles; i++) - { - int* output0_tm = out0_tm.row(i); - - int sum0[16] = {0}; - - int q = 0; - for (; q + 3 < inch; q += 4) - { - const short* r0 = bottom_blob_tm.channel(q).row(i); - const short* r1 = bottom_blob_tm.channel(q + 1).row(i); - const short* r2 = bottom_blob_tm.channel(q + 2).row(i); - const short* r3 = bottom_blob_tm.channel(q + 3).row(i); - - const short* k0 = kernel0_tm.row(q); - const short* k1 = kernel0_tm.row(q + 1); - const short* k2 = kernel0_tm.row(q + 2); - const short* k3 = kernel0_tm.row(q + 3); - - for (int n = 0; n < 16; n++) - { - sum0[n] += (int)r0[n] * k0[n]; - sum0[n] += (int)r1[n] * k1[n]; - sum0[n] += (int)r2[n] * k2[n]; - sum0[n] += (int)r3[n] * k3[n]; - } - } - - for (; q < inch; q++) - { - const short* r0 = bottom_blob_tm.channel(q).row(i); - const short* k0 = kernel0_tm.row(q); - - for (int n = 0; n < 16; n++) - { - sum0[n] += (int)r0[n] * k0[n]; - } - } - - for (int n = 0; n < 16; n++) - { - output0_tm[n] = sum0[n]; - } - } - } - } - bottom_blob_tm = Mat(); - // END dot - - // BEGIN transform output - Mat top_blob_bordered; - top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); - { - // AT - // const float itm[2][4] = { - // {1.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 1.0f} - // }; - - int w_tm = outw / 2 * 4; - int h_tm = outh / 2 * 4; - - int nColBlocks = h_tm / 4; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 4; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - Mat out_tm = top_blob_tm.channel(p); - Mat out = top_blob_bordered.channel(p); - - for (int j = 0; j < nColBlocks; j++) - { - int* outRow0 = out.row(j * 2); - int* outRow1 = out.row(j * 2 + 1); - - for (int i = 0; i < nRowBlocks; i++) - { - int* out_tile = out_tm.row(j * nRowBlocks + i); - - int s0[4], s1[4], s2[4], s3[4]; - int w0[4], w1[4]; - int d0[2], d1[2], d2[2], d3[2]; - int o0[2], o1[2]; - // load - for (int n = 0; n < 4; n++) - { - s0[n] = out_tile[n]; - s1[n] = out_tile[n + 4]; - s2[n] = out_tile[n + 8]; - s3[n] = out_tile[n + 12]; - } - // w = A_T * W - for (int n = 0; n < 4; n++) - { - w0[n] = s0[n] + s1[n] + s2[n]; - w1[n] = s1[n] - s2[n] + s3[n]; - } - // transpose w to w_t - { - d0[0] = w0[0]; - d0[1] = w1[0]; - d1[0] = w0[1]; - d1[1] = w1[1]; - d2[0] = w0[2]; - d2[1] = w1[2]; - d3[0] = w0[3]; - d3[1] = w1[3]; - } - // Y = A_T * w_t - for (int n = 0; n < 2; n++) - { - o0[n] = d0[n] + d1[n] + d2[n]; - o1[n] = d1[n] - d2[n] + d3[n]; - } - // save to top blob tm,why right 2,because the G' = G*2 - outRow0[0] = o0[0] >> 2; - outRow0[1] = o0[1] >> 2; - outRow1[0] = o1[0] >> 2; - outRow1[1] = o1[1] >> 2; - - outRow0 += 2; - outRow1 += 2; - } - } - } - } - // END transform output - - // cut result pad - copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); -} - -static void conv3x3s1_winograd43_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const 
Option& opt) -{ - kernel_tm.create(6 * 6, inch, outch, (size_t)2u); - - // G - // const float ktm[6][3] = { - // { 1.0f/4, 0.0f, 0.0f}, - // { -1.0f/6, -1.0f/6, -1.0f/6}, - // { -1.0f/6, 1.0f/6, -1.0f/6}, - // { 1.0f/24, 1.0f/12, 1.0f/6}, - // { 1.0f/24, -1.0f/12, 1.0f/6}, - // { 0.0f, 0.0f, 1.0f} - // }; - const short ktm[6][3] = { - {6, 0, 0}, - {-4, -4, -4}, - {-4, 4, -4}, - {1, 2, 4}, - {1, -2, 4}, - {0, 0, 24} - }; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - for (int q = 0; q < inch; q++) - { - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; - short* kernel_tm0 = kernel_tm.channel(p).row(q); - - // transform kernel - const signed char* k0 = kernel0; - const signed char* k1 = kernel0 + 3; - const signed char* k2 = kernel0 + 6; - - // h - short tmp[6][3]; - for (int i = 0; i < 6; i++) - { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; - } - - // U - for (int j = 0; j < 6; j++) - { - short* tmpp = &tmp[j][0]; - - for (int i = 0; i < 6; i++) - { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } -} - -static void conv3x3s1_winograd43_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) -{ - int w = bottom_blob.w; - int h = bottom_blob.h; - int inch = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - // pad to 4n+2, winograd F(4,3) - Mat bottom_blob_bordered = bottom_blob; - - outw = (outw + 3) / 4 * 4; - outh = (outh + 3) / 4 * 4; - - w = outw + 2; - h = outh + 2; - Option opt_b = opt; - opt_b.blob_allocator = opt.workspace_allocator; - copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt_b); - - // BEGIN transform input - Mat bottom_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 6; - - const int tiles = nColBlocks * nRowBlocks; - - bottom_blob_tm.create(6 * 6, tiles, inch, 2u, opt.workspace_allocator); - - // BT - // const float itm[4][4] = { - // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, - // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, - // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, - // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} - // }; - - // 0 = 4 * r00 - 5 * r02 + r04 - // 1 = -4 * (r01 + r02) + r03 + r04 - // 2 = 4 * (r01 - r02) - r03 + r04 - // 3 = -2 * r01 - r02 + 2 * r03 + r04 - // 4 = 2 * r01 - r02 - 2 * r03 + r04 - // 5 = 4 * r01 - 5 * r03 + r05 - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < inch; q++) - { - const signed char* img = bottom_blob_bordered.channel(q); - short* out_tm0 = bottom_blob_tm.channel(q); - - for (int j = 0; j < nColBlocks; j++) - { - const signed char* r0 = img + w * j * 4; - const signed char* r1 = r0 + w; - const signed char* r2 = r1 + w; - const signed char* r3 = r2 + w; - const signed char* r4 = r3 + w; - const signed char* r5 = r4 + w; - - for (int i = 0; i < nRowBlocks; i++) - { - short d0[6], d1[6], d2[6], d3[6], d4[6], d5[6]; - short w0[6], w1[6], w2[6], w3[6], w4[6], w5[6]; - short t0[6], t1[6], t2[6], t3[6], t4[6], t5[6]; - - // load - for (int n = 0; n < 6; n++) - { - d0[n] = r0[n]; - 
d1[n] = r1[n]; - d2[n] = r2[n]; - d3[n] = r3[n]; - d4[n] = r4[n]; - d5[n] = r5[n]; - } - // w = B_t * d - for (int n = 0; n < 6; n++) - { - w0[n] = 4 * d0[n] - 5 * d2[n] + d4[n]; - w1[n] = -4 * d1[n] - 4 * d2[n] + d3[n] + d4[n]; - w2[n] = 4 * d1[n] - 4 * d2[n] - d3[n] + d4[n]; - w3[n] = -2 * d1[n] - d2[n] + 2 * d3[n] + d4[n]; - w4[n] = 2 * d1[n] - d2[n] - 2 * d3[n] + d4[n]; - w5[n] = 4 * d1[n] - 5 * d3[n] + d5[n]; - } - // transpose d to d_t - { - t0[0] = w0[0]; - t1[0] = w0[1]; - t2[0] = w0[2]; - t3[0] = w0[3]; - t4[0] = w0[4]; - t5[0] = w0[5]; - t0[1] = w1[0]; - t1[1] = w1[1]; - t2[1] = w1[2]; - t3[1] = w1[3]; - t4[1] = w1[4]; - t5[1] = w1[5]; - t0[2] = w2[0]; - t1[2] = w2[1]; - t2[2] = w2[2]; - t3[2] = w2[3]; - t4[2] = w2[4]; - t5[2] = w2[5]; - t0[3] = w3[0]; - t1[3] = w3[1]; - t2[3] = w3[2]; - t3[3] = w3[3]; - t4[3] = w3[4]; - t5[3] = w3[5]; - t0[4] = w4[0]; - t1[4] = w4[1]; - t2[4] = w4[2]; - t3[4] = w4[3]; - t4[4] = w4[4]; - t5[4] = w4[5]; - t0[5] = w5[0]; - t1[5] = w5[1]; - t2[5] = w5[2]; - t3[5] = w5[3]; - t4[5] = w5[4]; - t5[5] = w5[5]; - } - // d = B_t * d_t - for (int n = 0; n < 6; n++) - { - d0[n] = 4 * t0[n] - 5 * t2[n] + t4[n]; - d1[n] = -4 * t1[n] - 4 * t2[n] + t3[n] + t4[n]; - d2[n] = 4 * t1[n] - 4 * t2[n] - t3[n] + t4[n]; - d3[n] = -2 * t1[n] - t2[n] + 2 * t3[n] + t4[n]; - d4[n] = 2 * t1[n] - t2[n] - 2 * t3[n] + t4[n]; - d5[n] = 4 * t1[n] - 5 * t3[n] + t5[n]; - } - // save to out_tm - for (int n = 0; n < 6; n++) - { - out_tm0[n] = d0[n]; - out_tm0[n + 6] = d1[n]; - out_tm0[n + 12] = d2[n]; - out_tm0[n + 18] = d3[n]; - out_tm0[n + 24] = d4[n]; - out_tm0[n + 30] = d5[n]; - } - - r0 += 4; - r1 += 4; - r2 += 4; - r3 += 4; - r4 += 4; - r5 += 4; - - out_tm0 += 36; - } - } - } - } - bottom_blob_bordered = Mat(); - - // BEGIN dot - Mat top_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 6; - - const int tiles = nColBlocks * nRowBlocks; - - top_blob_tm.create(36, tiles, outch, 4u, opt.workspace_allocator); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - Mat out0_tm = top_blob_tm.channel(p); - const Mat kernel0_tm = kernel_tm.channel(p); - - for (int i = 0; i < tiles; i++) - { - int* output0_tm = out0_tm.row(i); - - int sum0[36] = {0}; - - for (int q = 0; q < inch; q++) - { - const short* r0 = bottom_blob_tm.channel(q).row(i); - const short* k0 = kernel0_tm.row(q); - - for (int n = 0; n < 36; n++) - { - sum0[n] += (int)r0[n] * k0[n]; - } - } - - for (int n = 0; n < 36; n++) - { - output0_tm[n] = sum0[n]; - } - } - } - } - bottom_blob_tm = Mat(); - // END dot - - // BEGIN transform output - Mat top_blob_bordered; - top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); - { - // AT - // const float itm[4][6] = { - // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, - // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} - // }; - - // 0 = r00 + r01 + r02 + r03 + r04 - // 1 = r01 - r02 + 2 * (r03 - r04) - // 2 = r01 + r02 + 4 * (r03 + r04) - // 3 = r01 - r02 + 8 * (r03 - r04) + r05 - - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - int nColBlocks = h_tm / 6; // may be the block num in Feathercnn - int nRowBlocks = w_tm / 6; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - Mat out_tm = top_blob_tm.channel(p); - Mat out = top_blob_bordered.channel(p); - - for (int j = 0; j < nColBlocks; 
j++) - { - int* outRow0 = out.row(j * 4); - int* outRow1 = out.row(j * 4 + 1); - int* outRow2 = out.row(j * 4 + 2); - int* outRow3 = out.row(j * 4 + 3); - - for (int i = 0; i < nRowBlocks; i++) - { - int* out_tile = out_tm.row(j * nRowBlocks + i); - - int s0[6], s1[6], s2[6], s3[6], s4[6], s5[6]; - int w0[6], w1[6], w2[6], w3[6]; - int d0[4], d1[4], d2[4], d3[4], d4[4], d5[4]; - int o0[4], o1[4], o2[4], o3[4]; - // load - for (int n = 0; n < 6; n++) - { - s0[n] = out_tile[n]; - s1[n] = out_tile[n + 6]; - s2[n] = out_tile[n + 12]; - s3[n] = out_tile[n + 18]; - s4[n] = out_tile[n + 24]; - s5[n] = out_tile[n + 30]; - } - // w = A_T * W - for (int n = 0; n < 6; n++) - { - w0[n] = s0[n] + s1[n] + s2[n] + s3[n] + s4[n]; - w1[n] = s1[n] - s2[n] + 2 * s3[n] - 2 * s4[n]; - w2[n] = s1[n] + s2[n] + 4 * s3[n] + 4 * s4[n]; - w3[n] = s1[n] - s2[n] + 8 * s3[n] - 8 * s4[n] + s5[n]; - } - // transpose w to w_t - { - d0[0] = w0[0]; - d0[1] = w1[0]; - d0[2] = w2[0]; - d0[3] = w3[0]; - d1[0] = w0[1]; - d1[1] = w1[1]; - d1[2] = w2[1]; - d1[3] = w3[1]; - d2[0] = w0[2]; - d2[1] = w1[2]; - d2[2] = w2[2]; - d2[3] = w3[2]; - d3[0] = w0[3]; - d3[1] = w1[3]; - d3[2] = w2[3]; - d3[3] = w3[3]; - d4[0] = w0[4]; - d4[1] = w1[4]; - d4[2] = w2[4]; - d4[3] = w3[4]; - d5[0] = w0[5]; - d5[1] = w1[5]; - d5[2] = w2[5]; - d5[3] = w3[5]; - } - // Y = A_T * w_t - for (int n = 0; n < 4; n++) - { - o0[n] = d0[n] + d1[n] + d2[n] + d3[n] + d4[n]; - o1[n] = d1[n] - d2[n] + 2 * d3[n] - 2 * d4[n]; - o2[n] = d1[n] + d2[n] + 4 * d3[n] + 4 * d4[n]; - o3[n] = d1[n] - d2[n] + 8 * d3[n] - 8 * d4[n] + d5[n]; - } - // save to top blob tm - for (int n = 0; n < 4; n++) - { - outRow0[n] = o0[n] / 576; - outRow1[n] = o1[n] / 576; - outRow2[n] = o2[n] / 576; - outRow3[n] = o3[n] / 576; - } - - outRow0 += 4; - outRow1 += 4; - outRow2 += 4; - outRow3 += 4; - } - } - } - } - // END transform output - - // cut result pad - copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); -} - static void conv3x3s2_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) { int w = bottom_blob.w; diff --git a/src/layer/x86/convolution_3x3_pack8to1_int8.h b/src/layer/x86/convolution_3x3_pack8to1_int8.h deleted file mode 100644 index d5957faf6d89..000000000000 --- a/src/layer/x86/convolution_3x3_pack8to1_int8.h +++ /dev/null @@ -1,1125 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to1_int8_sse_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to1_int8_sse_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to1_int8_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_xop(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to1_int8_sse_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif -#endif - -static void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(const Mat& kernel, Mat& kernel_tm_pack8to1, int inch, int outch, const Option& opt) -{ -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ - if (ncnn::cpu_support_x86_avx512_vnni()) - { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx512vnni(kernel, kernel_tm_pack8to1, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ - if (ncnn::cpu_support_x86_avx_vnni()) - { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avxvnni(kernel, kernel_tm_pack8to1, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx2(kernel, kernel_tm_pack8to1, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ - if (ncnn::cpu_support_x86_xop()) - { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_xop(kernel, kernel_tm_pack8to1, inch, outch, opt); - return; - } -#endif -#endif - - // winograd43 transform kernel - Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); - - const short ktm[6][3] = { - {6, 0, 0}, - {-4, -4, -4}, - {-4, 4, -4}, - {1, 2, 4}, - {1, -2, 4}, - {0, 0, 6} - }; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - for (int q = 0; q < inch; q++) - { - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; - short* kernel_tm0 = kernel_tm.channel(p).row(q); - - // transform kernel - const signed char* k0 = kernel0; - const signed char* k1 = kernel0 + 3; - const signed char* k2 = kernel0 + 6; - - // h - short tmp[6][3]; - for (int i = 0; i < 6; i++) - { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * 
ktm[i][1] + k2[2] * ktm[i][2]; - } - - // U - for (int j = 0; j < 6; j++) - { - short* tmpp = &tmp[j][0]; - - for (int i = 0; i < 6; i++) - { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } - - // interleave - // src = 36-inch-outch - // dst = 4b-8a-inch/8a-36-outch/4b - kernel_tm_pack8to1.create(8 * inch / 8, 36, outch / 4 + outch % 4, (size_t)2u * 4, 4); - - int p = 0; - for (; p + 3 < outch; p += 4) - { - Mat g0 = kernel_tm_pack8to1.channel(p / 4); - - for (int k = 0; k < 36; k++) - { - short* g00 = g0.row(k); - - for (int q = 0; q + 7 < inch; q += 8) - { - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < 8; j++) - { - const short* k00 = kernel_tm.channel(p + i).row(q + j); - g00[0] = k00[k]; - g00 += 1; - } - } - } - } - } - for (; p < outch; p++) - { - const Mat k0 = kernel_tm.channel(p); - - Mat g0 = kernel_tm_pack8to1.channel(p / 4 + p % 4); - - for (int k = 0; k < 36; k++) - { - short* g00 = g0.row(k); - - for (int q = 0; q + 7 < inch; q += 8) - { - for (int i = 0; i < 8; i++) - { - g00[0] = k0.row(q + i)[k]; - - g00 += 1; - } - } - } - } -} - -static void conv3x3s1_winograd43_pack8to1_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) -{ -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ - if (ncnn::cpu_support_x86_avx512_vnni()) - { - conv3x3s1_winograd43_pack8to1_int8_sse_avx512vnni(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ - if (ncnn::cpu_support_x86_avx_vnni()) - { - conv3x3s1_winograd43_pack8to1_int8_sse_avxvnni(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - conv3x3s1_winograd43_pack8to1_int8_sse_avx2(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ - if (ncnn::cpu_support_x86_xop()) - { - conv3x3s1_winograd43_pack8to1_int8_sse_xop(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif -#endif - - int w = bottom_blob.w; - int h = bottom_blob.h; - int inch = bottom_blob.c; - // size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - // pad to 4n+2 - Mat bottom_blob_bordered = bottom_blob; - - outw = (outw + 3) / 4 * 4; - outh = (outh + 3) / 4 * 4; - - w = outw + 2; - h = outh + 2; - copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); - - // BEGIN transform input - Mat bottom_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - const int tiles = w_tm / 6 * h_tm / 6; - - bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); - - // const float itm[4][4] = { - // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, - // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, - // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, - // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} - // }; - - // 0 = 4 * r00 - 5 * r02 + r04 - // 1 = -4 * (r01 + r02) + r04 + r03 - // 2 = 4 * (r01 - r02) + r04 - r03 - // 3 = -2 * (r01 - r03) + r04 - r02 - // 4 = 2 * (r01 - r03) + r04 - r02 - // 5 = 4 * r01 - 5 * r03 + r05 - - #pragma omp parallel for 
num_threads(opt.num_threads) - for (int q = 0; q < inch; q++) - { - const Mat img0 = bottom_blob_bordered.channel(q); - Mat img0_tm = bottom_blob_tm.channel(q); - - short tmp[6][6][8]; - - // tile - for (int i = 0; i < h_tm / 6; i++) - { - for (int j = 0; j < w_tm / 6; j++) - { - const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; - - for (int m = 0; m < 6; m++) - { - // TODO use _mm_cvtepi8_epi16 on sse4.1 - __m128i _r00_01 = _mm_loadu_si128((const __m128i*)r0); - __m128i _r02_03 = _mm_loadu_si128((const __m128i*)(r0 + 16)); - __m128i _r04_05 = _mm_loadu_si128((const __m128i*)(r0 + 32)); - __m128i _extr0001 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r00_01); - __m128i _extr0203 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r02_03); - __m128i _extr0405 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r04_05); - __m128i _r00 = _mm_unpacklo_epi8(_r00_01, _extr0001); - __m128i _r01 = _mm_unpackhi_epi8(_r00_01, _extr0001); - __m128i _r02 = _mm_unpacklo_epi8(_r02_03, _extr0203); - __m128i _r03 = _mm_unpackhi_epi8(_r02_03, _extr0203); - __m128i _r04 = _mm_unpacklo_epi8(_r04_05, _extr0405); - __m128i _r05 = _mm_unpackhi_epi8(_r04_05, _extr0405); - - __m128i _v5 = _mm_set1_epi16(5); - - __m128i _tmp0m = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_r00, 2), _r04), _mm_mullo_epi16(_r02, _v5)); - __m128i _tmp1m = _mm_sub_epi16(_mm_add_epi16(_r04, _r03), _mm_slli_epi16(_mm_add_epi16(_r01, _r02), 2)); - __m128i _tmp2m = _mm_add_epi16(_mm_sub_epi16(_r04, _r03), _mm_slli_epi16(_mm_sub_epi16(_r01, _r02), 2)); - __m128i _tmp3m = _mm_sub_epi16(_mm_sub_epi16(_r04, _r02), _mm_slli_epi16(_mm_sub_epi16(_r01, _r03), 1)); - __m128i _tmp4m = _mm_add_epi16(_mm_sub_epi16(_r04, _r02), _mm_slli_epi16(_mm_sub_epi16(_r01, _r03), 1)); - __m128i _tmp5m = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_r01, 2), _r05), _mm_mullo_epi16(_r03, _v5)); - - _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0m); - _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1m); - _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2m); - _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3m); - _mm_storeu_si128((__m128i*)tmp[4][m], _tmp4m); - _mm_storeu_si128((__m128i*)tmp[5][m], _tmp5m); - - r0 += w * 8; - } - - short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; - short* r0_tm_1 = r0_tm_0 + tiles * 8; - short* r0_tm_2 = r0_tm_0 + tiles * 16; - short* r0_tm_3 = r0_tm_0 + tiles * 24; - short* r0_tm_4 = r0_tm_0 + tiles * 32; - short* r0_tm_5 = r0_tm_0 + tiles * 40; - - for (int m = 0; m < 6; m++) - { - __m128i _tmp00 = _mm_loadu_si128((const __m128i*)tmp[m][0]); - __m128i _tmp01 = _mm_loadu_si128((const __m128i*)tmp[m][1]); - __m128i _tmp02 = _mm_loadu_si128((const __m128i*)tmp[m][2]); - __m128i _tmp03 = _mm_loadu_si128((const __m128i*)tmp[m][3]); - __m128i _tmp04 = _mm_loadu_si128((const __m128i*)tmp[m][4]); - __m128i _tmp05 = _mm_loadu_si128((const __m128i*)tmp[m][5]); - - __m128i _v5 = _mm_set1_epi16(5); - - __m128i _r0tm0 = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_tmp00, 2), _tmp04), _mm_mullo_epi16(_tmp02, _v5)); - __m128i _r0tm1 = _mm_sub_epi16(_mm_add_epi16(_tmp04, _tmp03), _mm_slli_epi16(_mm_add_epi16(_tmp01, _tmp02), 2)); - __m128i _r0tm2 = _mm_add_epi16(_mm_sub_epi16(_tmp04, _tmp03), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp02), 2)); - __m128i _r0tm3 = _mm_sub_epi16(_mm_sub_epi16(_tmp04, _tmp02), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp03), 1)); - __m128i _r0tm4 = _mm_add_epi16(_mm_sub_epi16(_tmp04, _tmp02), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp03), 1)); - __m128i _r0tm5 = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_tmp01, 2), _tmp05), _mm_mullo_epi16(_tmp03, _v5)); - - 
_mm_storeu_si128((__m128i*)r0_tm_0, _r0tm0); - _mm_storeu_si128((__m128i*)r0_tm_1, _r0tm1); - _mm_storeu_si128((__m128i*)r0_tm_2, _r0tm2); - _mm_storeu_si128((__m128i*)r0_tm_3, _r0tm3); - _mm_storeu_si128((__m128i*)r0_tm_4, _r0tm4); - _mm_storeu_si128((__m128i*)r0_tm_5, _r0tm5); - - r0_tm_0 += tiles * 48; - r0_tm_1 += tiles * 48; - r0_tm_2 += tiles * 48; - r0_tm_3 += tiles * 48; - r0_tm_4 += tiles * 48; - r0_tm_5 += tiles * 48; - } - } - } - } - } - bottom_blob_bordered = Mat(); - // END transform input - - // BEGIN dot - Mat top_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - const int tiles = h_tm / 6 * w_tm / 6; - - // permute - // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); - Mat bottom_blob_tm2; -#if __AVX2__ - if (tiles >= 4) - bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else if (tiles >= 2) - bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else // if (tiles >= 1) - bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); -#else - if (tiles >= 2) - bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else // if (tiles >= 1) - bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int r = 0; r < 36; r++) - { - Mat tm2 = bottom_blob_tm2.channel(r); - - // tile - int i = 0; -#if __AVX2__ - for (; i + 3 < tiles; i += 4) - { - short* tmpptr = tm2.row(i / 4); - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m256i _r0 = _mm256_loadu_si256((const __m256i*)r0); - __m256i _r1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); - _mm256_storeu_si256((__m256i*)tmpptr, _r0); - _mm256_storeu_si256((__m256i*)(tmpptr + 16), _r1); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 32; - } - } -#endif - for (; i + 1 < tiles; i += 2) - { -#if __AVX2__ - short* tmpptr = tm2.row(i / 4 + (i % 4) / 2); -#else - short* tmpptr = tm2.row(i / 2); -#endif - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m128i _r0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _r1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - _mm_storeu_si128((__m128i*)tmpptr, _r0); - _mm_storeu_si128((__m128i*)(tmpptr + 8), _r1); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 16; - } - } - for (; i < tiles; i++) - { -#if __AVX2__ - short* tmpptr = tm2.row(i / 4 + (i % 4) / 2 + i % 2); -#else - short* tmpptr = tm2.row(i / 2 + i % 2); -#endif - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m128i _r0 = _mm_loadu_si128((const __m128i*)r0); - _mm_storeu_si128((__m128i*)tmpptr, _r0); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 8; - } - } - } - - bottom_blob_tm = Mat(); - // permute end - - top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); - - int nn_outch = 0; - int remain_outch_start = 0; - - nn_outch = outch >> 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp = 0; pp < nn_outch; pp++) - { - int p = pp * 4; - - int* output0_tm = top_blob_tm.channel(p); - int* output1_tm = top_blob_tm.channel(p + 1); - int* output2_tm = top_blob_tm.channel(p + 2); - int* output3_tm = top_blob_tm.channel(p + 3); - - const 
Mat kernel0_tm = kernel_tm.channel(p / 4); - - for (int r = 0; r < 36; r++) - { - const Mat bb2 = bottom_blob_tm2.channel(r); - - int i = 0; -#if __AVX2__ - for (; i + 3 < tiles; i += 4) - { - const short* r0 = bb2.row(i / 4); - const short* k0 = kernel0_tm.row(r); - - int nn = inch; // inch always > 0 - - __m256i _sum00_11 = _mm256_setzero_si256(); - __m256i _sum10_01 = _mm256_setzero_si256(); - __m256i _sum02_13 = _mm256_setzero_si256(); - __m256i _sum12_03 = _mm256_setzero_si256(); - - __m256i _sum04_15 = _mm256_setzero_si256(); - __m256i _sum14_05 = _mm256_setzero_si256(); - __m256i _sum06_17 = _mm256_setzero_si256(); - __m256i _sum16_07 = _mm256_setzero_si256(); - - for (int j = 0; j < nn; j++) - { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); - _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); - _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); - _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); -#else - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); -#endif - - __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); - - __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); - _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); - _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); - _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); -#else - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); -#endif - - r0 += 32; - k0 += 32; - } - - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); - _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); - _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); - _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); - _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); - _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); - _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); - _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); - _sum06_17 = 
_mm256_add_epi32(_sum06_17, _sum16_07); - _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); - - __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); - _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); - _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); - - int sum[16]; - _mm256_storeu_si256((__m256i*)sum, _sum00_11); - _mm256_storeu_si256((__m256i*)(sum + 8), _sum04_15); - - output0_tm[0] = sum[0]; - output1_tm[0] = sum[1]; - output2_tm[0] = sum[2]; - output3_tm[0] = sum[3]; - output0_tm[1] = sum[4]; - output1_tm[1] = sum[5]; - output2_tm[1] = sum[6]; - output3_tm[1] = sum[7]; - output0_tm[2] = sum[8]; - output1_tm[2] = sum[9]; - output2_tm[2] = sum[10]; - output3_tm[2] = sum[11]; - output0_tm[3] = sum[12]; - output1_tm[3] = sum[13]; - output2_tm[3] = sum[14]; - output3_tm[3] = sum[15]; - output0_tm += 4; - output1_tm += 4; - output2_tm += 4; - output3_tm += 4; - } -#endif - for (; i + 1 < tiles; i += 2) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2); -#else - const short* r0 = bb2.row(i / 2); -#endif - const short* k0 = kernel0_tm.row(r); - - int nn = inch; // inch always > 0 - -#if __AVX2__ - __m256i _sum00_11 = _mm256_setzero_si256(); - __m256i _sum10_01 = _mm256_setzero_si256(); - __m256i _sum02_13 = _mm256_setzero_si256(); - __m256i _sum12_03 = _mm256_setzero_si256(); -#else - __m128i _sum00 = _mm_setzero_si128(); - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); - __m128i _sum10 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif - - for (int j = 0; j < nn; j++) - { -#if __AVX2__ - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); - _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); - _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); - _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); -#else - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); -#endif -#else - // 0 1 2 3 4 5 6 7 - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _w1 = _mm_loadu_si128((const __m128i*)(k0 + 8)); - __m128i _w2 = _mm_loadu_si128((const __m128i*)(k0 + 16)); - __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); - -#if __XOP__ - _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); - _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); - _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); - _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); - _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); - _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); - _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); - _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); -#else - _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); - _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); - _sum02 = 
_mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); - _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); - _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); - _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); - _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); - _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); -#endif -#endif - - r0 += 16; - k0 += 32; - } - -#if __AVX2__ - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); - _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); - _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); - _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); - _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); - - int sum[8]; - _mm256_storeu_si256((__m256i*)sum, _sum00_11); -#else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); - _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); - - int sum[8]; - _mm_storeu_si128((__m128i*)sum, _sum00); - _mm_storeu_si128((__m128i*)(sum + 4), _sum10); -#endif - - output0_tm[0] = sum[0]; - output1_tm[0] = sum[1]; - output2_tm[0] = sum[2]; - output3_tm[0] = sum[3]; - output0_tm[1] = sum[4]; - output1_tm[1] = sum[5]; - output2_tm[1] = sum[6]; - output3_tm[1] = sum[7]; - output0_tm += 2; - output1_tm += 2; - output2_tm += 2; - output3_tm += 2; - } - for (; i < tiles; i++) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2 + i % 2); -#else - const short* r0 = bb2.row(i / 2 + i % 2); -#endif - const short* k0 = kernel0_tm.row(r); - - int nn = inch; // inch always > 0 - -#if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); -#else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); -#endif - - for (int j = 0; j < nn; j++) - { - // 0 1 2 3 4 5 6 7 - __m128i _val = _mm_loadu_si128((const __m128i*)r0); - -#if __AVX2__ - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i 
_valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); -#else - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); -#endif -#else - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _w1 = _mm_loadu_si128((const __m128i*)(k0 + 8)); - __m128i _w2 = _mm_loadu_si128((const __m128i*)(k0 + 16)); - __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); - -#if __XOP__ - _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); -#else - _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); - _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); - _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); - _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); -#endif -#endif - - r0 += 8; - k0 += 32; - } - -#if __AVX2__ - __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); - __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); - __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); - __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); -#endif - - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); - _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); - _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); - _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); - _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); - - _sum0 = _mm_add_epi32(_sum0, _sum2); - - int sum[4]; - _mm_storeu_si128((__m128i*)sum, _sum0); - - output0_tm[0] = sum[0]; - output1_tm[0] = sum[1]; - output2_tm[0] = sum[2]; - output3_tm[0] = sum[3]; - output0_tm += 1; - output1_tm += 1; - output2_tm += 1; - output3_tm += 1; - } - } - } - - remain_outch_start += nn_outch << 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = remain_outch_start; p < outch; p++) - { - int* output0_tm = top_blob_tm.channel(p); - - const Mat kernel0_tm = kernel_tm.channel(p / 4 + p % 4); - - for (int r = 0; r < 36; r++) - { - const Mat bb2 = bottom_blob_tm2.channel(r); - - int i = 0; -#if __AVX2__ - for (; i + 3 < tiles; i += 4) - { - const short* r0 = bb2.row(i / 4); - const short* k0 = kernel0_tm.row(r); - - __m256i _sum01 = _mm256_setzero_si256(); - __m256i _sum23 = _mm256_setzero_si256(); - - for (int q = 0; q < inch; q++) - { - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); - _sum23 = _mm256_dpwssd_epi32(_sum23, _val23, _w01); -#else - _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); - _sum23 = _mm256_add_epi32(_sum23, _mm256_madd_epi16(_val23, _w01)); -#endif - - k0 += 8; - r0 += 32; - } - - __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); - __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); - __m128i _sum2 = _mm256_extracti128_si256(_sum23, 0); - __m128i _sum3 = 
_mm256_extracti128_si256(_sum23, 1); - - output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum1); - output0_tm[2] = _mm_reduce_add_epi32(_sum2); - output0_tm[3] = _mm_reduce_add_epi32(_sum3); - output0_tm += 4; - } -#endif - for (; i + 1 < tiles; i += 2) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2); -#else - const short* r0 = bb2.row(i / 2); -#endif - const short* k0 = kernel0_tm.row(r); - -#if __AVX2__ - __m256i _sum01 = _mm256_setzero_si256(); -#else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); -#endif - - for (int q = 0; q < inch; q++) - { -#if __AVX2__ - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m256i _w01 = _mm256_inserti128_si256(_mm256_castsi128_si256(_w0), _w0, 1); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum01 = _mm256_dpwssd_epi32(_sum01, _val01, _w01); -#else - _sum01 = _mm256_add_epi32(_sum01, _mm256_madd_epi16(_val01, _w01)); -#endif -#else - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - -#if __XOP__ - _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val1, _w0, _sum1); -#else - _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); - _sum1 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum1); -#endif -#endif - - k0 += 8; - r0 += 16; - } - -#if __AVX2__ - __m128i _sum0 = _mm256_extracti128_si256(_sum01, 0); - __m128i _sum1 = _mm256_extracti128_si256(_sum01, 1); -#endif - - output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm[1] = _mm_reduce_add_epi32(_sum1); - output0_tm += 2; - } - for (; i < tiles; i++) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2 + i % 2); -#else - const short* r0 = bb2.row(i / 2 + i % 2); -#endif - const short* k0 = kernel0_tm.row(r); - - __m128i _sum0 = _mm_setzero_si128(); - - for (int q = 0; q < inch; q++) - { - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - -#if __XOP__ - _sum0 = _mm_maddd_epi16(_val0, _w0, _sum0); -#else - _sum0 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum0); -#endif - - k0 += 8; - r0 += 8; - } - - output0_tm[0] = _mm_reduce_add_epi32(_sum0); - output0_tm++; - } - } - } - } - bottom_blob_tm = Mat(); - // END dot - - // BEGIN transform output - Mat top_blob_bordered; - if (outw == top_blob.w && outh == top_blob.h) - { - top_blob_bordered = top_blob; - } - else - { - top_blob_bordered.create(outw, outh, outch, 4u, 1, opt.workspace_allocator); - } - { - // const float otm[4][6] = { - // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, - // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} - // }; - - // 0 = r00 + (r01 + r02) + (r03 + r04) - // 1 = (r01 - r02) + (r03 - r04) * 2 - // 2 = (r01 + r02) + (r03 + r04) * 4 - // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 - - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - const int tiles = w_tm / 6 * h_tm / 6; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - const Mat out0_tm = top_blob_tm.channel(p); - Mat out0 = top_blob_bordered.channel(p); - - int tmp[4][6]; - - // tile - for (int i = 0; i < outh / 4; i++) - { - for (int j = 0; j < outw / 4; j++) - { - // top_blob_tm.create(tiles, 36, outch, 4u, 1, opt.workspace_allocator); - - const int* output0_tm_0 = (const int*)out0_tm + (i * 
w_tm / 6 + j) * 1; - const int* output0_tm_1 = output0_tm_0 + tiles * 1; - const int* output0_tm_2 = output0_tm_0 + tiles * 2; - const int* output0_tm_3 = output0_tm_0 + tiles * 3; - const int* output0_tm_4 = output0_tm_0 + tiles * 4; - const int* output0_tm_5 = output0_tm_0 + tiles * 5; - - int* output0 = out0.row(i * 4) + j * 4; - - // 0 = r00 + (r01 + r02) + (r03 + r04) - // 1 = (r01 - r02) + (r03 - r04) * 2 - // 2 = (r01 + r02) + (r03 + r04) * 4 - // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 - - // TODO sse optimize - for (int m = 0; m < 5; m++) - { - int tmp02a = output0_tm_1[0] + output0_tm_2[0]; - int tmp13a = output0_tm_1[0] - output0_tm_2[0]; - - int tmp02b = output0_tm_3[0] + output0_tm_4[0]; - int tmp13b = output0_tm_3[0] - output0_tm_4[0]; - - tmp[0][m] = output0_tm_0[0] + tmp02a + tmp02b; - tmp[1][m] = tmp13a + tmp13b * 2; - tmp[2][m] = tmp02a + tmp02b * 4; - tmp[3][m] = output0_tm_5[0] * 4 + tmp13a + tmp13b * 8; - - output0_tm_0 += tiles * 6; - output0_tm_1 += tiles * 6; - output0_tm_2 += tiles * 6; - output0_tm_3 += tiles * 6; - output0_tm_4 += tiles * 6; - output0_tm_5 += tiles * 6; - } - for (int m = 5; m < 6; m++) - { - int tmp02a = output0_tm_1[0] + output0_tm_2[0]; - int tmp13a = output0_tm_1[0] - output0_tm_2[0]; - - int tmp02b = output0_tm_3[0] + output0_tm_4[0]; - int tmp13b = output0_tm_3[0] - output0_tm_4[0]; - - tmp[0][m] = (output0_tm_0[0] + tmp02a + tmp02b) * 4; - tmp[1][m] = (tmp13a + tmp13b * 2) * 4; - tmp[2][m] = (tmp02a + tmp02b * 4) * 4; - tmp[3][m] = (output0_tm_5[0] * 4 + tmp13a + tmp13b * 8) * 4; - - output0_tm_0 += tiles * 6; - output0_tm_1 += tiles * 6; - output0_tm_2 += tiles * 6; - output0_tm_3 += tiles * 6; - output0_tm_4 += tiles * 6; - output0_tm_5 += tiles * 6; - } - - for (int m = 0; m < 4; m++) - { - const int* tmp0 = tmp[m]; - - int tmp02a = tmp0[1] + tmp0[2]; - int tmp13a = tmp0[1] - tmp0[2]; - - int tmp02b = tmp0[3] + tmp0[4]; - int tmp13b = tmp0[3] - tmp0[4]; - - output0[0] = (tmp0[0] + tmp02a + tmp02b) / 576; - output0[1] = (tmp13a + tmp13b * 2) / 576; - output0[2] = (tmp02a + tmp02b * 4) / 576; - output0[3] = (tmp0[5] + tmp13a + tmp13b * 8) / 576; - - output0 += outw; - } - } - } - } - } - // END transform output - - // cut result pad - copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); -} diff --git a/src/layer/x86/convolution_3x3_pack8to4_int8.h b/src/layer/x86/convolution_3x3_pack8to4_int8.h deleted file mode 100644 index 2bb48ce1903a..000000000000 --- a/src/layer/x86/convolution_3x3_pack8to4_int8.h +++ /dev/null @@ -1,945 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
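// Note on the arithmetic shared by the removed pack8to4 kernels and the new
// convolution_3x3_winograd_int8.h further below: transformed activations and weights are
// kept as int16, and _mm_madd_epi16 (its _mm256_/_mm512_ widenings, or the fused VNNI
// _mm*_dpwssd_epi32 forms) multiplies eight int16 pairs and sums adjacent products into
// four int32 lanes. This is why the packing routines interleave data in pairs along the
// accumulated input-channel dimension. A minimal, self-contained sketch of that single
// accumulation step, with made-up values purely for illustration:

#include <emmintrin.h> // SSE2
#include <stdio.h>

int main()
{
    const short a[8] = {1, 2, 3, 4, 5, 6, 7, 8};     // activations, packed as (k, k+1) pairs
    const short w[8] = {1, -1, 2, -2, 3, -3, 4, -4}; // weights, packed the same way

    __m128i _a = _mm_loadu_si128((const __m128i*)a);
    __m128i _w = _mm_loadu_si128((const __m128i*)w);

    // each int32 lane = a[2i] * w[2i] + a[2i+1] * w[2i+1]
    __m128i _sum = _mm_madd_epi16(_a, _w);

    int out[4];
    _mm_storeu_si128((__m128i*)out, _sum);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // prints: -1 -2 -3 -4
    return 0;
}

// With AVX-VNNI / AVX512-VNNI, _mm256_dpwssd_epi32(acc, a, w) performs this multiply-add
// and the accumulation into acc in a single instruction, which is what the kernels in this
// patch select when __AVXVNNI__ or __AVX512VNNI__ is defined.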
- -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to4_int8_sse_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to4_int8_sse_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to4_int8_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_xop(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt); -void conv3x3s1_winograd43_pack8to4_int8_sse_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt); -#endif -#endif - -static void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(const Mat& kernel, Mat& kernel_tm_pack8, int inch, int outch, const Option& opt) -{ -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ - if (ncnn::cpu_support_x86_avx512_vnni()) - { - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx512vnni(kernel, kernel_tm_pack8, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ - if (ncnn::cpu_support_x86_avx_vnni()) - { - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avxvnni(kernel, kernel_tm_pack8, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx2(kernel, kernel_tm_pack8, inch, outch, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ - if (ncnn::cpu_support_x86_xop()) - { - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_xop(kernel, kernel_tm_pack8, inch, outch, opt); - return; - } -#endif -#endif - - // winograd43 transform kernel - Mat kernel_tm(6 * 6, inch, outch, (size_t)2u); - - const short ktm[6][3] = { - {6, 0, 0}, - {-4, -4, -4}, - {-4, 4, -4}, - {1, 2, 4}, - {1, -2, 4}, - {0, 0, 6} - }; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - for (int q = 0; q < inch; q++) - { - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9 + q * 9; - short* kernel_tm0 = kernel_tm.channel(p).row(q); - - // transform kernel - const signed char* k0 = kernel0; - const signed char* k1 = kernel0 + 3; - const signed char* k2 = kernel0 + 6; - - // h - short tmp[6][3]; - for (int i = 0; i < 6; i++) - { - tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; - tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; - tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * 
ktm[i][2]; - } - - // U - for (int j = 0; j < 6; j++) - { - short* tmpp = &tmp[j][0]; - - for (int i = 0; i < 6; i++) - { - kernel_tm0[j * 6 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; - } - } - } - } - - // interleave - // src = 36-inch-outch - // dst = 4b-8a-inch/8a-36-outch/4b - kernel_tm_pack8.create(inch / 8, 36, outch / 4, (size_t)2u * 32, 32); - - int q = 0; - for (; q + 3 < outch; q += 4) - { - Mat g0 = kernel_tm_pack8.channel(q / 4); - - for (int k = 0; k < 36; k++) - { - short* g00 = g0.row(k); - - for (int p = 0; p + 7 < inch; p += 8) - { - for (int i = 0; i < 4; i++) - { - for (int j = 0; j < 8; j++) - { - const short* k00 = kernel_tm.channel(q + i).row(p + j); - g00[0] = k00[k]; - g00 += 1; - } - } - } - } - } -} - -static void conv3x3s1_winograd43_pack8to4_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) -{ -#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) -#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ - if (ncnn::cpu_support_x86_avx512_vnni()) - { - conv3x3s1_winograd43_pack8to4_int8_sse_avx512vnni(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ - if (ncnn::cpu_support_x86_avx_vnni()) - { - conv3x3s1_winograd43_pack8to4_int8_sse_avxvnni(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ - if (ncnn::cpu_support_x86_avx2()) - { - conv3x3s1_winograd43_pack8to4_int8_sse_avx2(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif - -#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ - if (ncnn::cpu_support_x86_xop()) - { - conv3x3s1_winograd43_pack8to4_int8_sse_xop(bottom_blob, top_blob, kernel_tm, opt); - return; - } -#endif -#endif - - int w = bottom_blob.w; - int h = bottom_blob.h; - int inch = bottom_blob.c; - // size_t elemsize = bottom_blob.elemsize; - int elempack = bottom_blob.elempack; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - // pad to 4n+2 - Mat bottom_blob_bordered = bottom_blob; - - outw = (outw + 3) / 4 * 4; - outh = (outh + 3) / 4 * 4; - - w = outw + 2; - h = outh + 2; - copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, BORDER_CONSTANT, 0.f, opt); - - // BEGIN transform input - Mat bottom_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - const int tiles = w_tm / 6 * h_tm / 6; - - bottom_blob_tm.create(tiles, 36, inch, 2u * elempack, elempack, opt.workspace_allocator); - - // const float itm[4][4] = { - // {4.0f, 0.0f, -5.0f, 0.0f, 1.0f, 0.0f}, - // {0.0f,-4.0f, -4.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, -4.0f,-1.0f, 1.0f, 0.0f}, - // {0.0f,-2.0f, -1.0f, 2.0f, 1.0f, 0.0f}, - // {0.0f, 2.0f, -1.0f,-2.0f, 1.0f, 0.0f}, - // {0.0f, 4.0f, 0.0f,-5.0f, 0.0f, 1.0f} - // }; - - // 0 = 4 * r00 - 5 * r02 + r04 - // 1 = -4 * (r01 + r02) + r04 + r03 - // 2 = 4 * (r01 - r02) + r04 - r03 - // 3 = -2 * (r01 - r03) + r04 - r02 - // 4 = 2 * (r01 - r03) + r04 - r02 - // 5 = 4 * r01 - 5 * r03 + r05 - - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < inch; q++) - { - const Mat img0 = bottom_blob_bordered.channel(q); - Mat img0_tm = bottom_blob_tm.channel(q); - - short tmp[6][6][8]; - - // tile - for (int i = 0; i < h_tm / 6; i++) - { - for (int j = 0; j < w_tm / 6; j++) - { - const signed char* r0 = img0.row(i * 4) + (j * 4) * 8; - - for (int m = 0; m < 6; m++) - { - // TODO use 
_mm_cvtepi8_epi16 on sse4.1 - __m128i _r00_01 = _mm_loadu_si128((const __m128i*)r0); - __m128i _r02_03 = _mm_loadu_si128((const __m128i*)(r0 + 16)); - __m128i _r04_05 = _mm_loadu_si128((const __m128i*)(r0 + 32)); - __m128i _extr0001 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r00_01); - __m128i _extr0203 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r02_03); - __m128i _extr0405 = _mm_cmpgt_epi8(_mm_setzero_si128(), _r04_05); - __m128i _r00 = _mm_unpacklo_epi8(_r00_01, _extr0001); - __m128i _r01 = _mm_unpackhi_epi8(_r00_01, _extr0001); - __m128i _r02 = _mm_unpacklo_epi8(_r02_03, _extr0203); - __m128i _r03 = _mm_unpackhi_epi8(_r02_03, _extr0203); - __m128i _r04 = _mm_unpacklo_epi8(_r04_05, _extr0405); - __m128i _r05 = _mm_unpackhi_epi8(_r04_05, _extr0405); - - __m128i _v5 = _mm_set1_epi16(5); - - __m128i _tmp0m = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_r00, 2), _r04), _mm_mullo_epi16(_r02, _v5)); - __m128i _tmp1m = _mm_sub_epi16(_mm_add_epi16(_r04, _r03), _mm_slli_epi16(_mm_add_epi16(_r01, _r02), 2)); - __m128i _tmp2m = _mm_add_epi16(_mm_sub_epi16(_r04, _r03), _mm_slli_epi16(_mm_sub_epi16(_r01, _r02), 2)); - __m128i _tmp3m = _mm_sub_epi16(_mm_sub_epi16(_r04, _r02), _mm_slli_epi16(_mm_sub_epi16(_r01, _r03), 1)); - __m128i _tmp4m = _mm_add_epi16(_mm_sub_epi16(_r04, _r02), _mm_slli_epi16(_mm_sub_epi16(_r01, _r03), 1)); - __m128i _tmp5m = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_r01, 2), _r05), _mm_mullo_epi16(_r03, _v5)); - - _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0m); - _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1m); - _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2m); - _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3m); - _mm_storeu_si128((__m128i*)tmp[4][m], _tmp4m); - _mm_storeu_si128((__m128i*)tmp[5][m], _tmp5m); - - r0 += w * 8; - } - - short* r0_tm_0 = (short*)img0_tm + (i * w_tm / 6 + j) * 8; - short* r0_tm_1 = r0_tm_0 + tiles * 8; - short* r0_tm_2 = r0_tm_0 + tiles * 16; - short* r0_tm_3 = r0_tm_0 + tiles * 24; - short* r0_tm_4 = r0_tm_0 + tiles * 32; - short* r0_tm_5 = r0_tm_0 + tiles * 40; - - for (int m = 0; m < 6; m++) - { - __m128i _tmp00 = _mm_loadu_si128((const __m128i*)tmp[m][0]); - __m128i _tmp01 = _mm_loadu_si128((const __m128i*)tmp[m][1]); - __m128i _tmp02 = _mm_loadu_si128((const __m128i*)tmp[m][2]); - __m128i _tmp03 = _mm_loadu_si128((const __m128i*)tmp[m][3]); - __m128i _tmp04 = _mm_loadu_si128((const __m128i*)tmp[m][4]); - __m128i _tmp05 = _mm_loadu_si128((const __m128i*)tmp[m][5]); - - __m128i _v5 = _mm_set1_epi16(5); - - __m128i _r0tm0 = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_tmp00, 2), _tmp04), _mm_mullo_epi16(_tmp02, _v5)); - __m128i _r0tm1 = _mm_sub_epi16(_mm_add_epi16(_tmp04, _tmp03), _mm_slli_epi16(_mm_add_epi16(_tmp01, _tmp02), 2)); - __m128i _r0tm2 = _mm_add_epi16(_mm_sub_epi16(_tmp04, _tmp03), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp02), 2)); - __m128i _r0tm3 = _mm_sub_epi16(_mm_sub_epi16(_tmp04, _tmp02), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp03), 1)); - __m128i _r0tm4 = _mm_add_epi16(_mm_sub_epi16(_tmp04, _tmp02), _mm_slli_epi16(_mm_sub_epi16(_tmp01, _tmp03), 1)); - __m128i _r0tm5 = _mm_sub_epi16(_mm_add_epi16(_mm_slli_epi16(_tmp01, 2), _tmp05), _mm_mullo_epi16(_tmp03, _v5)); - - _mm_storeu_si128((__m128i*)r0_tm_0, _r0tm0); - _mm_storeu_si128((__m128i*)r0_tm_1, _r0tm1); - _mm_storeu_si128((__m128i*)r0_tm_2, _r0tm2); - _mm_storeu_si128((__m128i*)r0_tm_3, _r0tm3); - _mm_storeu_si128((__m128i*)r0_tm_4, _r0tm4); - _mm_storeu_si128((__m128i*)r0_tm_5, _r0tm5); - - r0_tm_0 += tiles * 48; - r0_tm_1 += tiles * 48; - r0_tm_2 += tiles * 48; - r0_tm_3 += tiles * 48; - 
r0_tm_4 += tiles * 48; - r0_tm_5 += tiles * 48; - } - } - } - } - } - bottom_blob_bordered = Mat(); - // END transform input - - // BEGIN dot - Mat top_blob_tm; - { - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - - const int tiles = h_tm / 6 * w_tm / 6; - - // permute - // bottom_blob_tm.create(tiles, 36, inch, elemsize, elempack, opt.workspace_allocator); - Mat bottom_blob_tm2; -#if __AVX2__ - if (tiles >= 4) - bottom_blob_tm2.create(4 * inch, tiles / 4 + (tiles % 4) / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else if (tiles >= 2) - bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else // if (tiles >= 1) - bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); -#else - if (tiles >= 2) - bottom_blob_tm2.create(2 * inch, tiles / 2 + tiles % 2, 36, 2u * elempack, elempack, opt.workspace_allocator); - else // if (tiles >= 1) - bottom_blob_tm2.create(1 * inch, tiles, 36, 2u * elempack, elempack, opt.workspace_allocator); -#endif - - #pragma omp parallel for num_threads(opt.num_threads) - for (int r = 0; r < 36; r++) - { - Mat tm2 = bottom_blob_tm2.channel(r); - - // tile - int i = 0; -#if __AVX2__ - for (; i + 3 < tiles; i += 4) - { - short* tmpptr = tm2.row(i / 4); - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m256i _r0 = _mm256_loadu_si256((const __m256i*)r0); - __m256i _r1 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); - _mm256_storeu_si256((__m256i*)tmpptr, _r0); - _mm256_storeu_si256((__m256i*)(tmpptr + 16), _r1); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 32; - } - } -#endif - for (; i + 1 < tiles; i += 2) - { -#if __AVX2__ - short* tmpptr = tm2.row(i / 4 + (i % 4) / 2); -#else - short* tmpptr = tm2.row(i / 2); -#endif - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m128i _r0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _r1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - _mm_storeu_si128((__m128i*)tmpptr, _r0); - _mm_storeu_si128((__m128i*)(tmpptr + 8), _r1); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 16; - } - } - for (; i < tiles; i++) - { -#if __AVX2__ - short* tmpptr = tm2.row(i / 4 + (i % 4) / 2 + i % 2); -#else - short* tmpptr = tm2.row(i / 2 + i % 2); -#endif - - const short* r0 = bottom_blob_tm; - - r0 += (r * tiles + i) * 8; - - for (int q = 0; q < inch; q++) - { - __m128i _r0 = _mm_loadu_si128((const __m128i*)r0); - _mm_storeu_si128((__m128i*)tmpptr, _r0); - r0 += bottom_blob_tm.cstep * 8; - tmpptr += 8; - } - } - } - - bottom_blob_tm = Mat(); - // permute end - - top_blob_tm.create(tiles, 36, outch, 4u * 4, 4, opt.workspace_allocator); - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - int* output0_tm = top_blob_tm.channel(p); - - const Mat kernel0_tm = kernel_tm.channel(p); - - for (int r = 0; r < 36; r++) - { - const Mat bb2 = bottom_blob_tm2.channel(r); - - int i = 0; -#if __AVX2__ - for (; i + 3 < tiles; i += 4) - { - const short* r0 = bb2.row(i / 4); - const short* k0 = kernel0_tm.row(r); - - int nn = inch; // inch always > 0 - - __m256i _sum00_11 = _mm256_setzero_si256(); - __m256i _sum10_01 = _mm256_setzero_si256(); - __m256i _sum02_13 = _mm256_setzero_si256(); - __m256i _sum12_03 = _mm256_setzero_si256(); - - __m256i _sum04_15 = _mm256_setzero_si256(); - __m256i _sum14_05 = _mm256_setzero_si256(); - __m256i _sum06_17 = _mm256_setzero_si256(); - 
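// In the accumulation loop just below, two transformed tiles share one 256-bit register
// (_val01 = [tile0 | tile1]) and two output channels share another (_w01 = [w0 | w1]).
// Swapping the tile halves with _mm256_permute4x64_epi64(_val01, 78) gives [tile1 | tile0],
// so each accumulator picks up a different tile/output-channel pairing: e.g. _sum00_11
// holds tile0*w0 in its low half and tile1*w1 in its high half, while _sum10_01 holds
// tile1*w0 / tile0*w1. The 4x8 transpose after the loop, together with the adds that
// follow, completes the reduction over the eight input-channel values handled per
// iteration, and the final permute lays out each tile's four output channels contiguously
// for the store. On VNNI-capable builds, _mm256_dpwssd_epi32 fuses the
// _mm256_madd_epi16 + _mm256_add_epi32 pair into one instruction.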
__m256i _sum16_07 = _mm256_setzero_si256(); - - for (int j = 0; j < nn; j++) - { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); - _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); - _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); - _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); -#else - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); -#endif - - __m256i _val23 = _mm256_loadu_si256((const __m256i*)(r0 + 16)); - - __m256i _val32 = _mm256_permute4x64_epi64(_val23, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum04_15 = _mm256_dpwssd_epi32(_sum04_15, _val23, _w01); - _sum14_05 = _mm256_dpwssd_epi32(_sum14_05, _val32, _w01); - _sum06_17 = _mm256_dpwssd_epi32(_sum06_17, _val23, _w23); - _sum16_07 = _mm256_dpwssd_epi32(_sum16_07, _val32, _w23); -#else - _sum04_15 = _mm256_add_epi32(_sum04_15, _mm256_madd_epi16(_val23, _w01)); - _sum14_05 = _mm256_add_epi32(_sum14_05, _mm256_madd_epi16(_val32, _w01)); - _sum06_17 = _mm256_add_epi32(_sum06_17, _mm256_madd_epi16(_val23, _w23)); - _sum16_07 = _mm256_add_epi32(_sum16_07, _mm256_madd_epi16(_val32, _w23)); -#endif - - r0 += 32; - k0 += 32; - } - - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); - _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); - _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum04_15, _sum14_05); - _tmp1 = _mm256_unpacklo_epi32(_sum06_17, _sum16_07); - _tmp2 = _mm256_unpackhi_epi32(_sum04_15, _sum14_05); - _tmp3 = _mm256_unpackhi_epi32(_sum06_17, _sum16_07); - _sum04_15 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum14_05 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum06_17 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum16_07 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); - _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - - _sum04_15 = _mm256_add_epi32(_sum04_15, _sum14_05); - _sum06_17 = _mm256_add_epi32(_sum06_17, _sum16_07); - _sum04_15 = _mm256_add_epi32(_sum04_15, _sum06_17); - - __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); - _sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); - _sum04_15 = _mm256_permutevar8x32_epi32(_sum04_15, _perm_mask); - - _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); - _mm256_storeu_si256((__m256i*)(output0_tm + 8), _sum04_15); - output0_tm += 16; - } -#endif - for (; i + 1 < tiles; i += 2) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2); -#else - const short* r0 = bb2.row(i / 2); -#endif - const short* k0 = kernel0_tm.row(r); - - int nn = 
inch; // inch always > 0 - -#if __AVX2__ - __m256i _sum00_11 = _mm256_setzero_si256(); - __m256i _sum10_01 = _mm256_setzero_si256(); - __m256i _sum02_13 = _mm256_setzero_si256(); - __m256i _sum12_03 = _mm256_setzero_si256(); -#else - __m128i _sum00 = _mm_setzero_si128(); - __m128i _sum01 = _mm_setzero_si128(); - __m128i _sum02 = _mm_setzero_si128(); - __m128i _sum03 = _mm_setzero_si128(); - __m128i _sum10 = _mm_setzero_si128(); - __m128i _sum11 = _mm_setzero_si128(); - __m128i _sum12 = _mm_setzero_si128(); - __m128i _sum13 = _mm_setzero_si128(); -#endif - - for (int j = 0; j < nn; j++) - { -#if __AVX2__ - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - __m256i _val01 = _mm256_loadu_si256((const __m256i*)r0); - - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i _val10 = _mm256_permute4x64_epi64(_val01, 78); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum00_11 = _mm256_dpwssd_epi32(_sum00_11, _val01, _w01); - _sum10_01 = _mm256_dpwssd_epi32(_sum10_01, _val10, _w01); - _sum02_13 = _mm256_dpwssd_epi32(_sum02_13, _val01, _w23); - _sum12_03 = _mm256_dpwssd_epi32(_sum12_03, _val10, _w23); -#else - _sum00_11 = _mm256_add_epi32(_sum00_11, _mm256_madd_epi16(_val01, _w01)); - _sum10_01 = _mm256_add_epi32(_sum10_01, _mm256_madd_epi16(_val10, _w01)); - _sum02_13 = _mm256_add_epi32(_sum02_13, _mm256_madd_epi16(_val01, _w23)); - _sum12_03 = _mm256_add_epi32(_sum12_03, _mm256_madd_epi16(_val10, _w23)); -#endif -#else - // 0 1 2 3 4 5 6 7 - __m128i _val0 = _mm_loadu_si128((const __m128i*)r0); - __m128i _val1 = _mm_loadu_si128((const __m128i*)(r0 + 8)); - - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _w1 = _mm_loadu_si128((const __m128i*)(k0 + 8)); - __m128i _w2 = _mm_loadu_si128((const __m128i*)(k0 + 16)); - __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); - -#if __XOP__ - _sum00 = _mm_maddd_epi16(_val0, _w0, _sum00); - _sum01 = _mm_maddd_epi16(_val0, _w1, _sum01); - _sum02 = _mm_maddd_epi16(_val0, _w2, _sum02); - _sum03 = _mm_maddd_epi16(_val0, _w3, _sum03); - _sum10 = _mm_maddd_epi16(_val1, _w0, _sum10); - _sum11 = _mm_maddd_epi16(_val1, _w1, _sum11); - _sum12 = _mm_maddd_epi16(_val1, _w2, _sum12); - _sum13 = _mm_maddd_epi16(_val1, _w3, _sum13); -#else - _sum00 = _mm_add_epi32(_mm_madd_epi16(_val0, _w0), _sum00); - _sum01 = _mm_add_epi32(_mm_madd_epi16(_val0, _w1), _sum01); - _sum02 = _mm_add_epi32(_mm_madd_epi16(_val0, _w2), _sum02); - _sum03 = _mm_add_epi32(_mm_madd_epi16(_val0, _w3), _sum03); - _sum10 = _mm_add_epi32(_mm_madd_epi16(_val1, _w0), _sum10); - _sum11 = _mm_add_epi32(_mm_madd_epi16(_val1, _w1), _sum11); - _sum12 = _mm_add_epi32(_mm_madd_epi16(_val1, _w2), _sum12); - _sum13 = _mm_add_epi32(_mm_madd_epi16(_val1, _w3), _sum13); -#endif -#endif - - r0 += 16; - k0 += 32; - } - -#if __AVX2__ - // transpose 4x8 - { - __m256i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm256_unpacklo_epi32(_sum00_11, _sum10_01); - _tmp1 = _mm256_unpacklo_epi32(_sum02_13, _sum12_03); - _tmp2 = _mm256_unpackhi_epi32(_sum00_11, _sum10_01); - _tmp3 = _mm256_unpackhi_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_unpacklo_epi64(_tmp0, _tmp1); - _sum10_01 = _mm256_unpackhi_epi64(_tmp0, _tmp1); - _sum02_13 = _mm256_unpacklo_epi64(_tmp2, _tmp3); - _sum12_03 = _mm256_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum10_01); - _sum02_13 = _mm256_add_epi32(_sum02_13, _sum12_03); - _sum00_11 = _mm256_add_epi32(_sum00_11, _sum02_13); - - __m256i _perm_mask = _mm256_set_epi32(6, 3, 4, 1, 7, 2, 5, 0); - 
_sum00_11 = _mm256_permutevar8x32_epi32(_sum00_11, _perm_mask); - - _mm256_storeu_si256((__m256i*)output0_tm, _sum00_11); -#else - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum00, _sum01); - _tmp1 = _mm_unpacklo_epi32(_sum02, _sum03); - _tmp2 = _mm_unpackhi_epi32(_sum00, _sum01); - _tmp3 = _mm_unpackhi_epi32(_sum02, _sum03); - _sum00 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum01 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum02 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum03 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - _tmp0 = _mm_unpacklo_epi32(_sum10, _sum11); - _tmp1 = _mm_unpacklo_epi32(_sum12, _sum13); - _tmp2 = _mm_unpackhi_epi32(_sum10, _sum11); - _tmp3 = _mm_unpackhi_epi32(_sum12, _sum13); - _sum10 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum11 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum12 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum13 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum00 = _mm_add_epi32(_sum00, _sum01); - _sum02 = _mm_add_epi32(_sum02, _sum03); - _sum10 = _mm_add_epi32(_sum10, _sum11); - _sum12 = _mm_add_epi32(_sum12, _sum13); - - _sum00 = _mm_add_epi32(_sum00, _sum02); - _sum10 = _mm_add_epi32(_sum10, _sum12); - - _mm_storeu_si128((__m128i*)output0_tm, _sum00); - _mm_storeu_si128((__m128i*)(output0_tm + 4), _sum10); -#endif - output0_tm += 8; - } - for (; i < tiles; i++) - { -#if __AVX2__ - const short* r0 = bb2.row(i / 4 + (i % 4) / 2 + i % 2); -#else - const short* r0 = bb2.row(i / 2 + i % 2); -#endif - const short* k0 = kernel0_tm.row(r); - - int nn = inch; // inch always > 0 - -#if __AVX2__ - __m256i _sum0_1 = _mm256_setzero_si256(); - __m256i _sum2_3 = _mm256_setzero_si256(); -#else - __m128i _sum0 = _mm_setzero_si128(); - __m128i _sum1 = _mm_setzero_si128(); - __m128i _sum2 = _mm_setzero_si128(); - __m128i _sum3 = _mm_setzero_si128(); -#endif - - for (int j = 0; j < nn; j++) - { - // 0 1 2 3 4 5 6 7 - __m128i _val = _mm_loadu_si128((const __m128i*)r0); -#if __AVX2__ - __m256i _w01 = _mm256_loadu_si256((const __m256i*)k0); - __m256i _w23 = _mm256_loadu_si256((const __m256i*)(k0 + 16)); - - __m256i _valval = _mm256_inserti128_si256(_mm256_castsi128_si256(_val), _val, 1); - -#if __AVXVNNI__ || __AVX512VNNI__ - _sum0_1 = _mm256_dpwssd_epi32(_sum0_1, _valval, _w01); - _sum2_3 = _mm256_dpwssd_epi32(_sum2_3, _valval, _w23); -#else - _sum0_1 = _mm256_add_epi32(_sum0_1, _mm256_madd_epi16(_valval, _w01)); - _sum2_3 = _mm256_add_epi32(_sum2_3, _mm256_madd_epi16(_valval, _w23)); -#endif -#else - __m128i _w0 = _mm_loadu_si128((const __m128i*)k0); - __m128i _w1 = _mm_loadu_si128((const __m128i*)(k0 + 8)); - __m128i _w2 = _mm_loadu_si128((const __m128i*)(k0 + 16)); - __m128i _w3 = _mm_loadu_si128((const __m128i*)(k0 + 24)); - -#if __XOP__ - _sum0 = _mm_maddd_epi16(_val, _w0, _sum0); - _sum1 = _mm_maddd_epi16(_val, _w1, _sum1); - _sum2 = _mm_maddd_epi16(_val, _w2, _sum2); - _sum3 = _mm_maddd_epi16(_val, _w3, _sum3); -#else - _sum0 = _mm_add_epi32(_mm_madd_epi16(_val, _w0), _sum0); - _sum1 = _mm_add_epi32(_mm_madd_epi16(_val, _w1), _sum1); - _sum2 = _mm_add_epi32(_mm_madd_epi16(_val, _w2), _sum2); - _sum3 = _mm_add_epi32(_mm_madd_epi16(_val, _w3), _sum3); -#endif -#endif - - r0 += 8; - k0 += 32; - } - -#if __AVX2__ - __m128i _sum0 = _mm256_extracti128_si256(_sum0_1, 0); - __m128i _sum1 = _mm256_extracti128_si256(_sum0_1, 1); - __m128i _sum2 = _mm256_extracti128_si256(_sum2_3, 0); - __m128i _sum3 = _mm256_extracti128_si256(_sum2_3, 1); -#endif - - // transpose 4x4 - { - __m128i _tmp0, _tmp1, _tmp2, _tmp3; - 
_tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); - _tmp1 = _mm_unpacklo_epi32(_sum2, _sum3); - _tmp2 = _mm_unpackhi_epi32(_sum0, _sum1); - _tmp3 = _mm_unpackhi_epi32(_sum2, _sum3); - _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp1); - _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp1); - _sum2 = _mm_unpacklo_epi64(_tmp2, _tmp3); - _sum3 = _mm_unpackhi_epi64(_tmp2, _tmp3); - } - - _sum0 = _mm_add_epi32(_sum0, _sum1); - _sum2 = _mm_add_epi32(_sum2, _sum3); - - _sum0 = _mm_add_epi32(_sum0, _sum2); - - _mm_storeu_si128((__m128i*)output0_tm, _sum0); - output0_tm += 4; - } - } - } - } - bottom_blob_tm = Mat(); - // END dot - - // BEGIN transform output - Mat top_blob_bordered; - if (outw == top_blob.w && outh == top_blob.h) - { - top_blob_bordered = top_blob; - } - else - { - top_blob_bordered.create(outw, outh, outch, 4u * 4, 4, opt.workspace_allocator); - } - { - // const float otm[4][6] = { - // {1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 2.0f, -2.0f, 0.0f}, - // {0.0f, 1.0f, 1.0f, 4.0f, 4.0f, 0.0f}, - // {0.0f, 1.0f, -1.0f, 8.0f, -8.0f, 1.0f} - // }; - - // 0 = r00 + (r01 + r02) + (r03 + r04) - // 1 = (r01 - r02) + (r03 - r04) * 2 - // 2 = (r01 + r02) + (r03 + r04) * 4 - // 3 = r05 + (r01 - r02) + (r03 - r04) * 8 - - int w_tm = outw / 4 * 6; - int h_tm = outh / 4 * 6; - const int tiles = w_tm / 6 * h_tm / 6; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - const Mat out0_tm = top_blob_tm.channel(p); - Mat out0 = top_blob_bordered.channel(p); - - int tmp[4][6][4]; - - // tile - for (int i = 0; i < outh / 4; i++) - { - for (int j = 0; j < outw / 4; j++) - { - // top_blob_tm.create(tiles, 36, outch, elemsize, elempack); - - const int* output0_tm_0 = (const int*)out0_tm + (i * w_tm / 6 + j) * 4; - const int* output0_tm_1 = output0_tm_0 + tiles * 4; - const int* output0_tm_2 = output0_tm_0 + tiles * 8; - const int* output0_tm_3 = output0_tm_0 + tiles * 12; - const int* output0_tm_4 = output0_tm_0 + tiles * 16; - const int* output0_tm_5 = output0_tm_0 + tiles * 20; - - int* output0 = out0.row(i * 4) + (j * 4) * 4; - - // TODO sse optimize - for (int m = 0; m < 5; m++) - { - __m128i _out0tm0 = _mm_loadu_si128((const __m128i*)output0_tm_0); - __m128i _out0tm1 = _mm_loadu_si128((const __m128i*)output0_tm_1); - __m128i _out0tm2 = _mm_loadu_si128((const __m128i*)output0_tm_2); - __m128i _out0tm3 = _mm_loadu_si128((const __m128i*)output0_tm_3); - __m128i _out0tm4 = _mm_loadu_si128((const __m128i*)output0_tm_4); - __m128i _out0tm5 = _mm_loadu_si128((const __m128i*)output0_tm_5); - - __m128i _tmp02a = _mm_add_epi32(_out0tm1, _out0tm2); - __m128i _tmp13a = _mm_sub_epi32(_out0tm1, _out0tm2); - - __m128i _tmp02b = _mm_add_epi32(_out0tm3, _out0tm4); - __m128i _tmp13b = _mm_sub_epi32(_out0tm3, _out0tm4); - - __m128i _tmp0m = _mm_add_epi32(_mm_add_epi32(_out0tm0, _tmp02a), _tmp02b); - __m128i _tmp1m = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); - __m128i _tmp2m = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); - __m128i _tmp3m = _mm_add_epi32(_mm_add_epi32(_tmp13a, _mm_slli_epi32(_out0tm5, 2)), _mm_slli_epi32(_tmp13b, 3)); - - _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0m); - _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1m); - _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2m); - _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3m); - - output0_tm_0 += tiles * 24; - output0_tm_1 += tiles * 24; - output0_tm_2 += tiles * 24; - output0_tm_3 += tiles * 24; - output0_tm_4 += tiles * 24; - output0_tm_5 += tiles * 24; - } - for (int m = 5; m < 6; m++) - { - __m128i _out0tm0 = 
_mm_loadu_si128((const __m128i*)output0_tm_0); - __m128i _out0tm1 = _mm_loadu_si128((const __m128i*)output0_tm_1); - __m128i _out0tm2 = _mm_loadu_si128((const __m128i*)output0_tm_2); - __m128i _out0tm3 = _mm_loadu_si128((const __m128i*)output0_tm_3); - __m128i _out0tm4 = _mm_loadu_si128((const __m128i*)output0_tm_4); - __m128i _out0tm5 = _mm_loadu_si128((const __m128i*)output0_tm_5); - - __m128i _tmp02a = _mm_add_epi32(_out0tm1, _out0tm2); - __m128i _tmp13a = _mm_sub_epi32(_out0tm1, _out0tm2); - - __m128i _tmp02b = _mm_add_epi32(_out0tm3, _out0tm4); - __m128i _tmp13b = _mm_sub_epi32(_out0tm3, _out0tm4); - - __m128i _tmp0m = _mm_add_epi32(_mm_add_epi32(_out0tm0, _tmp02a), _tmp02b); - __m128i _tmp1m = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); - __m128i _tmp2m = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); - __m128i _tmp3m = _mm_add_epi32(_mm_add_epi32(_tmp13a, _mm_slli_epi32(_out0tm5, 2)), _mm_slli_epi32(_tmp13b, 3)); - - _tmp0m = _mm_slli_epi32(_tmp0m, 2); - _tmp1m = _mm_slli_epi32(_tmp1m, 2); - _tmp2m = _mm_slli_epi32(_tmp2m, 2); - _tmp3m = _mm_slli_epi32(_tmp3m, 2); - - _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0m); - _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1m); - _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2m); - _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3m); - - output0_tm_0 += tiles * 24; - output0_tm_1 += tiles * 24; - output0_tm_2 += tiles * 24; - output0_tm_3 += tiles * 24; - output0_tm_4 += tiles * 24; - output0_tm_5 += tiles * 24; - } - - for (int m = 0; m < 4; m++) - { - __m128i _tmp00 = _mm_loadu_si128((const __m128i*)tmp[m][0]); - __m128i _tmp01 = _mm_loadu_si128((const __m128i*)tmp[m][1]); - __m128i _tmp02 = _mm_loadu_si128((const __m128i*)tmp[m][2]); - __m128i _tmp03 = _mm_loadu_si128((const __m128i*)tmp[m][3]); - __m128i _tmp04 = _mm_loadu_si128((const __m128i*)tmp[m][4]); - __m128i _tmp05 = _mm_loadu_si128((const __m128i*)tmp[m][5]); - - __m128i _tmp02a = _mm_add_epi32(_tmp01, _tmp02); - __m128i _tmp13a = _mm_sub_epi32(_tmp01, _tmp02); - - __m128i _tmp02b = _mm_add_epi32(_tmp03, _tmp04); - __m128i _tmp13b = _mm_sub_epi32(_tmp03, _tmp04); - - __m128i _out00 = _mm_add_epi32(_mm_add_epi32(_tmp00, _tmp02a), _tmp02b); - __m128i _out01 = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); - __m128i _out02 = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); - __m128i _out03 = _mm_add_epi32(_mm_add_epi32(_tmp05, _tmp13a), _mm_slli_epi32(_tmp13b, 3)); - - // TODO use integer trick for division by 576 - __m128 _v576 = _mm_set1_ps(1.0 / 576); - _out00 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_out00), _v576)); - _out01 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_out01), _v576)); - _out02 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_out02), _v576)); - _out03 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_out03), _v576)); - - _mm_storeu_si128((__m128i*)output0, _out00); - _mm_storeu_si128((__m128i*)(output0 + 4), _out01); - _mm_storeu_si128((__m128i*)(output0 + 8), _out02); - _mm_storeu_si128((__m128i*)(output0 + 12), _out03); - - output0 += outw * 4; - } - } - } - } - } - // END transform output - - // cut result pad - copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt); -} diff --git a/src/layer/x86/convolution_3x3_winograd_int8.h b/src/layer/x86/convolution_3x3_winograd_int8.h new file mode 100644 index 000000000000..8c7b891b0dda --- /dev/null +++ b/src/layer/x86/convolution_3x3_winograd_int8.h @@ -0,0 +1,6407 @@ +// Tencent is pleased to support the open source community by making 
ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) +#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ +void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +#endif + +#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ +void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +#endif + +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ +void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt); +void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt); +void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +#endif + +#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ +void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt); +#endif +#endif + +static void pack_A_tile_int8(const Mat& A, Mat& AT, int batch, int max_ii, int max_kk) +{ + const int N = max_kk * batch; + + for (int b = 0; b < batch; b++) + { + short* pp = AT.row(b); + + int ii = 0; +#if __SSE2__ +#if __AVX2__ +#if __AVX512F__ + for (; ii + 15 < max_ii; ii += 16) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + pp[4] = p0[N * 2]; + pp[5] = p0[N * 2 + batch]; + pp[6] = p0[N * 3]; + pp[7] = p0[N * 3 + batch]; + pp[8] = p0[N * 4]; + pp[9] = p0[N * 4 + batch]; + pp[10] = p0[N * 5]; + pp[11] = p0[N * 5 + batch]; + pp[12] = p0[N * 6]; + pp[13] = p0[N * 6 + batch]; + pp[14] = p0[N * 7]; + pp[15] = p0[N * 7 + batch]; + pp[16] = p0[N * 8]; + pp[17] = p0[N * 8 + batch]; + pp[18] = p0[N * 9]; + pp[19] = p0[N * 9 + batch]; + pp[20] = p0[N * 10]; + pp[21] = p0[N * 10 + batch]; + pp[22] = p0[N * 11]; + pp[23] = p0[N * 11 + batch]; + pp[24] = p0[N * 12]; + pp[25] = p0[N * 12 + batch]; + pp[26] = p0[N * 13]; + pp[27] = p0[N * 13 + batch]; + pp[28] = p0[N * 14]; + pp[29] = p0[N * 14 + batch]; + pp[30] = p0[N * 15]; + pp[31] = p0[N * 15 + batch]; + p0 += batch * 2; + pp += 32; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + pp[2] 
= p0[N * 2]; + pp[3] = p0[N * 3]; + pp[4] = p0[N * 4]; + pp[5] = p0[N * 5]; + pp[6] = p0[N * 6]; + pp[7] = p0[N * 7]; + pp[8] = p0[N * 8]; + pp[9] = p0[N * 9]; + pp[10] = p0[N * 10]; + pp[11] = p0[N * 11]; + pp[12] = p0[N * 12]; + pp[13] = p0[N * 13]; + pp[14] = p0[N * 14]; + pp[15] = p0[N * 15]; + p0 += batch; + pp += 16; + } + } +#endif // __AVX512F__ + for (; ii + 7 < max_ii; ii += 8) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + pp[4] = p0[N * 2]; + pp[5] = p0[N * 2 + batch]; + pp[6] = p0[N * 3]; + pp[7] = p0[N * 3 + batch]; + pp[8] = p0[N * 4]; + pp[9] = p0[N * 4 + batch]; + pp[10] = p0[N * 5]; + pp[11] = p0[N * 5 + batch]; + pp[12] = p0[N * 6]; + pp[13] = p0[N * 6 + batch]; + pp[14] = p0[N * 7]; + pp[15] = p0[N * 7 + batch]; + p0 += batch * 2; + pp += 16; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + pp[2] = p0[N * 2]; + pp[3] = p0[N * 3]; + pp[4] = p0[N * 4]; + pp[5] = p0[N * 5]; + pp[6] = p0[N * 6]; + pp[7] = p0[N * 7]; + p0 += batch; + pp += 8; + } + } +#endif // __AVX2__ + for (; ii + 3 < max_ii; ii += 4) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + pp[4] = p0[N * 2]; + pp[5] = p0[N * 2 + batch]; + pp[6] = p0[N * 3]; + pp[7] = p0[N * 3 + batch]; + p0 += batch * 2; + pp += 8; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + pp[2] = p0[N * 2]; + pp[3] = p0[N * 3]; + p0 += batch; + pp += 4; + } + } +#endif // __SSE2__ + for (; ii + 1 < max_ii; ii += 2) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + pp[0] = p0[0]; + pp[1] = p0[batch]; + pp[2] = p0[N]; + pp[3] = p0[N + batch]; + p0 += batch * 2; + pp += 4; + } + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + pp[1] = p0[N]; + p0 += batch; + pp += 2; + } + } + for (; ii < max_ii; ii++) + { + const short* p0 = (const short*)A + ii * N + b; + + int kk = 0; + for (; kk < max_kk; kk++) + { + pp[0] = p0[0]; + p0 += batch; + pp += 1; + } + } + } +} + +static void transpose_pack_B_tile_int8(const Mat& B, Mat& BT, int batch, int max_jj, int max_kk, int nT) +{ + #pragma omp parallel for num_threads(nT) + for (int b = 0; b < batch; b++) + { + short* pp = BT.row(b); + + int jj = 0; +#if __SSE2__ +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + const short* p0 = B; + + int kk = 0; + p0 += (b * max_jj + jj) * 16; + for (; kk + 15 < max_kk; kk += 16) + { + __m512i _r0 = _mm512_loadu_si512((const __m512i*)p0); + __m512i _r1 = _mm512_loadu_si512((const __m512i*)(p0 + 32)); + __m512i _r2 = _mm512_loadu_si512((const __m512i*)(p0 + 64)); + __m512i _r3 = _mm512_loadu_si512((const __m512i*)(p0 + 96)); + __m512i _r4 = _mm512_loadu_si512((const __m512i*)(p0 + 128)); + __m512i _r5 = _mm512_loadu_si512((const __m512i*)(p0 + 160)); + __m512i _r6 = _mm512_loadu_si512((const __m512i*)(p0 + 192)); + __m512i _r7 = _mm512_loadu_si512((const __m512i*)(p0 + 224)); + __m512i _tmp0 = _mm512_shuffle_i32x4(_r0, _r2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_r0, _r2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_r1, _r3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_r1, _r3, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp4 = _mm512_shuffle_i32x4(_r4, _r6, 
_MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp5 = _mm512_shuffle_i32x4(_r4, _r6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp6 = _mm512_shuffle_i32x4(_r5, _r7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp7 = _mm512_shuffle_i32x4(_r5, _r7, _MM_SHUFFLE(3, 2, 3, 2)); + _r0 = _mm512_unpacklo_epi32(_tmp0, _tmp1); + _r1 = _mm512_unpackhi_epi32(_tmp0, _tmp1); + _r2 = _mm512_unpacklo_epi32(_tmp2, _tmp3); + _r3 = _mm512_unpackhi_epi32(_tmp2, _tmp3); + _r4 = _mm512_unpacklo_epi32(_tmp4, _tmp5); + _r5 = _mm512_unpackhi_epi32(_tmp4, _tmp5); + _r6 = _mm512_unpacklo_epi32(_tmp6, _tmp7); + _r7 = _mm512_unpackhi_epi32(_tmp6, _tmp7); + _tmp0 = _mm512_unpacklo_epi64(_r0, _r2); + _tmp1 = _mm512_unpackhi_epi64(_r0, _r2); + _tmp2 = _mm512_unpacklo_epi64(_r1, _r3); + _tmp3 = _mm512_unpackhi_epi64(_r1, _r3); + _tmp4 = _mm512_unpacklo_epi64(_r4, _r6); + _tmp5 = _mm512_unpackhi_epi64(_r4, _r6); + _tmp6 = _mm512_unpacklo_epi64(_r5, _r7); + _tmp7 = _mm512_unpackhi_epi64(_r5, _r7); + _r0 = _mm512_shuffle_i32x4(_tmp0, _tmp4, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_i32x4(_tmp1, _tmp5, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_i32x4(_tmp2, _tmp6, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_i32x4(_tmp3, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _r4 = _mm512_shuffle_i32x4(_tmp0, _tmp4, _MM_SHUFFLE(3, 1, 3, 1)); + _r5 = _mm512_shuffle_i32x4(_tmp1, _tmp5, _MM_SHUFFLE(3, 1, 3, 1)); + _r6 = _mm512_shuffle_i32x4(_tmp2, _tmp6, _MM_SHUFFLE(3, 1, 3, 1)); + _r7 = _mm512_shuffle_i32x4(_tmp3, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + _mm512_storeu_si512((__m512i*)pp, _r0); + _mm512_storeu_si512((__m512i*)(pp + 32), _r1); + _mm512_storeu_si512((__m512i*)(pp + 64), _r2); + _mm512_storeu_si512((__m512i*)(pp + 96), _r3); + _mm512_storeu_si512((__m512i*)(pp + 128), _r4); + _mm512_storeu_si512((__m512i*)(pp + 160), _r5); + _mm512_storeu_si512((__m512i*)(pp + 192), _r6); + _mm512_storeu_si512((__m512i*)(pp + 224), _r7); + p0 += max_jj * batch * 16; + pp += 256; + } + p0 -= (b * max_jj + jj) * 16; + p0 += (b * max_jj + jj) * 8; + for (; kk + 7 < max_kk; kk += 8) + { + __m512i _r0 = _mm512_loadu_si512((const __m512i*)p0); + __m512i _r1 = _mm512_loadu_si512((const __m512i*)(p0 + 32)); + __m512i _r2 = _mm512_loadu_si512((const __m512i*)(p0 + 64)); + __m512i _r3 = _mm512_loadu_si512((const __m512i*)(p0 + 96)); + __m512i _tmp0 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_r2, _r3, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_r2, _r3, _MM_SHUFFLE(3, 1, 3, 1)); + _r0 = _mm512_unpacklo_epi32(_tmp0, _tmp1); + _r1 = _mm512_unpackhi_epi32(_tmp0, _tmp1); + _r2 = _mm512_unpacklo_epi32(_tmp2, _tmp3); + _r3 = _mm512_unpackhi_epi32(_tmp2, _tmp3); + _tmp0 = _mm512_permutex_epi64(_r0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm512_permutex_epi64(_r1, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp2 = _mm512_permutex_epi64(_r2, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp3 = _mm512_permutex_epi64(_r3, _MM_SHUFFLE(3, 1, 2, 0)); + _r0 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(3, 1, 3, 1)); + _r2 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r3 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _mm512_storeu_si512((__m512i*)pp, _r0); + _mm512_storeu_si512((__m512i*)(pp + 32), _r1); + _mm512_storeu_si512((__m512i*)(pp + 64), _r2); + _mm512_storeu_si512((__m512i*)(pp + 96), _r3); + p0 += max_jj * batch * 8; + pp += 128; + } + p0 -= (b * max_jj + 
jj) * 8; + p0 += (b * max_jj + jj) * 2; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _r0 = _mm512_loadu_si512((const __m512i*)p0); + _mm512_storeu_si512((__m512i*)pp, _r0); + p0 += max_jj * batch * 2; + pp += 32; + } + p0 -= (b * max_jj + jj) * 2; + p0 += (b * max_jj + jj); + for (; kk < max_kk; kk++) + { + __m256i _r0 = _mm256_loadu_si256((const __m256i*)p0); + _mm256_store_si256((__m256i*)pp, _r0); + p0 += max_jj * batch; + pp += 16; + } + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + const short* p0 = B; + + int kk = 0; +#if __AVX512F__ + p0 += (b * max_jj + jj) * 16; + for (; kk + 15 < max_kk; kk += 16) + { + __m512i _r0 = _mm512_loadu_si512((const __m512i*)p0); + __m512i _r1 = _mm512_loadu_si512((const __m512i*)(p0 + 32)); + __m512i _r2 = _mm512_loadu_si512((const __m512i*)(p0 + 64)); + __m512i _r3 = _mm512_loadu_si512((const __m512i*)(p0 + 96)); + __m512i _tmp0 = _mm512_shuffle_i32x4(_r0, _r2, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_r0, _r2, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_r1, _r3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_r1, _r3, _MM_SHUFFLE(3, 2, 3, 2)); + _r0 = _mm512_unpacklo_epi32(_tmp0, _tmp1); + _r1 = _mm512_unpackhi_epi32(_tmp0, _tmp1); + _r2 = _mm512_unpacklo_epi32(_tmp2, _tmp3); + _r3 = _mm512_unpackhi_epi32(_tmp2, _tmp3); + _tmp0 = _mm512_unpacklo_epi64(_r0, _r2); + _tmp1 = _mm512_unpackhi_epi64(_r0, _r2); + _tmp2 = _mm512_unpacklo_epi64(_r1, _r3); + _tmp3 = _mm512_unpackhi_epi64(_r1, _r3); + _r0 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm512_shuffle_i32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _r2 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _r3 = _mm512_shuffle_i32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + _mm512_storeu_si512((__m512i*)pp, _r0); + _mm512_storeu_si512((__m512i*)(pp + 32), _r1); + _mm512_storeu_si512((__m512i*)(pp + 64), _r2); + _mm512_storeu_si512((__m512i*)(pp + 96), _r3); + p0 += max_jj * batch * 16; + pp += 128; + } + p0 -= (b * max_jj + jj) * 16; +#endif // __AVX512F__ + p0 += (b * max_jj + jj) * 8; + for (; kk + 7 < max_kk; kk += 8) + { +#if __AVX__ + __m256 _r0 = _mm256_loadu_ps((const float*)p0); + __m256 _r1 = _mm256_loadu_ps((const float*)(p0 + 16)); + __m256 _r2 = _mm256_loadu_ps((const float*)(p0 + 32)); + __m256 _r3 = _mm256_loadu_ps((const float*)(p0 + 48)); + __m256 _tmp0 = _mm256_permute2f128_ps(_r0, _r2, _MM_SHUFFLE(0, 2, 0, 0)); + __m256 _tmp1 = _mm256_permute2f128_ps(_r0, _r2, _MM_SHUFFLE(0, 3, 0, 1)); + __m256 _tmp2 = _mm256_permute2f128_ps(_r1, _r3, _MM_SHUFFLE(0, 2, 0, 0)); + __m256 _tmp3 = _mm256_permute2f128_ps(_r1, _r3, _MM_SHUFFLE(0, 3, 0, 1)); + _r0 = _mm256_unpacklo_ps(_tmp0, _tmp1); + _r1 = _mm256_unpackhi_ps(_tmp0, _tmp1); + _r2 = _mm256_unpacklo_ps(_tmp2, _tmp3); + _r3 = _mm256_unpackhi_ps(_tmp2, _tmp3); + _tmp0 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(_r0), _mm256_castps_pd(_r2))); + _tmp1 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(_r0), _mm256_castps_pd(_r2))); + _tmp2 = _mm256_castpd_ps(_mm256_unpacklo_pd(_mm256_castps_pd(_r1), _mm256_castps_pd(_r3))); + _tmp3 = _mm256_castpd_ps(_mm256_unpackhi_pd(_mm256_castps_pd(_r1), _mm256_castps_pd(_r3))); + _mm256_storeu_ps((float*)pp, _tmp0); + _mm256_storeu_ps((float*)(pp + 16), _tmp1); + _mm256_storeu_ps((float*)(pp + 32), _tmp2); + _mm256_storeu_ps((float*)(pp + 48), _tmp3); +#else + __m128i _r0 = _mm_load_si128((const __m128i*)p0); + __m128i _r1 = _mm_load_si128((const __m128i*)(p0 + 8)); + 
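// The eight 8-short loads here fetch eight consecutive k values for each of the eight
// tiles; transpose4x8_epi32 then treats every register as four int32 lanes (one lane = a
// pair of adjacent k) and transposes tiles against k-pairs. The packed BT tile therefore
// stores, for each k pair, all eight tiles with their two k values kept adjacent -- the
// operand layout the madd/dpwssd accumulation in gemm_transB_packed_tile_int8 consumes.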
+                __m128i _r2 = _mm_load_si128((const __m128i*)(p0 + 8 * 2));
+                __m128i _r3 = _mm_load_si128((const __m128i*)(p0 + 8 * 3));
+                __m128i _r4 = _mm_load_si128((const __m128i*)(p0 + 8 * 4));
+                __m128i _r5 = _mm_load_si128((const __m128i*)(p0 + 8 * 5));
+                __m128i _r6 = _mm_load_si128((const __m128i*)(p0 + 8 * 6));
+                __m128i _r7 = _mm_load_si128((const __m128i*)(p0 + 8 * 7));
+                transpose4x8_epi32(_r0, _r1, _r2, _r3, _r4, _r5, _r6, _r7);
+                _mm_store_si128((__m128i*)pp, _r0);
+                _mm_store_si128((__m128i*)(pp + 8), _r1);
+                _mm_store_si128((__m128i*)(pp + 8 * 2), _r2);
+                _mm_store_si128((__m128i*)(pp + 8 * 3), _r3);
+                _mm_store_si128((__m128i*)(pp + 8 * 4), _r4);
+                _mm_store_si128((__m128i*)(pp + 8 * 5), _r5);
+                _mm_store_si128((__m128i*)(pp + 8 * 6), _r6);
+                _mm_store_si128((__m128i*)(pp + 8 * 7), _r7);
+#endif // __AVX__
+                p0 += max_jj * batch * 8;
+                pp += 64;
+            }
+            p0 -= (b * max_jj + jj) * 8;
+            p0 += (b * max_jj + jj) * 2;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+#if __AVX__
+                __m256 _r0 = _mm256_loadu_ps((const float*)p0);
+                _mm256_storeu_ps((float*)pp, _r0);
+#else
+                __m128i _r0 = _mm_loadu_si128((const __m128i*)p0);
+                __m128i _r1 = _mm_loadu_si128((const __m128i*)(p0 + 8));
+                _mm_store_si128((__m128i*)pp, _r0);
+                _mm_store_si128((__m128i*)(pp + 8), _r1);
+#endif // __AVX__
+                p0 += max_jj * batch * 2;
+                pp += 16;
+            }
+            p0 -= (b * max_jj + jj) * 2;
+            p0 += (b * max_jj + jj);
+            for (; kk < max_kk; kk++)
+            {
+                __m128i _r0 = _mm_loadu_si128((const __m128i*)p0);
+                _mm_store_si128((__m128i*)pp, _r0);
+                p0 += max_jj * batch;
+                pp += 8;
+            }
+        }
+#endif // defined(__x86_64__) || defined(_M_X64)
+        for (; jj + 3 < max_jj; jj += 4)
+        {
+            const short* p0 = B;
+
+            int kk = 0;
+#if __AVX512F__
+            p0 += (b * max_jj + jj) * 16;
+            for (; kk + 15 < max_kk; kk += 16)
+            {
+                __m512i _r0 = _mm512_loadu_si512((const __m512i*)p0);
+                __m512i _r1 = _mm512_loadu_si512((const __m512i*)(p0 + 32));
+                __m512i _tmp0 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(1, 0, 1, 0));
+                __m512i _tmp1 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(3, 2, 3, 2));
+                _r0 = _mm512_unpacklo_epi32(_tmp0, _tmp1);
+                _r1 = _mm512_unpackhi_epi32(_tmp0, _tmp1);
+                _r0 = _mm512_permutex_epi64(_r0, _MM_SHUFFLE(3, 1, 2, 0));
+                _r1 = _mm512_permutex_epi64(_r1, _MM_SHUFFLE(3, 1, 2, 0));
+                _tmp0 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(1, 0, 1, 0));
+                _tmp1 = _mm512_shuffle_i32x4(_r0, _r1, _MM_SHUFFLE(3, 2, 3, 2));
+                _r0 = _mm512_unpacklo_epi64(_tmp0, _tmp1);
+                _r1 = _mm512_unpackhi_epi64(_tmp0, _tmp1);
+                _mm512_storeu_si512((__m512i*)pp, _r0);
+                _mm512_storeu_si512((__m512i*)(pp + 32), _r1);
+                p0 += max_jj * batch * 16;
+                pp += 64;
+            }
+            p0 -= (b * max_jj + jj) * 16;
+#endif // __AVX512F__
+            p0 += (b * max_jj + jj) * 8;
+            for (; kk + 7 < max_kk; kk += 8)
+            {
+                __m128i _r0 = _mm_load_si128((const __m128i*)p0);
+                __m128i _r1 = _mm_load_si128((const __m128i*)(p0 + 8));
+                __m128i _r2 = _mm_load_si128((const __m128i*)(p0 + 8 * 2));
+                __m128i _r3 = _mm_load_si128((const __m128i*)(p0 + 8 * 3));
+                transpose4x4_epi32(_r0, _r1, _r2, _r3);
+                _mm_storeu_si128((__m128i*)pp, _r0);
+                _mm_storeu_si128((__m128i*)(pp + 8), _r1);
+                _mm_storeu_si128((__m128i*)(pp + 8 * 2), _r2);
+                _mm_storeu_si128((__m128i*)(pp + 8 * 3), _r3);
+                p0 += max_jj * batch * 8;
+                pp += 32;
+            }
+            p0 -= (b * max_jj + jj) * 8;
+            p0 += (b * max_jj + jj) * 2;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                __m128i _r0 = _mm_loadu_si128((const __m128i*)p0);
+                _mm_storeu_si128((__m128i*)pp, _r0);
+                p0 += max_jj * batch * 2;
+                pp += 8;
+            }
+            p0 -= (b * max_jj + jj) * 2;
+            p0 += (b * max_jj + jj);
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                pp[2] = p0[2];
+                pp[3] = p0[3];
+                p0 += max_jj * batch;
+                pp += 4;
+            }
+        }
+#endif // __SSE2__
+        for (; jj + 1 < max_jj; jj += 2)
+        {
+            const short* p0 = B;
+
+            int kk = 0;
+#if __SSE2__
+#if __AVX512F__
+            p0 += (b * max_jj + jj) * 16;
+            for (; kk + 15 < max_kk; kk += 16)
+            {
+                __m256i _r0 = _mm256_load_si256((const __m256i*)p0);
+                __m256i _r1 = _mm256_load_si256((const __m256i*)(p0 + 16));
+                transpose8x2_epi32(_r0, _r1);
+                _mm256_storeu_si256((__m256i*)pp, _r0);
+                _mm256_storeu_si256((__m256i*)(pp + 16), _r1);
+                p0 += max_jj * batch * 16;
+                pp += 32;
+            }
+            p0 -= (b * max_jj + jj) * 16;
+#endif // __AVX512F__
+            p0 += (b * max_jj + jj) * 8;
+            for (; kk + 7 < max_kk; kk += 8)
+            {
+                __m128i _r0 = _mm_load_si128((const __m128i*)p0);
+                __m128i _r1 = _mm_load_si128((const __m128i*)(p0 + 8));
+                __m128i _tmp0 = _mm_unpacklo_epi32(_r0, _r1);
+                __m128i _tmp1 = _mm_unpackhi_epi32(_r0, _r1);
+                _mm_storeu_si128((__m128i*)pp, _tmp0);
+                _mm_storeu_si128((__m128i*)(pp + 8), _tmp1);
+                p0 += max_jj * batch * 8;
+                pp += 16;
+            }
+            p0 -= (b * max_jj + jj) * 8;
+#endif // __SSE2__
+            p0 += (b * max_jj + jj) * 2;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                pp[2] = p0[2];
+                pp[3] = p0[3];
+                p0 += max_jj * batch * 2;
+                pp += 4;
+            }
+            p0 -= (b * max_jj + jj) * 2;
+            p0 += (b * max_jj + jj);
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                p0 += max_jj * batch;
+                pp += 2;
+            }
+        }
+        for (; jj < max_jj; jj++)
+        {
+            const short* p0 = B;
+
+            int kk = 0;
+#if __SSE2__
+#if __AVX512F__
+            p0 += (b * max_jj + jj) * 16;
+            for (; kk + 15 < max_kk; kk += 16)
+            {
+                __m256i _r0 = _mm256_load_si256((const __m256i*)p0);
+                _mm256_storeu_si256((__m256i*)pp, _r0);
+                p0 += max_jj * batch * 16;
+                pp += 16;
+            }
+            p0 -= (b * max_jj + jj) * 16;
+#endif // __AVX512F__
+            p0 += (b * max_jj + jj) * 8;
+            for (; kk + 7 < max_kk; kk += 8)
+            {
+                __m128i _r0 = _mm_load_si128((const __m128i*)p0);
+                _mm_storeu_si128((__m128i*)pp, _r0);
+                p0 += max_jj * batch * 8;
+                pp += 8;
+            }
+            p0 -= (b * max_jj + jj) * 8;
+#endif // __SSE2__
+            p0 += (b * max_jj + jj) * 2;
+            for (; kk + 1 < max_kk; kk += 2)
+            {
+                pp[0] = p0[0];
+                pp[1] = p0[1];
+                p0 += max_jj * batch * 2;
+                pp += 2;
+            }
+            p0 -= (b * max_jj + jj) * 2;
+            p0 += (b * max_jj + jj);
+            for (; kk < max_kk; kk++)
+            {
+                pp[0] = p0[0];
+                p0 += max_jj * batch;
+                pp += 1;
+            }
+        }
+    }
+}
+
+static void gemm_transB_packed_tile_int8(const Mat& AT_tile, const Mat& BT_tile, Mat& top_blob, int batch, int max_ii, int max_jj, int k, int max_kk, bool k_end)
+{
+    int* outptr = top_blob;
+
+    int ii = 0;
+#if __SSE2__
+#if __AVX2__
+#if __AVX512F__
+    for (; ii + 15 < max_ii; ii += 16)
+    {
+        for (int b = 0; b < batch; b++)
+        {
+            const short* pAT = AT_tile.row(b) + max_kk * ii;
+            const short* pB = BT_tile.row(b);
+
+            int jj = 0;
+#if defined(__x86_64__) || defined(_M_X64)
+            for (; jj + 15 < max_jj; jj += 16)
+            {
+                const short* pA = pAT;
+
+                __m512i _sum0;
+                __m512i _sum1;
+                __m512i _sum2;
+                __m512i _sum3;
+                __m512i _sum4;
+                __m512i _sum5;
+                __m512i _sum6;
+                __m512i _sum7;
+                __m512i _sum8;
+                __m512i _sum9;
+                __m512i _suma;
+                __m512i _sumb;
+                __m512i _sumc;
+                __m512i _sumd;
+                __m512i _sume;
+                __m512i _sumf;
+
+                if (k == 0)
+                {
+                    _sum0 = _mm512_setzero_si512();
+                    _sum1 = _mm512_setzero_si512();
+                    _sum2 = _mm512_setzero_si512();
+                    _sum3 = _mm512_setzero_si512();
+                    _sum4 = _mm512_setzero_si512();
+                    _sum5 = _mm512_setzero_si512();
+                    _sum6 = _mm512_setzero_si512();
+                    _sum7 = _mm512_setzero_si512();
+                    _sum8 = _mm512_setzero_si512();
+                    _sum9 = _mm512_setzero_si512();
+                    _suma = _mm512_setzero_si512();
+                    _sumb = _mm512_setzero_si512();
+                    _sumc = _mm512_setzero_si512();
+                    _sumd = _mm512_setzero_si512();
+                    _sume = _mm512_setzero_si512();
+                    _sumf = _mm512_setzero_si512();
+                }
+                else
+                {
+                    _sum0 = _mm512_load_si512((const __m512i*)outptr);
+                    _sum1 = _mm512_load_si512((const __m512i*)(outptr + 16));
+                    _sum2 = _mm512_load_si512((const __m512i*)(outptr + 32));
+                    _sum3 = _mm512_load_si512((const __m512i*)(outptr + 48));
+                    _sum4 = _mm512_load_si512((const __m512i*)(outptr + 64));
+                    _sum5 = _mm512_load_si512((const __m512i*)(outptr + 80));
+                    _sum6 = _mm512_load_si512((const __m512i*)(outptr + 96));
+                    _sum7 = _mm512_load_si512((const __m512i*)(outptr + 112));
+                    _sum8 = _mm512_load_si512((const __m512i*)(outptr + 128));
+                    _sum9 = _mm512_load_si512((const __m512i*)(outptr + 128 + 16));
+                    _suma = _mm512_load_si512((const __m512i*)(outptr + 128 + 32));
+                    _sumb = _mm512_load_si512((const __m512i*)(outptr + 128 + 48));
+                    _sumc = _mm512_load_si512((const __m512i*)(outptr + 128 + 64));
+                    _sumd = _mm512_load_si512((const __m512i*)(outptr + 128 + 80));
+                    _sume = _mm512_load_si512((const __m512i*)(outptr + 128 + 96));
+                    _sumf = _mm512_load_si512((const __m512i*)(outptr + 128 + 112));
+                }
+
+                int kk = 0;
+                for (; kk + 1 < max_kk; kk += 2)
+                {
+                    __m512i _pA0 = _mm512_loadu_si512((const __m512i*)pA);
+                    __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB);
+                    __m512i _pA1 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(2, 1, 0, 3));
+                    __m512i _pA2 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(1, 0, 3, 2));
+                    __m512i _pA3 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(0, 3, 2, 1));
+                    __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB);
+                    __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC);
+                    __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD);
+
+#if __AVX512VNNI__
+                    _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0);
+                    _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1);
+                    _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2);
+                    _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3);
+                    _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0);
+                    _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1);
+                    _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2);
+                    _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3);
+                    _sum8 = _mm512_dpwssd_epi32(_sum8, _pA2, _pB0);
+                    _sum9 = _mm512_dpwssd_epi32(_sum9, _pA2, _pB1);
+                    _suma = _mm512_dpwssd_epi32(_suma, _pA2, _pB2);
+                    _sumb = _mm512_dpwssd_epi32(_sumb, _pA2, _pB3);
+                    _sumc = _mm512_dpwssd_epi32(_sumc, _pA3, _pB0);
+                    _sumd = _mm512_dpwssd_epi32(_sumd, _pA3, _pB1);
+                    _sume = _mm512_dpwssd_epi32(_sume, _pA3, _pB2);
+                    _sumf = _mm512_dpwssd_epi32(_sumf, _pA3, _pB3);
+#else
+                    _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0));
+                    _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1));
+                    _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2));
+                    _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3));
+                    _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0));
+                    _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1));
+                    _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2));
+                    _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3));
+                    _sum8 = _mm512_add_epi32(_sum8, _mm512_madd_epi16(_pA2, _pB0));
+                    _sum9 = _mm512_add_epi32(_sum9, _mm512_madd_epi16(_pA2, _pB1));
+                    _suma = _mm512_add_epi32(_suma, _mm512_madd_epi16(_pA2, _pB2));
+                    _sumb = _mm512_add_epi32(_sumb, _mm512_madd_epi16(_pA2, _pB3));
+                    _sumc = _mm512_add_epi32(_sumc, _mm512_madd_epi16(_pA3, _pB0));
+                    _sumd =
_mm512_add_epi32(_sumd, _mm512_madd_epi16(_pA3, _pB1)); + _sume = _mm512_add_epi32(_sume, _mm512_madd_epi16(_pA3, _pB2)); + _sumf = _mm512_add_epi32(_sumf, _mm512_madd_epi16(_pA3, _pB3)); +#endif + + pA += 32; + pB += 32; + } + for (; kk < max_kk; kk++) + { + __m512i _pA0 = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pA)); + __m512i _pB0 = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pB)); + + __m512i _pA1 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(2, 1, 0, 3)); + __m512i _pA2 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i _pA3 = _mm512_shuffle_i32x4(_pA0, _pA0, _MM_SHUFFLE(0, 3, 2, 1)); + + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + __m512i _pB2 = _mm512_permutex_epi64(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); + + __m512i _s0 = _mm512_mullo_epi32(_pA0, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA0, _pB1); + __m512i _s2 = _mm512_mullo_epi32(_pA0, _pB2); + __m512i _s3 = _mm512_mullo_epi32(_pA0, _pB3); + __m512i _s4 = _mm512_mullo_epi32(_pA1, _pB0); + __m512i _s5 = _mm512_mullo_epi32(_pA1, _pB1); + __m512i _s6 = _mm512_mullo_epi32(_pA1, _pB2); + __m512i _s7 = _mm512_mullo_epi32(_pA1, _pB3); + __m512i _s8 = _mm512_mullo_epi32(_pA2, _pB0); + __m512i _s9 = _mm512_mullo_epi32(_pA2, _pB1); + __m512i _sa = _mm512_mullo_epi32(_pA2, _pB2); + __m512i _sb = _mm512_mullo_epi32(_pA2, _pB3); + __m512i _sc = _mm512_mullo_epi32(_pA3, _pB0); + __m512i _sd = _mm512_mullo_epi32(_pA3, _pB1); + __m512i _se = _mm512_mullo_epi32(_pA3, _pB2); + __m512i _sf = _mm512_mullo_epi32(_pA3, _pB3); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + _sum2 = _mm512_add_epi32(_sum2, _s2); + _sum3 = _mm512_add_epi32(_sum3, _s3); + _sum4 = _mm512_add_epi32(_sum4, _s4); + _sum5 = _mm512_add_epi32(_sum5, _s5); + _sum6 = _mm512_add_epi32(_sum6, _s6); + _sum7 = _mm512_add_epi32(_sum7, _s7); + _sum8 = _mm512_add_epi32(_sum8, _s8); + _sum9 = _mm512_add_epi32(_sum9, _s9); + _suma = _mm512_add_epi32(_suma, _sa); + _sumb = _mm512_add_epi32(_sumb, _sb); + _sumc = _mm512_add_epi32(_sumc, _sc); + _sumd = _mm512_add_epi32(_sumd, _sd); + _sume = _mm512_add_epi32(_sume, _se); + _sumf = _mm512_add_epi32(_sumf, _sf); + + pA += 16; + pB += 16; + } + + if (k_end) + { + // from + // 00 11 22 33 44 55 66 77 88 99 aa bb cc dd ee ff + // 01 12 23 30 45 56 67 74 89 9a ab b8 cd de ef fc + // 02 13 20 31 46 57 64 75 8a 9b a8 b9 ce df ec fd + // 03 10 21 32 47 54 65 76 8b 98 a9 ba cf dc ed fe + // c0 d1 e2 f3 04 15 26 37 48 59 6a 7b 8c 9d ae bf + // c1 d2 e3 f0 05 16 27 34 49 5a 6b 78 8d 9e af bc + // c2 d3 e0 f1 06 17 24 35 4a 5b 68 79 8e 9f ac bd + // c3 d0 e1 f2 07 14 25 36 4b 58 69 7a 8f 9c ad be + // 80 91 a2 b3 c4 d5 e6 f7 08 19 2a 3b 4c 5d 6e 7f + // 81 92 a3 b0 c5 d6 e7 f4 09 1a 2b 38 4d 5e 6f 7c + // 82 93 a0 b1 c6 d7 e4 f5 0a 1b 28 39 4e 5f 6c 7d + // 83 90 a1 b2 c7 d4 e5 f6 0b 18 29 3a 4f 5c 6d 7e + // 40 51 62 73 84 95 a6 b7 c8 d9 ea fb 0c 1d 2e 3f + // 41 52 63 70 85 96 a7 b4 c9 da eb f8 0d 1e 2f 3c + // 42 53 60 71 86 97 a4 b5 ca db e8 f9 0e 1f 2c 3d + // 43 50 61 72 87 94 a5 b6 cb d8 e9 fa 0f 1c 2d 3e + // to + // 00 10 20 30 44 54 64 74 88 98 a8 b8 cc dc ec fc + // 01 11 21 31 45 55 65 75 89 99 a9 b9 cd dd ed fd + // 02 12 22 32 46 56 66 76 8a 9a aa ba ce de ee fe + // 03 13 23 33 47 57 67 77 8b 9b ab bb cf df ef ff + // c0 d0 e0 f0 04 14 24 34 48 58 68 78 8c 9c ac bc + // c1 d1 e1 f1 05 15 25 35 49 59 69 79 8d 9d ad bd + // c2 d2 e2 f2 06 16 26 36 4a 5a 6a 7a 8e 9e ae be + // c3 d3 e3 f3 07 
17 27 37 4b 5b 6b 7b 8f 9f af bf + // 80 90 a0 b0 c4 d4 e4 f4 08 18 28 38 4c 5c 6c 7c + // 81 91 a1 b1 c5 d5 e5 f5 09 19 29 39 4d 5d 6d 7d + // 82 92 a2 b2 c6 d6 e6 f6 0a 1a 2a 3a 4e 5e 6e 7e + // 83 93 a3 b3 c7 d7 e7 f7 0b 1b 2b 3b 4f 5f 6f 7f + // 40 50 60 70 84 94 a4 b4 c8 d8 e8 f8 0c 1c 2c 3c + // 41 51 61 71 85 95 a5 b5 c9 d9 e9 f9 0d 1d 2d 3d + // 42 52 62 72 86 96 a6 b6 ca da ea fa 0e 1e 2e 3e + // 43 53 63 73 87 97 a7 b7 cb db eb fb 0f 1f 2f 3f + { + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum2 = _mm512_shuffle_epi32(_sum2, _MM_PERM_BADC); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_ADCB); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum6 = _mm512_shuffle_epi32(_sum6, _MM_PERM_BADC); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_ADCB); + _sum9 = _mm512_shuffle_epi32(_sum9, _MM_PERM_CBAD); + _suma = _mm512_shuffle_epi32(_suma, _MM_PERM_BADC); + _sumb = _mm512_shuffle_epi32(_sumb, _MM_PERM_ADCB); + _sumd = _mm512_shuffle_epi32(_sumd, _MM_PERM_CBAD); + _sume = _mm512_shuffle_epi32(_sume, _MM_PERM_BADC); + _sumf = _mm512_shuffle_epi32(_sumf, _MM_PERM_ADCB); + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum3); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum3); + __m512i _tmp2 = _mm512_unpacklo_epi32(_sum2, _sum1); + __m512i _tmp3 = _mm512_unpackhi_epi32(_sum2, _sum1); + __m512i _tmp4 = _mm512_unpacklo_epi32(_sum4, _sum7); + __m512i _tmp5 = _mm512_unpackhi_epi32(_sum4, _sum7); + __m512i _tmp6 = _mm512_unpacklo_epi32(_sum6, _sum5); + __m512i _tmp7 = _mm512_unpackhi_epi32(_sum6, _sum5); + __m512i _tmp8 = _mm512_unpacklo_epi32(_sum8, _sumb); + __m512i _tmp9 = _mm512_unpackhi_epi32(_sum8, _sumb); + __m512i _tmpa = _mm512_unpacklo_epi32(_suma, _sum9); + __m512i _tmpb = _mm512_unpackhi_epi32(_suma, _sum9); + __m512i _tmpc = _mm512_unpacklo_epi32(_sumc, _sumf); + __m512i _tmpd = _mm512_unpackhi_epi32(_sumc, _sumf); + __m512i _tmpe = _mm512_unpacklo_epi32(_sume, _sumd); + __m512i _tmpf = _mm512_unpackhi_epi32(_sume, _sumd); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum4 = _mm512_unpacklo_epi64(_tmp4, _tmp6); + _sum5 = _mm512_unpackhi_epi64(_tmp4, _tmp6); + _sum6 = _mm512_unpacklo_epi64(_tmp7, _tmp5); + _sum7 = _mm512_unpackhi_epi64(_tmp7, _tmp5); + _sum8 = _mm512_unpacklo_epi64(_tmp8, _tmpa); + _sum9 = _mm512_unpackhi_epi64(_tmp8, _tmpa); + _suma = _mm512_unpacklo_epi64(_tmpb, _tmp9); + _sumb = _mm512_unpackhi_epi64(_tmpb, _tmp9); + _sumc = _mm512_unpacklo_epi64(_tmpc, _tmpe); + _sumd = _mm512_unpackhi_epi64(_tmpc, _tmpe); + _sume = _mm512_unpacklo_epi64(_tmpf, _tmpd); + _sumf = _mm512_unpackhi_epi64(_tmpf, _tmpd); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_CBAD); + _sum9 = _mm512_shuffle_epi32(_sum9, _MM_PERM_CBAD); + _sumb = _mm512_shuffle_epi32(_sumb, _MM_PERM_CBAD); + _sumd = _mm512_shuffle_epi32(_sumd, _MM_PERM_CBAD); + _sumf = _mm512_shuffle_epi32(_sumf, _MM_PERM_CBAD); + } + + __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sumc, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_sum4, _sum0, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_sum8, _sum4, _MM_SHUFFLE(0, 2, 0, 2)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_sumc, _sum8, _MM_SHUFFLE(1, 3, 1, 3)); + __m512i _tmp4 = _mm512_shuffle_i32x4(_sum1, _sumd, 
_MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp5 = _mm512_shuffle_i32x4(_sum5, _sum1, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp6 = _mm512_shuffle_i32x4(_sum9, _sum5, _MM_SHUFFLE(0, 2, 0, 2)); + __m512i _tmp7 = _mm512_shuffle_i32x4(_sumd, _sum9, _MM_SHUFFLE(1, 3, 1, 3)); + __m512i _tmp8 = _mm512_shuffle_i32x4(_sum2, _sume, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp9 = _mm512_shuffle_i32x4(_sum6, _sum2, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmpa = _mm512_shuffle_i32x4(_suma, _sum6, _MM_SHUFFLE(0, 2, 0, 2)); + __m512i _tmpb = _mm512_shuffle_i32x4(_sume, _suma, _MM_SHUFFLE(1, 3, 1, 3)); + __m512i _tmpc = _mm512_shuffle_i32x4(_sum3, _sumf, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmpd = _mm512_shuffle_i32x4(_sum7, _sum3, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmpe = _mm512_shuffle_i32x4(_sumb, _sum7, _MM_SHUFFLE(0, 2, 0, 2)); + __m512i _tmpf = _mm512_shuffle_i32x4(_sumf, _sumb, _MM_SHUFFLE(1, 3, 1, 3)); + _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(3, 1, 2, 0)); + _sum1 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(3, 1, 2, 0)); + _sum2 = _mm512_shuffle_i32x4(_tmp8, _tmpa, _MM_SHUFFLE(3, 1, 2, 0)); + _sum3 = _mm512_shuffle_i32x4(_tmpc, _tmpe, _MM_SHUFFLE(3, 1, 2, 0)); + _sum4 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(3, 1, 2, 0)); + _sum5 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(3, 1, 2, 0)); + _sum6 = _mm512_shuffle_i32x4(_tmp9, _tmpb, _MM_SHUFFLE(3, 1, 2, 0)); + _sum7 = _mm512_shuffle_i32x4(_tmpd, _tmpf, _MM_SHUFFLE(3, 1, 2, 0)); + _sum8 = _mm512_shuffle_i32x4(_tmp2, _tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _sum9 = _mm512_shuffle_i32x4(_tmp6, _tmp4, _MM_SHUFFLE(3, 1, 2, 0)); + _suma = _mm512_shuffle_i32x4(_tmpa, _tmp8, _MM_SHUFFLE(3, 1, 2, 0)); + _sumb = _mm512_shuffle_i32x4(_tmpe, _tmpc, _MM_SHUFFLE(3, 1, 2, 0)); + _sumc = _mm512_shuffle_i32x4(_tmp3, _tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + _sumd = _mm512_shuffle_i32x4(_tmp7, _tmp5, _MM_SHUFFLE(3, 1, 2, 0)); + _sume = _mm512_shuffle_i32x4(_tmpb, _tmp9, _MM_SHUFFLE(3, 1, 2, 0)); + _sumf = _mm512_shuffle_i32x4(_tmpf, _tmpd, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm512_store_si512((__m512i*)outptr, _sum0); + _mm512_store_si512((__m512i*)(outptr + 16), _sum1); + _mm512_store_si512((__m512i*)(outptr + 32), _sum2); + _mm512_store_si512((__m512i*)(outptr + 48), _sum3); + _mm512_store_si512((__m512i*)(outptr + 64), _sum4); + _mm512_store_si512((__m512i*)(outptr + 80), _sum5); + _mm512_store_si512((__m512i*)(outptr + 96), _sum6); + _mm512_store_si512((__m512i*)(outptr + 112), _sum7); + _mm512_store_si512((__m512i*)(outptr + 128), _sum8); + _mm512_store_si512((__m512i*)(outptr + 128 + 16), _sum9); + _mm512_store_si512((__m512i*)(outptr + 128 + 32), _suma); + _mm512_store_si512((__m512i*)(outptr + 128 + 48), _sumb); + _mm512_store_si512((__m512i*)(outptr + 128 + 64), _sumc); + _mm512_store_si512((__m512i*)(outptr + 128 + 80), _sumd); + _mm512_store_si512((__m512i*)(outptr + 128 + 96), _sume); + _mm512_store_si512((__m512i*)(outptr + 128 + 112), _sumf); + outptr += 256; + } + for (; jj + 7 < max_jj; jj += 8) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + __m512i _sum2; + __m512i _sum3; + __m512i _sum4; + __m512i _sum5; + __m512i _sum6; + __m512i _sum7; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + _sum2 = _mm512_setzero_si512(); + _sum3 = _mm512_setzero_si512(); + _sum4 = _mm512_setzero_si512(); + _sum5 = _mm512_setzero_si512(); + _sum6 = _mm512_setzero_si512(); + _sum7 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_load_si512((const __m512i*)outptr); + _sum1 = 
_mm512_load_si512((const __m512i*)(outptr + 16)); + _sum2 = _mm512_load_si512((const __m512i*)(outptr + 16 * 2)); + _sum3 = _mm512_load_si512((const __m512i*)(outptr + 16 * 3)); + _sum4 = _mm512_load_si512((const __m512i*)(outptr + 16 * 4)); + _sum5 = _mm512_load_si512((const __m512i*)(outptr + 16 * 5)); + _sum6 = _mm512_load_si512((const __m512i*)(outptr + 16 * 6)); + _sum7 = _mm512_load_si512((const __m512i*)(outptr + 16 * 7)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA0 = _mm512_loadu_si512((const __m512i*)pA); + __m256i _pB = _mm256_loadu_si256((const __m256i*)pB); + __m512i _pA1 = _mm512_permutex_epi64(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i _pB0 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB), _pB, 1); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); + __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm512_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm512_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = _mm512_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm512_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm512_dpwssd_epi32(_sum7, _pA1, _pB3); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); + _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA0, _pB2)); + _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA0, _pB3)); + _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA1, _pB0)); + _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA1, _pB1)); + _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA1, _pB2)); + _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA1, _pB3)); +#endif + + pA += 32; + pB += 16; + } + for (; kk < max_kk; kk++) + { + __m512i _pA0 = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pA)); + __m256i _pB = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i*)pB)); + __m512i _pA1 = _mm512_permutex_epi64(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i _pB0 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB), _pB, 1); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + __m512i _pB2 = _mm512_permutex_epi64(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); + + __m512i _s0 = _mm512_mullo_epi32(_pA0, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA0, _pB1); + __m512i _s2 = _mm512_mullo_epi32(_pA0, _pB2); + __m512i _s3 = _mm512_mullo_epi32(_pA0, _pB3); + __m512i _s4 = _mm512_mullo_epi32(_pA1, _pB0); + __m512i _s5 = _mm512_mullo_epi32(_pA1, _pB1); + __m512i _s6 = _mm512_mullo_epi32(_pA1, _pB2); + __m512i _s7 = _mm512_mullo_epi32(_pA1, _pB3); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + _sum2 = _mm512_add_epi32(_sum2, _s2); + _sum3 = _mm512_add_epi32(_sum3, _s3); + _sum4 = _mm512_add_epi32(_sum4, _s4); + _sum5 = _mm512_add_epi32(_sum5, _s5); + _sum6 = _mm512_add_epi32(_sum6, _s6); + _sum7 = _mm512_add_epi32(_sum7, _s7); + + pA += 16; + pB += 8; + } + + if (k_end) + { + // from + // 00 11 22 33 44 55 66 77 80 91 a2 b3 c4 d5 e6 f7 + // 01 12 23 30 45 56 67 74 81 92 a3 b0 c5 d6 e7 f4 + // 02 13 20 31 46 57 64 75 82 93 a0 b1 c6 d7 e4 f5 + // 03 10 21 32 47 54 65 76 83 90 a1 b2 c7 d4 e5 f6 + // 40 51 62 73 04 15 26 37 c0 d1 e2 f3 84 95 a6 b7 + // 41 52 63 70 05 16 27 34 c1 d2 e3 f0 85 96 a7 b4 + // 42 53 60 71 06 17 
24 35 c2 d3 e0 f1 86 97 a4 b5 + // 43 50 61 72 07 14 25 36 c3 d0 e1 f2 87 94 a5 b6 + // to + // 00 10 20 30 44 54 64 74 80 90 a0 b0 c4 d4 e4 f4 + // 01 11 21 31 45 55 65 75 81 91 a1 b1 c5 d5 e5 f5 + // 02 12 22 32 46 56 66 76 82 92 a2 b2 c6 d6 e6 f6 + // 03 13 23 33 47 57 67 77 83 93 a3 b3 c7 d7 e7 f7 + // 40 50 60 70 04 14 24 34 c0 d0 e0 f0 84 94 a4 b4 + // 41 51 61 71 05 15 25 35 c1 d1 e1 f1 85 95 a5 b5 + // 42 52 62 72 06 16 26 36 c2 d2 e2 f2 86 96 a6 b6 + // 43 53 63 73 07 17 27 37 c3 d3 e3 f3 87 97 a7 b7 + { + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum2 = _mm512_shuffle_epi32(_sum2, _MM_PERM_BADC); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_ADCB); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum6 = _mm512_shuffle_epi32(_sum6, _MM_PERM_BADC); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_ADCB); + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum3); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum3); + __m512i _tmp2 = _mm512_unpacklo_epi32(_sum2, _sum1); + __m512i _tmp3 = _mm512_unpackhi_epi32(_sum2, _sum1); + __m512i _tmp4 = _mm512_unpacklo_epi32(_sum4, _sum7); + __m512i _tmp5 = _mm512_unpackhi_epi32(_sum4, _sum7); + __m512i _tmp6 = _mm512_unpacklo_epi32(_sum6, _sum5); + __m512i _tmp7 = _mm512_unpackhi_epi32(_sum6, _sum5); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum4 = _mm512_unpacklo_epi64(_tmp4, _tmp6); + _sum5 = _mm512_unpackhi_epi64(_tmp4, _tmp6); + _sum6 = _mm512_unpacklo_epi64(_tmp7, _tmp5); + _sum7 = _mm512_unpackhi_epi64(_tmp7, _tmp5); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_CBAD); + } + + // TODO + __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sum4, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_sum1, _sum5, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_sum2, _sum6, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_sum3, _sum7, _MM_SHUFFLE(2, 0, 2, 0)); + __m512i _tmp4 = _mm512_shuffle_i32x4(_sum4, _sum0, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp5 = _mm512_shuffle_i32x4(_sum5, _sum1, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp6 = _mm512_shuffle_i32x4(_sum6, _sum2, _MM_SHUFFLE(3, 1, 3, 1)); + __m512i _tmp7 = _mm512_shuffle_i32x4(_sum7, _sum3, _MM_SHUFFLE(3, 1, 3, 1)); + _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _sum1 = _mm512_shuffle_i32x4(_tmp1, _tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + _sum2 = _mm512_shuffle_i32x4(_tmp2, _tmp2, _MM_SHUFFLE(3, 1, 2, 0)); + _sum3 = _mm512_shuffle_i32x4(_tmp3, _tmp3, _MM_SHUFFLE(3, 1, 2, 0)); + _sum4 = _mm512_shuffle_i32x4(_tmp4, _tmp4, _MM_SHUFFLE(3, 1, 2, 0)); + _sum5 = _mm512_shuffle_i32x4(_tmp5, _tmp5, _MM_SHUFFLE(3, 1, 2, 0)); + _sum6 = _mm512_shuffle_i32x4(_tmp6, _tmp6, _MM_SHUFFLE(3, 1, 2, 0)); + _sum7 = _mm512_shuffle_i32x4(_tmp7, _tmp7, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm512_store_si512((__m512i*)outptr, _sum0); + _mm512_store_si512((__m512i*)(outptr + 16), _sum1); + _mm512_store_si512((__m512i*)(outptr + 16 * 2), _sum2); + _mm512_store_si512((__m512i*)(outptr + 16 * 3), _sum3); + _mm512_store_si512((__m512i*)(outptr + 16 * 4), _sum4); + _mm512_store_si512((__m512i*)(outptr + 16 * 5), _sum5); + _mm512_store_si512((__m512i*)(outptr + 16 * 6), _sum6); + _mm512_store_si512((__m512i*)(outptr + 16 * 7), _sum7); + outptr 
+= 16 * 8; + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + __m512i _sum2; + __m512i _sum3; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + _sum2 = _mm512_setzero_si512(); + _sum3 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_load_si512((const __m512i*)outptr); + _sum1 = _mm512_load_si512((const __m512i*)(outptr + 16)); + _sum2 = _mm512_load_si512((const __m512i*)(outptr + 16 * 2)); + _sum3 = _mm512_load_si512((const __m512i*)(outptr + 16 * 3)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA0 = _mm512_loadu_si512((const __m512i*)pA); + __m512i _pB = _mm512_castsi128_si512(_mm_loadu_si128((const __m128i*)pB)); + __m512i _pB0 = _mm512_shuffle_i32x4(_pB, _pB, _MM_SHUFFLE(0, 0, 0, 0)); + __m512i _pA1 = _mm512_shuffle_epi32(_pA0, _MM_PERM_BADC); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); + _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); + _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); +#endif + + pA += 32; + pB += 8; + } + for (; kk < max_kk; kk++) + { + __m512i _pA0 = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pA)); + __m512i _pB0 = _mm512_cvtepi16_epi32(_mm256_castpd_si256(_mm256_broadcast_sd((const double*)pB))); + __m512i _pA1 = _mm512_permutex_epi64(_pA0, _MM_SHUFFLE(2, 3, 0, 1)); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + + __m512i _s0 = _mm512_mullo_epi32(_pA0, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA0, _pB1); + __m512i _s2 = _mm512_mullo_epi32(_pA1, _pB0); + __m512i _s3 = _mm512_mullo_epi32(_pA1, _pB1); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + _sum2 = _mm512_add_epi32(_sum2, _s2); + _sum3 = _mm512_add_epi32(_sum3, _s3); + + pA += 16; + pB += 4; + } + + if (k_end) + { + // from + // 00 11 22 33 40 51 62 73 80 91 a2 b3 c0 d1 e2 f3 + // 01 12 23 30 41 52 63 70 81 92 a3 b0 c1 d2 e3 f0 + // 20 31 02 13 60 71 42 53 a0 b1 82 93 e0 f1 c2 d3 + // 21 32 03 10 61 72 43 50 a1 b2 83 90 e1 f2 c3 d0 + // to + // 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0 + // 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1 + // 02 12 22 32 42 52 62 72 82 92 a2 b2 c2 d2 e2 f2 + // 03 13 23 33 43 53 63 73 83 93 a3 b3 c3 d3 e3 f3 + { + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum3); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum3); + __m512i _tmp2 = _mm512_unpacklo_epi32(_sum2, _sum1); + __m512i _tmp3 = _mm512_unpackhi_epi32(_sum2, _sum1); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + } + } + + _mm512_store_si512((__m512i*)outptr, _sum0); + _mm512_store_si512((__m512i*)(outptr + 16), _sum1); + _mm512_store_si512((__m512i*)(outptr + 16 * 2), _sum2); + _mm512_store_si512((__m512i*)(outptr + 
16 * 3), _sum3); + outptr += 16 * 4; + } + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_load_si512((const __m512i*)outptr); + _sum1 = _mm512_load_si512((const __m512i*)(outptr + 16)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA = _mm512_loadu_si512((const __m512i*)pA); + __m512i _pB0 = _mm512_castpd_si512(_mm512_set1_pd(((const double*)pB)[0])); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CDAB); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA, _pB1); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA, _pB1)); +#endif + + pA += 32; + pB += 4; + } + for (; kk < max_kk; kk++) + { + __m512i _pA = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pA)); + __m512i _pB0 = _mm512_cvtepi16_epi32(_mm256_set1_epi32(((const int*)pB)[0])); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ABAB); + + __m512i _s0 = _mm512_mullo_epi32(_pA, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA, _pB1); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + + pA += 16; + pB += 2; + } + + if (k_end) + { + // from + // 00 11 20 31 40 51 60 71 80 91 a0 b1 c0 d1 e0 f1 + // 01 10 21 30 41 50 61 70 81 90 a1 b0 c1 d0 e1 f0 + // to + // 00 10 20 30 40 50 60 70 80 90 a0 b0 c0 d0 e0 f0 + // 01 11 21 31 41 51 61 71 81 91 a1 b1 c1 d1 e1 f1 + { + __m512i _tmp0 = _mm512_shuffle_epi32(_sum0, _MM_PERM_DBCA); + __m512i _tmp1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_ACDB); + _sum0 = _mm512_unpacklo_epi32(_tmp0, _tmp1); + _sum1 = _mm512_unpackhi_epi32(_tmp0, _tmp1); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + } + } + + _mm512_store_si512((__m512i*)outptr, _sum0); + _mm512_store_si512((__m512i*)(outptr + 16), _sum1); + outptr += 16 * 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + __m512i _sum0; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_load_si512((const __m512i*)outptr); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA = _mm512_loadu_si512((const __m512i*)pA); + __m512i _pB = _mm512_set1_epi32(((const int*)pB)[0]); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA, _pB); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA, _pB)); +#endif + + pA += 32; + pB += 2; + } + for (; kk < max_kk; kk++) + { + __m512i _pA = _mm512_cvtepi16_epi32(_mm256_load_si256((const __m256i*)pA)); + __m512i _pB = _mm512_set1_epi32(pB[0]); + __m512i _s0 = _mm512_mullo_epi32(_pA, _pB); + _sum0 = _mm512_add_epi32(_sum0, _s0); + + pA += 16; + pB += 1; + } + + _mm512_store_si512((__m512i*)outptr, _sum0); + outptr += 16; + } + } + } +#endif // __AVX512F__ + for (; ii + 7 < max_ii; ii += 8) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + __m512i _sum2; + __m512i _sum3; + __m512i _sum4; + __m512i _sum5; + __m512i _sum6; + __m512i _sum7; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + _sum2 = _mm512_setzero_si512(); + _sum3 = _mm512_setzero_si512(); + _sum4 = 
_mm512_setzero_si512(); + _sum5 = _mm512_setzero_si512(); + _sum6 = _mm512_setzero_si512(); + _sum7 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_loadu_si512((const __m512i*)outptr); + _sum1 = _mm512_loadu_si512((const __m512i*)(outptr + 16)); + _sum2 = _mm512_loadu_si512((const __m512i*)(outptr + 32)); + _sum3 = _mm512_loadu_si512((const __m512i*)(outptr + 48)); + _sum4 = _mm512_loadu_si512((const __m512i*)(outptr + 64)); + _sum5 = _mm512_loadu_si512((const __m512i*)(outptr + 80)); + _sum6 = _mm512_loadu_si512((const __m512i*)(outptr + 96)); + _sum7 = _mm512_loadu_si512((const __m512i*)(outptr + 112)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m256i _pA0 = _mm256_loadu_si256((const __m256i*)pA); + __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); + + __m512i _pA00 = _mm512_inserti32x8(_mm512_castsi256_si512(_pA0), _pA0, 1); + __m512i _pA11 = _mm512_permutex_epi64(_pA00, _MM_SHUFFLE(1, 0, 3, 2)); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + __m512i _pB2 = _mm512_shuffle_epi32(_pB0, _MM_PERM_BADC); + __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB1); + _sum2 = _mm512_dpwssd_epi32(_sum2, _pA00, _pB2); + _sum3 = _mm512_dpwssd_epi32(_sum3, _pA00, _pB3); + _sum4 = _mm512_dpwssd_epi32(_sum4, _pA11, _pB0); + _sum5 = _mm512_dpwssd_epi32(_sum5, _pA11, _pB1); + _sum6 = _mm512_dpwssd_epi32(_sum6, _pA11, _pB2); + _sum7 = _mm512_dpwssd_epi32(_sum7, _pA11, _pB3); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB1)); + _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA00, _pB2)); + _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA00, _pB3)); + _sum4 = _mm512_add_epi32(_sum4, _mm512_madd_epi16(_pA11, _pB0)); + _sum5 = _mm512_add_epi32(_sum5, _mm512_madd_epi16(_pA11, _pB1)); + _sum6 = _mm512_add_epi32(_sum6, _mm512_madd_epi16(_pA11, _pB2)); + _sum7 = _mm512_add_epi32(_sum7, _mm512_madd_epi16(_pA11, _pB3)); +#endif // __AVX512VNNI__ + + pA += 16; + pB += 32; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_load_si128((const __m128i*)pA); + __m256i _pB = _mm256_loadu_si256((const __m256i*)pB); + + __m256i _pA0 = _mm256_cvtepi16_epi32(_pA); + __m512i _pB0 = _mm512_cvtepi16_epi32(_pB); + + __m512i _pA00 = _mm512_inserti32x8(_mm512_castsi256_si512(_pA0), _pA0, 1); + __m512i _pA11 = _mm512_permutex_epi64(_pA00, _MM_SHUFFLE(1, 0, 3, 2)); + + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + __m512i _pB2 = _mm512_permutex_epi64(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + __m512i _pB3 = _mm512_shuffle_epi32(_pB0, _MM_PERM_CBAD); + + __m512i _s0 = _mm512_mullo_epi32(_pA00, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA00, _pB1); + __m512i _s2 = _mm512_mullo_epi32(_pA00, _pB2); + __m512i _s3 = _mm512_mullo_epi32(_pA00, _pB3); + __m512i _s4 = _mm512_mullo_epi32(_pA11, _pB0); + __m512i _s5 = _mm512_mullo_epi32(_pA11, _pB1); + __m512i _s6 = _mm512_mullo_epi32(_pA11, _pB2); + __m512i _s7 = _mm512_mullo_epi32(_pA11, _pB3); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + _sum2 = _mm512_add_epi32(_sum2, _s2); + _sum3 = _mm512_add_epi32(_sum3, _s3); + _sum4 = _mm512_add_epi32(_sum4, _s4); + _sum5 = _mm512_add_epi32(_sum5, _s5); + _sum6 = _mm512_add_epi32(_sum6, _s6); + _sum7 = _mm512_add_epi32(_sum7, _s7); + + pA += 8; + pB += 16; + } + + if (k_end) + { + // from + // 00 11 22 33 44 55 66 77 08 19 2a 3b 4c 5d 
6e 7f + // 01 12 23 30 45 56 67 74 09 1a 2b 38 4d 5e 6f 7c + // 02 13 20 31 46 57 64 75 0a 1b 28 39 4e 5f 6c 7d + // 03 10 21 32 47 54 65 76 0b 18 29 3a 4f 5c 6d 7e + // 40 51 62 73 04 15 26 37 48 59 6a 7b 0c 1d 2e 3f + // 41 52 63 70 05 16 27 34 49 5a 6b 78 0d 1e 2f 3c + // 42 53 60 71 06 17 24 35 4a 5b 68 79 0e 1f 2c 3d + // 43 50 61 72 07 14 25 36 4b 58 69 7a 0f 1c 2d 3e + // to + // 00 10 20 30 44 54 64 74 08 18 28 38 4c 5c 6c 7c + // 01 11 21 31 45 55 65 75 09 19 29 39 4d 5d 6d 7d + // 02 12 22 32 46 56 66 76 0a 1a 2a 3a 4e 5e 6e 7e + // 03 13 23 33 47 57 67 77 0b 1b 2b 3b 4f 5f 6f 7f + // 40 50 60 70 04 14 24 34 48 58 68 78 0c 1c 2c 3c + // 41 51 61 71 05 15 25 35 49 59 69 79 0d 1d 2d 3d + // 42 52 62 72 06 16 26 36 4a 5a 6a 7a 0e 1e 2e 3e + // 43 53 63 73 07 17 27 37 4b 5b 6b 7b 0f 1f 2f 3f + { + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum2 = _mm512_shuffle_epi32(_sum2, _MM_PERM_BADC); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_ADCB); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum6 = _mm512_shuffle_epi32(_sum6, _MM_PERM_BADC); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_ADCB); + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum3); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum3); + __m512i _tmp2 = _mm512_unpacklo_epi32(_sum2, _sum1); + __m512i _tmp3 = _mm512_unpackhi_epi32(_sum2, _sum1); + __m512i _tmp4 = _mm512_unpacklo_epi32(_sum4, _sum7); + __m512i _tmp5 = _mm512_unpackhi_epi32(_sum4, _sum7); + __m512i _tmp6 = _mm512_unpacklo_epi32(_sum6, _sum5); + __m512i _tmp7 = _mm512_unpackhi_epi32(_sum6, _sum5); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum4 = _mm512_unpacklo_epi64(_tmp4, _tmp6); + _sum5 = _mm512_unpackhi_epi64(_tmp4, _tmp6); + _sum6 = _mm512_unpacklo_epi64(_tmp7, _tmp5); + _sum7 = _mm512_unpackhi_epi64(_tmp7, _tmp5); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + _sum5 = _mm512_shuffle_epi32(_sum5, _MM_PERM_CBAD); + _sum7 = _mm512_shuffle_epi32(_sum7, _MM_PERM_CBAD); + } + + __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sum4, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_sum0, _sum4, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_sum1, _sum5, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_sum1, _sum5, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp4 = _mm512_shuffle_i32x4(_sum2, _sum6, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp5 = _mm512_shuffle_i32x4(_sum2, _sum6, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp6 = _mm512_shuffle_i32x4(_sum3, _sum7, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp7 = _mm512_shuffle_i32x4(_sum3, _sum7, _MM_SHUFFLE(3, 2, 3, 2)); + _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(2, 0, 2, 0)); + _sum1 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(2, 0, 2, 0)); + _sum2 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(1, 3, 1, 3)); + _sum3 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(1, 3, 1, 3)); + _sum4 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _sum5 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); + _sum6 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(1, 3, 1, 3)); + _sum7 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(1, 3, 1, 3)); + } + + _mm512_storeu_si512((__m512i*)outptr, _sum0); + _mm512_storeu_si512((__m512i*)(outptr + 16), _sum1); + _mm512_storeu_si512((__m512i*)(outptr + 32), _sum2); + 
_mm512_storeu_si512((__m512i*)(outptr + 48), _sum3); + _mm512_storeu_si512((__m512i*)(outptr + 64), _sum4); + _mm512_storeu_si512((__m512i*)(outptr + 80), _sum5); + _mm512_storeu_si512((__m512i*)(outptr + 96), _sum6); + _mm512_storeu_si512((__m512i*)(outptr + 112), _sum7); + outptr += 128; + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + const short* pA = pAT; + +#if __AVX512F__ + __m512i _sum0; + __m512i _sum1; + __m512i _sum2; + __m512i _sum3; +#else + __m256i _sum0; + __m256i _sum1; + __m256i _sum2; + __m256i _sum3; + __m256i _sum4; + __m256i _sum5; + __m256i _sum6; + __m256i _sum7; +#endif // __AVX512F__ + + if (k == 0) + { +#if __AVX512F__ + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + _sum2 = _mm512_setzero_si512(); + _sum3 = _mm512_setzero_si512(); +#else + _sum0 = _mm256_setzero_si256(); + _sum1 = _mm256_setzero_si256(); + _sum2 = _mm256_setzero_si256(); + _sum3 = _mm256_setzero_si256(); + _sum4 = _mm256_setzero_si256(); + _sum5 = _mm256_setzero_si256(); + _sum6 = _mm256_setzero_si256(); + _sum7 = _mm256_setzero_si256(); +#endif // __AVX512F__ + } + else + { +#if __AVX512F__ + _sum0 = _mm512_loadu_si512((const __m512i*)outptr); + _sum1 = _mm512_loadu_si512((const __m512i*)(outptr + 16)); + _sum2 = _mm512_loadu_si512((const __m512i*)(outptr + 32)); + _sum3 = _mm512_loadu_si512((const __m512i*)(outptr + 48)); +#else + _sum0 = _mm256_load_si256((const __m256i*)outptr); + _sum1 = _mm256_load_si256((const __m256i*)(outptr + 8)); + _sum2 = _mm256_load_si256((const __m256i*)(outptr + 16)); + _sum3 = _mm256_load_si256((const __m256i*)(outptr + 24)); + _sum4 = _mm256_load_si256((const __m256i*)(outptr + 32)); + _sum5 = _mm256_load_si256((const __m256i*)(outptr + 40)); + _sum6 = _mm256_load_si256((const __m256i*)(outptr + 48)); + _sum7 = _mm256_load_si256((const __m256i*)(outptr + 56)); +#endif // __AVX512F__ + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m256i _pA0 = _mm256_loadu_si256((const __m256i*)pA); + __m256i _pB0 = _mm256_loadu_si256((const __m256i*)pB); +#if __AVX512F__ + __m512i _pA00 = _mm512_inserti32x8(_mm512_castsi256_si512(_pA0), _pA0, 1); + __m512i _pA11 = _mm512_permutex_epi64(_pA00, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m512i _pB01 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB0), _pB1, 1); + __m512i _pB23 = _mm512_shuffle_epi32(_pB01, _MM_PERM_BADC); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA00, _pB01); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA00, _pB23); + _sum2 = _mm512_dpwssd_epi32(_sum2, _pA11, _pB01); + _sum3 = _mm512_dpwssd_epi32(_sum3, _pA11, _pB23); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA00, _pB01)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA00, _pB23)); + _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA11, _pB01)); + _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA11, _pB23)); +#endif // __AVX512VNNI__ +#else // __AVX512F__ + __m256i _pA1 = _mm256_permute4x64_epi64(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m256i _pB2 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3)); + +#if __AVXVNNI__ + _sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm256_dpwssd_epi32(_sum2, _pA0, _pB2); + _sum3 = _mm256_dpwssd_epi32(_sum3, _pA0, _pB3); + _sum4 = _mm256_dpwssd_epi32(_sum4, _pA1, _pB0); + _sum5 = 
_mm256_dpwssd_epi32(_sum5, _pA1, _pB1); + _sum6 = _mm256_dpwssd_epi32(_sum6, _pA1, _pB2); + _sum7 = _mm256_dpwssd_epi32(_sum7, _pA1, _pB3); +#else + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); + _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); + _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA0, _pB2)); + _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA0, _pB3)); + _sum4 = _mm256_add_epi32(_sum4, _mm256_madd_epi16(_pA1, _pB0)); + _sum5 = _mm256_add_epi32(_sum5, _mm256_madd_epi16(_pA1, _pB1)); + _sum6 = _mm256_add_epi32(_sum6, _mm256_madd_epi16(_pA1, _pB2)); + _sum7 = _mm256_add_epi32(_sum7, _mm256_madd_epi16(_pA1, _pB3)); +#endif // __AVXVNNI__ +#endif // __AVX512F__ + + pA += 16; + pB += 16; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_load_si128((const __m128i*)pA); + __m128i _pB = _mm_load_si128((const __m128i*)pB); + + __m256i _pA0 = _mm256_cvtepi16_epi32(_pA); + __m256i _pB0 = _mm256_cvtepi16_epi32(_pB); +#if __AVX512F__ + __m512i _pA00 = _mm512_inserti32x8(_mm512_castsi256_si512(_pA0), _pA0, 1); + __m512i _pA11 = _mm512_shuffle_i32x4(_pA00, _pA00, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m512i _pB01 = _mm512_inserti32x8(_mm512_castsi256_si512(_pB0), _pB1, 1); + __m512i _pB23 = _mm512_permutex_epi64(_pB01, _MM_SHUFFLE(2, 3, 0, 1)); + + __m512i _s01 = _mm512_mullo_epi32(_pA00, _pB01); + __m512i _s23 = _mm512_mullo_epi32(_pA00, _pB23); + __m512i _s45 = _mm512_mullo_epi32(_pA11, _pB01); + __m512i _s67 = _mm512_mullo_epi32(_pA11, _pB23); + _sum0 = _mm512_add_epi32(_sum0, _s01); + _sum1 = _mm512_add_epi32(_sum1, _s23); + _sum2 = _mm512_add_epi32(_sum2, _s45); + _sum3 = _mm512_add_epi32(_sum3, _s67); +#else + __m256i _pA1 = _mm256_permute4x64_epi64(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m256i _pB2 = _mm256_permute4x64_epi64(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i _pB3 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 1, 0, 3)); + + __m256i _s0 = _mm256_mullo_epi32(_pA0, _pB0); + __m256i _s1 = _mm256_mullo_epi32(_pA0, _pB1); + __m256i _s2 = _mm256_mullo_epi32(_pA0, _pB2); + __m256i _s3 = _mm256_mullo_epi32(_pA0, _pB3); + __m256i _s4 = _mm256_mullo_epi32(_pA1, _pB0); + __m256i _s5 = _mm256_mullo_epi32(_pA1, _pB1); + __m256i _s6 = _mm256_mullo_epi32(_pA1, _pB2); + __m256i _s7 = _mm256_mullo_epi32(_pA1, _pB3); + _sum0 = _mm256_add_epi32(_sum0, _s0); + _sum1 = _mm256_add_epi32(_sum1, _s1); + _sum2 = _mm256_add_epi32(_sum2, _s2); + _sum3 = _mm256_add_epi32(_sum3, _s3); + _sum4 = _mm256_add_epi32(_sum4, _s4); + _sum5 = _mm256_add_epi32(_sum5, _s5); + _sum6 = _mm256_add_epi32(_sum6, _s6); + _sum7 = _mm256_add_epi32(_sum7, _s7); +#endif // __AVX512F__ + + pA += 8; + pB += 8; + } + +#if __AVX512F__ + if (k_end) + { + // from + // 00 11 22 33 44 55 66 77 01 12 23 30 45 56 67 74 + // 02 13 20 31 46 57 64 75 03 10 21 32 47 54 65 76 + // 40 51 62 73 04 15 26 37 41 52 63 70 05 16 27 34 + // 42 53 60 71 06 17 24 35 43 50 61 72 07 14 25 36 + // to + // 00 10 20 30 44 54 64 74 04 14 24 34 40 50 60 70 + // 01 11 21 31 45 55 65 75 05 15 25 35 41 51 61 71 + // 02 12 22 32 46 56 66 76 06 16 26 36 42 52 62 72 + // 03 13 23 33 47 57 67 77 07 17 27 37 43 53 63 73 + { + __m512i _s0 = _mm512_shuffle_i32x4(_sum0, _sum2, _MM_SHUFFLE(0, 1, 1, 0)); + __m512i _s1 = _mm512_shuffle_i32x4(_sum1, _sum3, _MM_SHUFFLE(2, 3, 3, 2)); + __m512i _s2 = _mm512_shuffle_i32x4(_sum1, _sum3, _MM_SHUFFLE(0, 1, 1, 0)); + __m512i _s3 = _mm512_shuffle_i32x4(_sum0, 
_sum2, _MM_SHUFFLE(2, 3, 3, 2)); + _s1 = _mm512_shuffle_epi32(_s1, _MM_PERM_ADCB); + _s2 = _mm512_shuffle_epi32(_s2, _MM_PERM_BADC); + _s3 = _mm512_shuffle_epi32(_s3, _MM_PERM_CBAD); + __m512i _tmp0 = _mm512_unpacklo_epi32(_s0, _s1); + __m512i _tmp1 = _mm512_unpackhi_epi32(_s0, _s1); + __m512i _tmp2 = _mm512_unpacklo_epi32(_s2, _s3); + __m512i _tmp3 = _mm512_unpackhi_epi32(_s2, _s3); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + } + + __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sum1, _MM_SHUFFLE(3, 0, 3, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_sum2, _sum3, _MM_SHUFFLE(3, 0, 3, 0)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_sum0, _sum1, _MM_SHUFFLE(1, 2, 1, 2)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_sum2, _sum3, _MM_SHUFFLE(1, 2, 1, 2)); + _sum0 = _tmp0; + _sum1 = _tmp1; + _sum2 = _tmp2; + _sum3 = _tmp3; + } + + _mm512_storeu_si512((__m512i*)outptr, _sum0); + _mm512_storeu_si512((__m512i*)(outptr + 16), _sum1); + _mm512_storeu_si512((__m512i*)(outptr + 32), _sum2); + _mm512_storeu_si512((__m512i*)(outptr + 48), _sum3); + outptr += 64; +#else + if (k_end) + { + // from + // 00 11 22 33 44 55 66 77 + // 01 12 23 30 45 56 67 74 + // 02 13 20 31 46 57 64 75 + // 03 10 21 32 47 54 65 76 + // 40 51 62 73 04 15 26 37 + // 41 52 63 70 05 16 27 34 + // 42 53 60 71 06 17 24 35 + // 43 50 61 72 07 14 25 36 + // to + // 00 10 20 30 44 54 64 74 + // 01 11 21 31 45 55 65 75 + // 02 12 22 32 46 56 66 76 + // 03 13 23 33 47 57 67 77 + // 40 50 60 70 04 14 24 34 + // 41 51 61 71 05 15 25 35 + // 42 52 62 72 06 16 26 36 + // 43 53 63 73 07 17 27 37 + { + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum2 = _mm256_shuffle_epi32(_sum2, _MM_SHUFFLE(1, 0, 3, 2)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(0, 3, 2, 1)); + _sum5 = _mm256_shuffle_epi32(_sum5, _MM_SHUFFLE(2, 1, 0, 3)); + _sum6 = _mm256_shuffle_epi32(_sum6, _MM_SHUFFLE(1, 0, 3, 2)); + _sum7 = _mm256_shuffle_epi32(_sum7, _MM_SHUFFLE(0, 3, 2, 1)); + __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum3); + __m256i _tmp1 = _mm256_unpackhi_epi32(_sum0, _sum3); + __m256i _tmp2 = _mm256_unpacklo_epi32(_sum2, _sum1); + __m256i _tmp3 = _mm256_unpackhi_epi32(_sum2, _sum1); + __m256i _tmp4 = _mm256_unpacklo_epi32(_sum4, _sum7); + __m256i _tmp5 = _mm256_unpackhi_epi32(_sum4, _sum7); + __m256i _tmp6 = _mm256_unpacklo_epi32(_sum6, _sum5); + __m256i _tmp7 = _mm256_unpackhi_epi32(_sum6, _sum5); + _sum0 = _mm256_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm256_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm256_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm256_unpackhi_epi64(_tmp3, _tmp1); + _sum4 = _mm256_unpacklo_epi64(_tmp4, _tmp6); + _sum5 = _mm256_unpackhi_epi64(_tmp4, _tmp6); + _sum6 = _mm256_unpacklo_epi64(_tmp7, _tmp5); + _sum7 = _mm256_unpackhi_epi64(_tmp7, _tmp5); + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + _sum5 = _mm256_shuffle_epi32(_sum5, _MM_SHUFFLE(2, 1, 0, 3)); + _sum7 = _mm256_shuffle_epi32(_sum7, _MM_SHUFFLE(2, 1, 0, 3)); + } + + __m256i _tmp0 = _mm256_permute2x128_si256(_sum0, _sum4, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i _tmp1 = _mm256_permute2x128_si256(_sum1, _sum5, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i _tmp2 = _mm256_permute2x128_si256(_sum2, _sum6, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i _tmp3 = 
_mm256_permute2x128_si256(_sum3, _sum7, _MM_SHUFFLE(0, 2, 0, 0)); + __m256i _tmp4 = _mm256_permute2x128_si256(_sum4, _sum0, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i _tmp5 = _mm256_permute2x128_si256(_sum5, _sum1, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i _tmp6 = _mm256_permute2x128_si256(_sum6, _sum2, _MM_SHUFFLE(0, 3, 0, 1)); + __m256i _tmp7 = _mm256_permute2x128_si256(_sum7, _sum3, _MM_SHUFFLE(0, 3, 0, 1)); + _sum0 = _tmp0; + _sum1 = _tmp1; + _sum2 = _tmp2; + _sum3 = _tmp3; + _sum4 = _tmp4; + _sum5 = _tmp5; + _sum6 = _tmp6; + _sum7 = _tmp7; + } + + _mm256_store_si256((__m256i*)outptr, _sum0); + _mm256_store_si256((__m256i*)(outptr + 8), _sum1); + _mm256_store_si256((__m256i*)(outptr + 8 * 2), _sum2); + _mm256_store_si256((__m256i*)(outptr + 8 * 3), _sum3); + _mm256_store_si256((__m256i*)(outptr + 8 * 4), _sum4); + _mm256_store_si256((__m256i*)(outptr + 8 * 5), _sum5); + _mm256_store_si256((__m256i*)(outptr + 8 * 6), _sum6); + _mm256_store_si256((__m256i*)(outptr + 8 * 7), _sum7); + outptr += 8 * 8; +#endif // __AVX512F__ + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + const short* pA = pAT; + + __m256i _sum0; + __m256i _sum1; + __m256i _sum2; + __m256i _sum3; + + if (k == 0) + { + _sum0 = _mm256_setzero_si256(); + _sum1 = _mm256_setzero_si256(); + _sum2 = _mm256_setzero_si256(); + _sum3 = _mm256_setzero_si256(); + } + else + { + _sum0 = _mm256_load_si256((const __m256i*)outptr); + _sum1 = _mm256_load_si256((const __m256i*)(outptr + 8)); + _sum2 = _mm256_load_si256((const __m256i*)(outptr + 16)); + _sum3 = _mm256_load_si256((const __m256i*)(outptr + 24)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m256i _pA0 = _mm256_loadu_si256((const __m256i*)pA); + __m128i _pB = _mm_loadu_si128((const __m128i*)pB); + __m256i _pA1 = _mm256_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_pB), _pB, 1); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + +#if __AVXVNNI__ || __AVX512VNNI__ + _sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1); +#else + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); + _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); + _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); + _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); +#endif + + pA += 16; + pB += 8; + } + for (; kk < max_kk; kk++) + { + __m256i _pA0 = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i*)pA)); + __m256i _pB0 = _mm256_cvtepi16_epi32(_mm_castpd_si128(_mm_load1_pd((const double*)pB))); + __m256i _pA1 = _mm256_permute4x64_epi64(_pA0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + + __m256i _s0 = _mm256_mullo_epi32(_pA0, _pB0); + __m256i _s1 = _mm256_mullo_epi32(_pA0, _pB1); + __m256i _s2 = _mm256_mullo_epi32(_pA1, _pB0); + __m256i _s3 = _mm256_mullo_epi32(_pA1, _pB1); + _sum0 = _mm256_add_epi32(_sum0, _s0); + _sum1 = _mm256_add_epi32(_sum1, _s1); + _sum2 = _mm256_add_epi32(_sum2, _s2); + _sum3 = _mm256_add_epi32(_sum3, _s3); + + pA += 8; + pB += 4; + } + + if (k_end) + { + // from + // 00 11 22 33 40 51 62 73 + // 01 12 23 30 41 52 63 70 + // 20 31 02 13 60 71 42 53 + // 21 32 03 10 61 72 43 50 + // to + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + { + 
_sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum3); + __m256i _tmp1 = _mm256_unpackhi_epi32(_sum0, _sum3); + __m256i _tmp2 = _mm256_unpacklo_epi32(_sum2, _sum1); + __m256i _tmp3 = _mm256_unpackhi_epi32(_sum2, _sum1); + _sum0 = _mm256_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm256_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm256_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm256_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + + _mm256_store_si256((__m256i*)outptr, _sum0); + _mm256_store_si256((__m256i*)(outptr + 8), _sum1); + _mm256_store_si256((__m256i*)(outptr + 16), _sum2); + _mm256_store_si256((__m256i*)(outptr + 24), _sum3); + outptr += 32; + } + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + __m256i _sum0; + __m256i _sum1; + + if (k == 0) + { + _sum0 = _mm256_setzero_si256(); + _sum1 = _mm256_setzero_si256(); + } + else + { + _sum0 = _mm256_load_si256((const __m256i*)outptr); + _sum1 = _mm256_load_si256((const __m256i*)(outptr + 8)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m256i _pA = _mm256_loadu_si256((const __m256i*)pA); + __m256i _pB0 = _mm256_castpd_si256(_mm256_broadcast_sd((const double*)pB)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1)); + +#if __AVXVNNI__ || __AVX512VNNI__ + _sum0 = _mm256_dpwssd_epi32(_sum0, _pA, _pB0); + _sum1 = _mm256_dpwssd_epi32(_sum1, _pA, _pB1); +#else + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA, _pB0)); + _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA, _pB1)); +#endif + + pA += 16; + pB += 4; + } + for (; kk < max_kk; kk++) + { + __m256i _pA = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i*)pA)); + __m256i _pB0 = _mm256_cvtepi16_epi32(_mm_castps_si128(_mm_load1_ps((const float*)pB))); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 1, 0, 1)); + + __m256i _s0 = _mm256_mullo_epi32(_pA, _pB0); + __m256i _s1 = _mm256_mullo_epi32(_pA, _pB1); + _sum0 = _mm256_add_epi32(_sum0, _s0); + _sum1 = _mm256_add_epi32(_sum1, _s1); + + pA += 8; + pB += 2; + } + + if (k_end) + { + // from + // 00 11 20 31 40 51 60 71 + // 01 10 21 30 41 50 61 70 + // to + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + { + __m256i _tmp0 = _mm256_shuffle_epi32(_sum0, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i _tmp1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(0, 2, 3, 1)); + _sum0 = _mm256_unpacklo_epi32(_tmp0, _tmp1); + _sum1 = _mm256_unpackhi_epi32(_tmp0, _tmp1); + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + + _mm256_store_si256((__m256i*)outptr, _sum0); + _mm256_store_si256((__m256i*)(outptr + 8), _sum1); + outptr += 16; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + __m256i _sum0; + + if (k == 0) + { + _sum0 = _mm256_setzero_si256(); + } + else + { + _sum0 = _mm256_load_si256((const __m256i*)outptr); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m256i _pA = _mm256_loadu_si256((const __m256i*)pA); + __m256i _pB = _mm256_castps_si256(_mm256_broadcast_ss((const float*)pB)); + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA, _pB)); + + pA += 16; + pB += 2; + } + for (; kk < max_kk; kk++) + { + __m256i _pA = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i*)pA)); + __m256i _pB = _mm256_set1_epi32(pB[0]); + __m256i _s0 = _mm256_mullo_epi32(_pA, _pB); + _sum0 = 
_mm256_add_epi32(_sum0, _s0); + + pA += 8; + pB += 1; + } + + _mm256_store_si256((__m256i*)outptr, _sum0); + outptr += 8; + } + } + } +#endif // __AVX2__ + for (; ii + 3 < max_ii; ii += 4) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + __m512i _sum2; + __m512i _sum3; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + _sum2 = _mm512_setzero_si512(); + _sum3 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_loadu_si512((const __m512i*)outptr); + _sum1 = _mm512_loadu_si512((const __m512i*)(outptr + 16)); + _sum2 = _mm512_loadu_si512((const __m512i*)(outptr + 32)); + _sum3 = _mm512_loadu_si512((const __m512i*)(outptr + 48)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA = _mm_loadu_si128((const __m128i*)pA); + __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); + __m256i _pAA = _mm256_inserti128_si256(_mm256_castsi128_si256(_pA), _pA, 1); + __m512i _pA0 = _mm512_inserti32x8(_mm512_castsi256_si512(_pAA), _pAA, 1); + __m512i _pA1 = _mm512_shuffle_epi32(_pA0, _MM_PERM_BADC); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm512_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm512_dpwssd_epi32(_sum3, _pA1, _pB1); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA0, _pB1)); + _sum2 = _mm512_add_epi32(_sum2, _mm512_madd_epi16(_pA1, _pB0)); + _sum3 = _mm512_add_epi32(_sum3, _mm512_madd_epi16(_pA1, _pB1)); +#endif + + pA += 8; + pB += 32; + } + for (; kk < max_kk; kk++) + { + __m256i _pA = _mm256_castpd_si256(_mm256_broadcast_sd((const double*)pA)); + __m256i _pB = _mm256_loadu_si256((const __m256i*)pB); + + __m512i _pA0 = _mm512_cvtepi16_epi32(_pA); + __m512i _pB0 = _mm512_cvtepi16_epi32(_pB); + __m512i _pA1 = _mm512_permutex_epi64(_pA0, _MM_SHUFFLE(2, 3, 0, 1)); + __m512i _pB1 = _mm512_shuffle_epi32(_pB0, _MM_PERM_ADCB); + + __m512i _s0 = _mm512_mullo_epi32(_pA0, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA0, _pB1); + __m512i _s2 = _mm512_mullo_epi32(_pA1, _pB0); + __m512i _s3 = _mm512_mullo_epi32(_pA1, _pB1); + + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + _sum2 = _mm512_add_epi32(_sum2, _s2); + _sum3 = _mm512_add_epi32(_sum3, _s3); + + pA += 4; + pB += 16; + } + + if (k_end) + { + // from + // 00 11 22 33 04 15 26 37 08 19 2a 3b 0c 1d 2e 3f + // 01 12 23 30 05 16 27 34 09 1a 2b 38 0d 1e 2f 3c + // 20 31 02 13 24 35 06 17 28 3a 0a 1b 2c 3d 0e 1f + // 21 32 03 10 25 36 07 14 29 3a 0b 18 2d 3e 0f 1c + // to + // 00 10 20 30 04 14 24 34 08 18 28 38 0c 1c 2c 3c + // 01 11 21 31 05 15 25 35 09 19 29 39 0d 1d 2d 3d + // 02 12 22 32 06 16 26 36 0a 1a 2a 3a 0e 1e 2e 3e + // 03 13 23 33 07 17 27 37 0b 1b 2b 3b 0f 1f 2f 3f + { + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum3); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum3); + __m512i _tmp2 = _mm512_unpacklo_epi32(_sum2, _sum1); + __m512i _tmp3 = _mm512_unpackhi_epi32(_sum2, _sum1); + _sum0 = _mm512_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = 
_mm512_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm512_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm512_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm512_shuffle_epi32(_sum1, _MM_PERM_CBAD); + _sum3 = _mm512_shuffle_epi32(_sum3, _MM_PERM_CBAD); + } + + __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sum1, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp1 = _mm512_shuffle_i32x4(_sum2, _sum3, _MM_SHUFFLE(1, 0, 1, 0)); + __m512i _tmp2 = _mm512_shuffle_i32x4(_sum0, _sum1, _MM_SHUFFLE(3, 2, 3, 2)); + __m512i _tmp3 = _mm512_shuffle_i32x4(_sum2, _sum3, _MM_SHUFFLE(3, 2, 3, 2)); + _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _sum1 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); + _sum2 = _mm512_shuffle_i32x4(_tmp2, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); + _sum3 = _mm512_shuffle_i32x4(_tmp2, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); + } + + _mm512_storeu_si512((__m512i*)outptr, _sum0); + _mm512_storeu_si512((__m512i*)(outptr + 16), _sum1); + _mm512_storeu_si512((__m512i*)(outptr + 32), _sum2); + _mm512_storeu_si512((__m512i*)(outptr + 48), _sum3); + outptr += 64; + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + const short* pA = pAT; + +#if __AVX2__ + __m256i _sum0; + __m256i _sum1; + __m256i _sum2; + __m256i _sum3; +#else + __m128i _sum0; + __m128i _sum1; + __m128i _sum2; + __m128i _sum3; + __m128i _sum4; + __m128i _sum5; + __m128i _sum6; + __m128i _sum7; +#endif + + if (k == 0) + { +#if __AVX2__ + _sum0 = _mm256_setzero_si256(); + _sum1 = _mm256_setzero_si256(); + _sum2 = _mm256_setzero_si256(); + _sum3 = _mm256_setzero_si256(); +#else + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + _sum2 = _mm_setzero_si128(); + _sum3 = _mm_setzero_si128(); + _sum4 = _mm_setzero_si128(); + _sum5 = _mm_setzero_si128(); + _sum6 = _mm_setzero_si128(); + _sum7 = _mm_setzero_si128(); +#endif + } + else + { +#if __AVX2__ + _sum0 = _mm256_loadu_si256((const __m256i*)outptr); + _sum1 = _mm256_loadu_si256((const __m256i*)(outptr + 8)); + _sum2 = _mm256_loadu_si256((const __m256i*)(outptr + 16)); + _sum3 = _mm256_loadu_si256((const __m256i*)(outptr + 24)); +#else + _sum0 = _mm_load_si128((const __m128i*)outptr); + _sum1 = _mm_load_si128((const __m128i*)(outptr + 4)); + _sum2 = _mm_load_si128((const __m128i*)(outptr + 8)); + _sum3 = _mm_load_si128((const __m128i*)(outptr + 12)); + _sum4 = _mm_load_si128((const __m128i*)(outptr + 16)); + _sum5 = _mm_load_si128((const __m128i*)(outptr + 20)); + _sum6 = _mm_load_si128((const __m128i*)(outptr + 24)); + _sum7 = _mm_load_si128((const __m128i*)(outptr + 28)); +#endif + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { +#if __AVX2__ + __m128i _pA = _mm_loadu_si128((const __m128i*)pA); + __m256i _pB0 = _mm256_loadu_si256((const __m256i*)pB); + __m256i _pA0 = _mm256_inserti128_si256(_mm256_castsi128_si256(_pA), _pA, 1); + __m256i _pA1 = _mm256_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + +#if __AVXVNNI__ || __AVX512VNNI__ + _sum0 = _mm256_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm256_dpwssd_epi32(_sum1, _pA0, _pB1); + _sum2 = _mm256_dpwssd_epi32(_sum2, _pA1, _pB0); + _sum3 = _mm256_dpwssd_epi32(_sum3, _pA1, _pB1); +#else + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); + _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA0, _pB1)); + _sum2 = _mm256_add_epi32(_sum2, _mm256_madd_epi16(_pA1, _pB0)); + _sum3 = _mm256_add_epi32(_sum3, _mm256_madd_epi16(_pA1, _pB1)); +#endif +#else // __AVX2__ + __m128i _pA0 = _mm_loadu_si128((const 
__m128i*)pA); + __m128i _pB0 = _mm_loadu_si128((const __m128i*)pB); + __m128i _pB1 = _mm_loadu_si128((const __m128i*)(pB + 8)); + __m128i _pA1 = _mm_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i _pB2 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m128i _pB3 = _mm_shuffle_epi32(_pB1, _MM_SHUFFLE(0, 3, 2, 1)); + +#if __XOP__ + _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); + _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); + _sum2 = _mm_maddd_epi16(_pA0, _pB2, _sum2); + _sum3 = _mm_maddd_epi16(_pA0, _pB3, _sum3); + _sum4 = _mm_maddd_epi16(_pA1, _pB0, _sum4); + _sum5 = _mm_maddd_epi16(_pA1, _pB1, _sum5); + _sum6 = _mm_maddd_epi16(_pA1, _pB2, _sum6); + _sum7 = _mm_maddd_epi16(_pA1, _pB3, _sum7); +#else + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); + _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); + _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA0, _pB2)); + _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA0, _pB3)); + _sum4 = _mm_add_epi32(_sum4, _mm_madd_epi16(_pA1, _pB0)); + _sum5 = _mm_add_epi32(_sum5, _mm_madd_epi16(_pA1, _pB1)); + _sum6 = _mm_add_epi32(_sum6, _mm_madd_epi16(_pA1, _pB2)); + _sum7 = _mm_add_epi32(_sum7, _mm_madd_epi16(_pA1, _pB3)); +#endif +#endif // __AVX2__ + + pA += 8; + pB += 16; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_castpd_si128(_mm_load1_pd((const double*)pA)); + __m128i _pB = _mm_loadu_si128((const __m128i*)pB); + +#if __AVX2__ + __m256i _pA0 = _mm256_cvtepi16_epi32(_pA); + __m256i _pB0 = _mm256_cvtepi16_epi32(_pB); + __m256i _pA1 = _mm256_permute4x64_epi64(_pA0, _MM_SHUFFLE(2, 3, 0, 1)); + __m256i _pB1 = _mm256_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + + __m256i _s0 = _mm256_mullo_epi32(_pA0, _pB0); + __m256i _s1 = _mm256_mullo_epi32(_pA0, _pB1); + __m256i _s2 = _mm256_mullo_epi32(_pA1, _pB0); + __m256i _s3 = _mm256_mullo_epi32(_pA1, _pB1); + + _sum0 = _mm256_add_epi32(_sum0, _s0); + _sum1 = _mm256_add_epi32(_sum1, _s1); + _sum2 = _mm256_add_epi32(_sum2, _s2); + _sum3 = _mm256_add_epi32(_sum3, _s3); +#else // __AVX2__ +#if __XOP__ + __m128i _pA0 = _mm_unpacklo_epi16(_pA, _pA); + __m128i _pA1 = _mm_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i _pB0 = _mm_unpacklo_epi16(_pB, _pB); + __m128i _pB1 = _mm_unpackhi_epi16(_pB, _pB); + __m128i _pB2 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + __m128i _pB3 = _mm_shuffle_epi32(_pB1, _MM_SHUFFLE(0, 3, 2, 1)); + _sum0 = _mm_maccd_epi16(_pA0, _pB0, _sum0); + _sum1 = _mm_maccd_epi16(_pA0, _pB1, _sum1); + _sum2 = _mm_maccd_epi16(_pA0, _pB2, _sum2); + _sum3 = _mm_maccd_epi16(_pA0, _pB3, _sum3); + _sum4 = _mm_maccd_epi16(_pA1, _pB0, _sum4); + _sum5 = _mm_maccd_epi16(_pA1, _pB1, _sum5); + _sum6 = _mm_maccd_epi16(_pA1, _pB2, _sum6); + _sum7 = _mm_maccd_epi16(_pA1, _pB3, _sum7); +#else + __m128i _pA0 = _pA; + __m128i _pA1 = _mm_shuffle_epi32(_pA, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i _pB01 = _pB; + __m128i _pB23 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_pB, _MM_SHUFFLE(0, 3, 2, 1)), _MM_SHUFFLE(0, 3, 2, 1)); + __m128i _sl0 = _mm_mullo_epi16(_pA0, _pB01); + __m128i _sh0 = _mm_mulhi_epi16(_pA0, _pB01); + __m128i _sl1 = _mm_mullo_epi16(_pA0, _pB23); + __m128i _sh1 = _mm_mulhi_epi16(_pA0, _pB23); + __m128i _sl2 = _mm_mullo_epi16(_pA1, _pB01); + __m128i _sh2 = _mm_mulhi_epi16(_pA1, _pB01); + __m128i _sl3 = _mm_mullo_epi16(_pA1, _pB23); + __m128i _sh3 = _mm_mulhi_epi16(_pA1, _pB23); + __m128i _s0 = _mm_unpacklo_epi16(_sl0, _sh0); + __m128i _s1 = _mm_unpackhi_epi16(_sl0, _sh0); + __m128i _s2 = _mm_unpacklo_epi16(_sl1, _sh1); + __m128i _s3 = _mm_unpackhi_epi16(_sl1, 
_sh1); + __m128i _s4 = _mm_unpacklo_epi16(_sl2, _sh2); + __m128i _s5 = _mm_unpackhi_epi16(_sl2, _sh2); + __m128i _s6 = _mm_unpacklo_epi16(_sl3, _sh3); + __m128i _s7 = _mm_unpackhi_epi16(_sl3, _sh3); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); + _sum2 = _mm_add_epi32(_sum2, _s2); + _sum3 = _mm_add_epi32(_sum3, _s3); + _sum4 = _mm_add_epi32(_sum4, _s4); + _sum5 = _mm_add_epi32(_sum5, _s5); + _sum6 = _mm_add_epi32(_sum6, _s6); + _sum7 = _mm_add_epi32(_sum7, _s7); +#endif +#endif // __AVX2__ + + pA += 4; + pB += 8; + } + +#if __AVX2__ + if (k_end) + { + // from + // 00 11 22 33 04 15 26 37 + // 01 12 23 30 05 16 27 34 + // 20 31 02 13 24 35 06 17 + // 21 32 03 10 25 36 07 14 + // to + // 00 10 20 30 04 14 24 34 + // 01 11 21 31 05 15 25 35 + // 02 12 22 32 06 16 26 36 + // 03 13 23 33 07 17 27 37 + { + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum3); + __m256i _tmp1 = _mm256_unpackhi_epi32(_sum0, _sum3); + __m256i _tmp2 = _mm256_unpacklo_epi32(_sum2, _sum1); + __m256i _tmp3 = _mm256_unpackhi_epi32(_sum2, _sum1); + _sum0 = _mm256_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm256_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm256_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm256_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm256_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm256_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + _tmp0 = _mm256_permute2x128_si256(_sum0, _sum1, _MM_SHUFFLE(0, 2, 0, 0)); + _tmp1 = _mm256_permute2x128_si256(_sum2, _sum3, _MM_SHUFFLE(0, 2, 0, 0)); + _tmp2 = _mm256_permute2x128_si256(_sum0, _sum1, _MM_SHUFFLE(0, 3, 0, 1)); + _tmp3 = _mm256_permute2x128_si256(_sum2, _sum3, _MM_SHUFFLE(0, 3, 0, 1)); + _sum0 = _tmp0; + _sum1 = _tmp1; + _sum2 = _tmp2; + _sum3 = _tmp3; + } + } + + _mm256_storeu_si256((__m256i*)outptr, _sum0); + _mm256_storeu_si256((__m256i*)(outptr + 8), _sum1); + _mm256_storeu_si256((__m256i*)(outptr + 16), _sum2); + _mm256_storeu_si256((__m256i*)(outptr + 24), _sum3); + outptr += 32; +#else // __AVX2__ + if (k_end) + { + // from + // 00 11 22 33 04 15 26 37 + // 01 12 23 30 05 16 27 34 + // 20 31 02 13 24 35 06 17 + // 21 32 03 10 25 36 07 14 + // to + // 00 10 20 30 04 14 24 34 + // 01 11 21 31 05 15 25 35 + // 02 12 22 32 06 16 26 36 + // 03 13 23 33 07 17 27 37 + { + _sum2 = _mm_shuffle_epi32(_sum2, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + _sum6 = _mm_shuffle_epi32(_sum6, _MM_SHUFFLE(2, 1, 0, 3)); + _sum7 = _mm_shuffle_epi32(_sum7, _MM_SHUFFLE(2, 1, 0, 3)); + __m128i _tmp0 = _mm_unpacklo_epi32(_sum0, _sum6); + __m128i _tmp1 = _mm_unpackhi_epi32(_sum0, _sum6); + __m128i _tmp2 = _mm_unpacklo_epi32(_sum1, _sum7); + __m128i _tmp3 = _mm_unpackhi_epi32(_sum1, _sum7); + __m128i _tmp4 = _mm_unpacklo_epi32(_sum4, _sum2); + __m128i _tmp5 = _mm_unpackhi_epi32(_sum4, _sum2); + __m128i _tmp6 = _mm_unpacklo_epi32(_sum5, _sum3); + __m128i _tmp7 = _mm_unpackhi_epi32(_sum5, _sum3); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp4); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp4); + _sum2 = _mm_unpacklo_epi64(_tmp5, _tmp1); + _sum3 = _mm_unpackhi_epi64(_tmp5, _tmp1); + _sum4 = _mm_unpacklo_epi64(_tmp2, _tmp6); + _sum5 = _mm_unpackhi_epi64(_tmp2, _tmp6); + _sum6 = _mm_unpacklo_epi64(_tmp7, _tmp3); + _sum7 = _mm_unpackhi_epi64(_tmp7, _tmp3); + _sum1 = _mm_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + _sum5 = 
_mm_shuffle_epi32(_sum5, _MM_SHUFFLE(2, 1, 0, 3)); + _sum7 = _mm_shuffle_epi32(_sum7, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + + _mm_store_si128((__m128i*)outptr, _sum0); + _mm_store_si128((__m128i*)(outptr + 4), _sum1); + _mm_store_si128((__m128i*)(outptr + 8), _sum2); + _mm_store_si128((__m128i*)(outptr + 12), _sum3); + _mm_store_si128((__m128i*)(outptr + 16), _sum4); + _mm_store_si128((__m128i*)(outptr + 20), _sum5); + _mm_store_si128((__m128i*)(outptr + 24), _sum6); + _mm_store_si128((__m128i*)(outptr + 28), _sum7); + outptr += 32; +#endif // __AVX2__ + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + const short* pA = pAT; + + __m128i _sum0; + __m128i _sum1; + __m128i _sum2; + __m128i _sum3; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + _sum2 = _mm_setzero_si128(); + _sum3 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_load_si128((const __m128i*)outptr); + _sum1 = _mm_load_si128((const __m128i*)(outptr + 4)); + _sum2 = _mm_load_si128((const __m128i*)(outptr + 8)); + _sum3 = _mm_load_si128((const __m128i*)(outptr + 12)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA0 = _mm_loadu_si128((const __m128i*)pA); + __m128i _pB0 = _mm_loadu_si128((const __m128i*)pB); + __m128i _pA1 = _mm_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + +#if __XOP__ + _sum0 = _mm_maddd_epi16(_pA0, _pB0, _sum0); + _sum1 = _mm_maddd_epi16(_pA0, _pB1, _sum1); + _sum2 = _mm_maddd_epi16(_pA1, _pB0, _sum2); + _sum3 = _mm_maddd_epi16(_pA1, _pB1, _sum3); +#else + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); + _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); + _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA1, _pB0)); + _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA1, _pB1)); +#endif + + pA += 8; + pB += 8; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_castpd_si128(_mm_load1_pd((const double*)pA)); + __m128i _pB = _mm_castpd_si128(_mm_load1_pd((const double*)pB)); +#if __XOP__ + __m128i _pA0 = _mm_unpacklo_epi16(_pA, _pA); + __m128i _pA1 = _mm_shuffle_epi32(_pA0, _MM_SHUFFLE(1, 0, 3, 2)); + __m128i _pB0 = _mm_unpacklo_epi16(_pB, _pB); + __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(0, 3, 2, 1)); + _sum0 = _mm_maccd_epi16(_pA0, _pB0, _sum0); + _sum1 = _mm_maccd_epi16(_pA0, _pB1, _sum1); + _sum2 = _mm_maccd_epi16(_pA1, _pB0, _sum2); + _sum3 = _mm_maccd_epi16(_pA1, _pB1, _sum3); +#else + __m128i _pA0 = _pA; + __m128i _pA1 = _mm_shuffle_epi32(_pA, _MM_SHUFFLE(2, 3, 0, 1)); + __m128i _pB01 = _mm_shufflehi_epi16(_pB, _MM_SHUFFLE(0, 3, 2, 1)); + __m128i _sl0 = _mm_mullo_epi16(_pA0, _pB01); + __m128i _sh0 = _mm_mulhi_epi16(_pA0, _pB01); + __m128i _sl1 = _mm_mullo_epi16(_pA1, _pB01); + __m128i _sh1 = _mm_mulhi_epi16(_pA1, _pB01); + __m128i _s0 = _mm_unpacklo_epi16(_sl0, _sh0); + __m128i _s1 = _mm_unpackhi_epi16(_sl0, _sh0); + __m128i _s2 = _mm_unpacklo_epi16(_sl1, _sh1); + __m128i _s3 = _mm_unpackhi_epi16(_sl1, _sh1); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); + _sum2 = _mm_add_epi32(_sum2, _s2); + _sum3 = _mm_add_epi32(_sum3, _s3); +#endif + + pA += 4; + pB += 4; + } + + if (k_end) + { + // from + // 00 11 22 33 + // 01 12 23 30 + // 20 31 02 13 + // 21 32 03 10 + // to + // 00 10 20 30 + // 01 11 21 31 + // 02 12 22 32 + // 03 13 23 33 + { + _sum1 = _mm_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + __m128i _tmp0 = 
_mm_unpacklo_epi32(_sum0, _sum3); + __m128i _tmp1 = _mm_unpackhi_epi32(_sum0, _sum3); + __m128i _tmp2 = _mm_unpacklo_epi32(_sum2, _sum1); + __m128i _tmp3 = _mm_unpackhi_epi32(_sum2, _sum1); + _sum0 = _mm_unpacklo_epi64(_tmp0, _tmp2); + _sum1 = _mm_unpackhi_epi64(_tmp0, _tmp2); + _sum2 = _mm_unpacklo_epi64(_tmp3, _tmp1); + _sum3 = _mm_unpackhi_epi64(_tmp3, _tmp1); + _sum1 = _mm_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + _sum3 = _mm_shuffle_epi32(_sum3, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + + _mm_store_si128((__m128i*)outptr, _sum0); + _mm_store_si128((__m128i*)(outptr + 4), _sum1); + _mm_store_si128((__m128i*)(outptr + 8), _sum2); + _mm_store_si128((__m128i*)(outptr + 12), _sum3); + outptr += 16; + } + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + __m128i _sum0; + __m128i _sum1; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_load_si128((const __m128i*)outptr); + _sum1 = _mm_load_si128((const __m128i*)(outptr + 4)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA = _mm_loadu_si128((const __m128i*)pA); + __m128i _pB0 = _mm_castpd_si128(_mm_load1_pd((const double*)pB)); + __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + +#if __XOP__ + _sum0 = _mm_maddd_epi16(_pA, _pB0, _sum0); + _sum1 = _mm_maddd_epi16(_pA, _pB1, _sum1); +#else + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); + _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, _pB1)); +#endif + + pA += 8; + pB += 4; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_castpd_si128(_mm_load1_pd((const double*)pA)); + __m128i _pB = _mm_castps_si128(_mm_load1_ps((const float*)pB)); + +#if __XOP__ + _pA = _mm_unpacklo_epi16(_pA, _pA); + __m128i _pB0 = _mm_unpacklo_epi16(_pB, _pB); + __m128i _pB1 = _mm_shuffle_epi32(_pB0, _MM_SHUFFLE(2, 3, 0, 1)); + _sum0 = _mm_maccd_epi16(_pA, _pB0, _sum0); + _sum1 = _mm_maccd_epi16(_pA, _pB1, _sum1); +#else + __m128i _pB01 = _mm_shufflehi_epi16(_pB, _MM_SHUFFLE(0, 1, 0, 1)); + __m128i _sl = _mm_mullo_epi16(_pA, _pB01); + __m128i _sh = _mm_mulhi_epi16(_pA, _pB01); + __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh); + __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); +#endif + + pA += 4; + pB += 2; + } + + if (k_end) + { + // from + // 00 11 20 31 + // 01 10 21 30 + // to + // 00 10 20 30 + // 01 11 21 31 + { + __m128i _tmp0 = _mm_shuffle_epi32(_sum0, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i _tmp1 = _mm_shuffle_epi32(_sum1, _MM_SHUFFLE(0, 2, 3, 1)); + _sum0 = _mm_unpacklo_epi32(_tmp0, _tmp1); + _sum1 = _mm_unpackhi_epi32(_tmp0, _tmp1); + _sum1 = _mm_shuffle_epi32(_sum1, _MM_SHUFFLE(2, 1, 0, 3)); + } + } + + _mm_store_si128((__m128i*)outptr, _sum0); + _mm_store_si128((__m128i*)(outptr + 4), _sum1); + outptr += 8; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + __m128i _sum0; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_load_si128((const __m128i*)outptr); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA = _mm_loadu_si128((const __m128i*)pA); + __m128i _pB = _mm_castps_si128(_mm_load1_ps((const float*)pB)); + +#if __XOP__ + _sum0 = _mm_maddd_epi16(_pA, _pB, _sum0); +#else + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB)); +#endif + + pA += 8; + pB += 2; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_loadl_epi64((const __m128i*)pA); + __m128i _pB = _mm_set1_epi16(pB[0]); + +#if __XOP__ + _pA = _mm_unpacklo_epi16(_pA, _pA); + _sum0 = 
_mm_maccd_epi16(_pA, _pB, _sum0); +#else + __m128i _sl = _mm_mullo_epi16(_pA, _pB); + __m128i _sh = _mm_mulhi_epi16(_pA, _pB); + __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh); + _sum0 = _mm_add_epi32(_sum0, _s0); +#endif + + pA += 4; + pB += 1; + } + + _mm_store_si128((__m128i*)outptr, _sum0); + outptr += 4; + } + } + } +#endif // __SSE2__ + for (; ii + 1 < max_ii; ii += 2) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; +#if __SSE2__ +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + const short* pA = pAT; + + __m512i _sum0; + __m512i _sum1; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + _sum1 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_loadu_si512((const __m512i*)outptr); + _sum1 = _mm512_loadu_si512((const __m512i*)(outptr + 16)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA0 = _mm512_set1_epi32(((const int*)pA)[0]); + __m512i _pA1 = _mm512_set1_epi32(((const int*)pA)[1]); + __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); + _sum1 = _mm512_dpwssd_epi32(_sum1, _pA1, _pB0); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); + _sum1 = _mm512_add_epi32(_sum1, _mm512_madd_epi16(_pA1, _pB0)); +#endif // __AVX512VNNI__ + + pA += 4; + pB += 32; + } + for (; kk < max_kk; kk++) + { + __m512i _pA0 = _mm512_set1_epi32(pA[0]); + __m512i _pA1 = _mm512_set1_epi32(pA[1]); + __m256i _pB = _mm256_loadu_si256((const __m256i*)pB); + __m512i _pB0 = _mm512_cvtepi16_epi32(_pB); + + __m512i _s0 = _mm512_mullo_epi32(_pA0, _pB0); + __m512i _s1 = _mm512_mullo_epi32(_pA1, _pB0); + _sum0 = _mm512_add_epi32(_sum0, _s0); + _sum1 = _mm512_add_epi32(_sum1, _s1); + + pA += 2; + pB += 16; + } + + if (k_end) + { + __m512i _tmp0 = _mm512_unpacklo_epi32(_sum0, _sum1); + __m512i _tmp1 = _mm512_unpackhi_epi32(_sum0, _sum1); + _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(1, 0, 1, 0)); + _sum1 = _mm512_shuffle_i32x4(_tmp0, _tmp1, _MM_SHUFFLE(3, 2, 3, 2)); + _sum0 = _mm512_shuffle_i32x4(_sum0, _sum0, _MM_SHUFFLE(3, 1, 2, 0)); + _sum1 = _mm512_shuffle_i32x4(_sum1, _sum1, _MM_SHUFFLE(3, 1, 2, 0)); + } + + _mm512_storeu_si512((__m512i*)outptr, _sum0); + _mm512_storeu_si512((__m512i*)(outptr + 16), _sum1); + outptr += 32; + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + const short* pA = pAT; + +#if __AVX2__ + __m256i _sum0; + __m256i _sum1; +#else + __m128i _sum0; + __m128i _sum1; + __m128i _sum2; + __m128i _sum3; +#endif + + if (k == 0) + { +#if __AVX2__ + _sum0 = _mm256_setzero_si256(); + _sum1 = _mm256_setzero_si256(); +#else + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + _sum2 = _mm_setzero_si128(); + _sum3 = _mm_setzero_si128(); +#endif + } + else + { +#if __AVX2__ + _sum0 = _mm256_loadu_si256((const __m256i*)outptr); + _sum1 = _mm256_loadu_si256((const __m256i*)(outptr + 8)); +#else + _sum0 = _mm_loadu_si128((const __m128i*)outptr); + _sum1 = _mm_loadu_si128((const __m128i*)(outptr + 4)); + _sum2 = _mm_loadu_si128((const __m128i*)(outptr + 8)); + _sum3 = _mm_loadu_si128((const __m128i*)(outptr + 12)); +#endif + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { +#if __AVX2__ + __m256i _pA0 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)pA)); + __m256i _pA1 = _mm256_castps_si256(_mm256_broadcast_ss((const float*)(pA + 2))); + __m256i _pB0 = _mm256_loadu_si256((const 
__m256i*)pB); + + // vs2019 internal compiler error with avx512 vnni intrinsics here + // fallback to avx2 madd anyway as a workaround --- nihui + _sum0 = _mm256_add_epi32(_sum0, _mm256_madd_epi16(_pA0, _pB0)); + _sum1 = _mm256_add_epi32(_sum1, _mm256_madd_epi16(_pA1, _pB0)); +#else // __AVX2__ + __m128i _pA0 = _mm_castps_si128(_mm_load1_ps((const float*)pA)); + __m128i _pA1 = _mm_castps_si128(_mm_load1_ps((const float*)(pA + 2))); + __m128i _pB0 = _mm_loadu_si128((const __m128i*)pB); + __m128i _pB1 = _mm_loadu_si128((const __m128i*)(pB + 8)); + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB0)); + _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA0, _pB1)); + _sum2 = _mm_add_epi32(_sum2, _mm_madd_epi16(_pA1, _pB0)); + _sum3 = _mm_add_epi32(_sum3, _mm_madd_epi16(_pA1, _pB1)); +#endif // __AVX2__ + + pA += 4; + pB += 16; + } + for (; kk < max_kk; kk++) + { + __m128i _pB = _mm_load_si128((const __m128i*)pB); +#if __AVX2__ + __m256i _pA0 = _mm256_set1_epi32(pA[0]); + __m256i _pA1 = _mm256_set1_epi32(pA[1]); + __m256i _pB0 = _mm256_cvtepi16_epi32(_pB); + + __m256i _s0 = _mm256_mullo_epi32(_pA0, _pB0); + __m256i _s1 = _mm256_mullo_epi32(_pA1, _pB0); + _sum0 = _mm256_add_epi32(_sum0, _s0); + _sum1 = _mm256_add_epi32(_sum1, _s1); +#else // __AVX2__ + __m128i _pA0 = _mm_set1_epi16(pA[0]); + __m128i _pA1 = _mm_set1_epi16(pA[1]); + + __m128i _sl0 = _mm_mullo_epi16(_pA0, _pB); + __m128i _sh0 = _mm_mulhi_epi16(_pA0, _pB); + __m128i _sl1 = _mm_mullo_epi16(_pA1, _pB); + __m128i _sh1 = _mm_mulhi_epi16(_pA1, _pB); + __m128i _s0 = _mm_unpacklo_epi16(_sl0, _sh0); + __m128i _s1 = _mm_unpackhi_epi16(_sl0, _sh0); + __m128i _s2 = _mm_unpacklo_epi16(_sl1, _sh1); + __m128i _s3 = _mm_unpackhi_epi16(_sl1, _sh1); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); + _sum2 = _mm_add_epi32(_sum2, _s2); + _sum3 = _mm_add_epi32(_sum3, _s3); +#endif // __AVX2__ + pA += 2; + pB += 8; + } + +#if __AVX2__ + if (k_end) + { + __m256i _tmp0 = _mm256_unpacklo_epi32(_sum0, _sum1); + __m256i _tmp1 = _mm256_unpackhi_epi32(_sum0, _sum1); + _sum0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _sum1 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + } + + _mm256_storeu_si256((__m256i*)outptr, _sum0); + _mm256_storeu_si256((__m256i*)(outptr + 8), _sum1); + outptr += 16; +#else // __AVX2__ + if (k_end) + { + __m128i _tmp0 = _mm_unpacklo_epi32(_sum0, _sum2); + __m128i _tmp1 = _mm_unpackhi_epi32(_sum0, _sum2); + __m128i _tmp2 = _mm_unpacklo_epi32(_sum1, _sum3); + __m128i _tmp3 = _mm_unpackhi_epi32(_sum1, _sum3); + _sum0 = _tmp0; + _sum1 = _tmp1; + _sum2 = _tmp2; + _sum3 = _tmp3; + } + + _mm_storeu_si128((__m128i*)outptr, _sum0); + _mm_storeu_si128((__m128i*)(outptr + 4), _sum1); + _mm_storeu_si128((__m128i*)(outptr + 8), _sum2); + _mm_storeu_si128((__m128i*)(outptr + 12), _sum3); + outptr += 16; +#endif // __AVX2__ + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + const short* pA = pAT; + + __m128i _sum0; + __m128i _sum1; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_loadu_si128((const __m128i*)outptr); + _sum1 = _mm_loadu_si128((const __m128i*)(outptr + 4)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA0 = _mm_castps_si128(_mm_load1_ps((const float*)pA)); + __m128i _pA1 = _mm_castps_si128(_mm_load1_ps((const float*)(pA + 2))); + __m128i _pB = _mm_loadu_si128((const __m128i*)pB); + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA0, _pB)); 
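// Rough scalar sketch of the madd accumulation in this kk-by-2 loop (illustrative
// only; sum0/sum1 below are hypothetical stand-ins for the four lanes of _sum0/_sum1).
// _mm_load1_ps broadcasts one 32-bit value, i.e. two adjacent int16 elements of pA,
// into every lane, and _mm_madd_epi16 multiplies adjacent int16 pairs and sums each
// pair into a 32-bit lane, so every accumulator lane folds two k-steps per iteration.
// Assuming pA packs two output rows x two k-steps and pB packs four columns x two
// k-steps (as the surrounding pack routines suggest), the update is roughly:
//
//   for (int j = 0; j < 4; j++)
//   {
//       sum0[j] += pA[0] * pB[j * 2 + 0] + pA[1] * pB[j * 2 + 1];
//       sum1[j] += pA[2] * pB[j * 2 + 0] + pA[3] * pB[j * 2 + 1];
//   }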
+ _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA1, _pB)); + + pA += 4; + pB += 8; + } + for (; kk < max_kk; kk++) + { + __m128i _pA0 = _mm_set1_epi16(pA[0]); + __m128i _pA1 = _mm_set1_epi16(pA[1]); + __m128i _pB = _mm_loadl_epi64((const __m128i*)pB); + __m128i _sl0 = _mm_mullo_epi16(_pA0, _pB); + __m128i _sh0 = _mm_mulhi_epi16(_pA0, _pB); + __m128i _sl1 = _mm_mullo_epi16(_pA1, _pB); + __m128i _sh1 = _mm_mulhi_epi16(_pA1, _pB); + __m128i _s0 = _mm_unpacklo_epi16(_sl0, _sh0); + __m128i _s1 = _mm_unpacklo_epi16(_sl1, _sh1); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); + pA += 2; + pB += 4; + } + + if (k_end) + { + __m128i _tmp0 = _mm_unpacklo_epi32(_sum0, _sum1); + __m128i _tmp1 = _mm_unpackhi_epi32(_sum0, _sum1); + _sum0 = _tmp0; + _sum1 = _tmp1; + } + + _mm_storeu_si128((__m128i*)outptr, _sum0); + _mm_storeu_si128((__m128i*)(outptr + 4), _sum1); + outptr += 2 * 4; + } +#endif // __SSE2__ + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum00 = 0; + int sum01 = 0; + int sum10 = 0; + int sum11 = 0; + + if (k == 0) + { + sum00 = 0; + sum01 = 0; + sum10 = 0; + sum11 = 0; + } + else + { + sum00 = outptr[0]; + sum01 = outptr[1]; + sum10 = outptr[2]; + sum11 = outptr[3]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum00 += pA[0] * pB[0]; + sum00 += pA[1] * pB[1]; + sum01 += pA[2] * pB[0]; + sum01 += pA[3] * pB[1]; + sum10 += pA[0] * pB[2]; + sum10 += pA[1] * pB[3]; + sum11 += pA[2] * pB[2]; + sum11 += pA[3] * pB[3]; + + pA += 4; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum00 += pA[0] * pB[0]; + sum01 += pA[1] * pB[0]; + sum10 += pA[0] * pB[1]; + sum11 += pA[1] * pB[1]; + pA += 2; + pB += 2; + } + + outptr[0] = sum00; + outptr[1] = sum01; + outptr[2] = sum10; + outptr[3] = sum11; + outptr += 2 * 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[2] * pB[0]; + sum1 += pA[3] * pB[1]; + pA += 4; + pB += 2; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[1] * pB[0]; + pA += 2; + pB += 1; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + } + } + for (; ii < max_ii; ii++) + { + for (int b = 0; b < batch; b++) + { + const short* pAT = AT_tile.row(b) + max_kk * ii; + const short* pB = BT_tile.row(b); + + int jj = 0; +#if __SSE2__ +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + const short* pA = pAT; + + __m512i _sum0; + + if (k == 0) + { + _sum0 = _mm512_setzero_si512(); + } + else + { + _sum0 = _mm512_loadu_si512((const __m512i*)outptr); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m512i _pA0 = _mm512_set1_epi32(((const int*)pA)[0]); + __m512i _pB0 = _mm512_loadu_si512((const __m512i*)pB); + +#if __AVX512VNNI__ + _sum0 = _mm512_dpwssd_epi32(_sum0, _pA0, _pB0); +#else + _sum0 = _mm512_add_epi32(_sum0, _mm512_madd_epi16(_pA0, _pB0)); +#endif + + pA += 2; + pB += 32; + } + for (; kk < max_kk; kk++) + { + __m512i _pA = _mm512_set1_epi32(pA[0]); + __m512i _pB0 = _mm512_cvtepi16_epi32(_mm256_loadu_si256((const __m256i*)pB)); + + __m512i _s0 = _mm512_mullo_epi32(_pA, _pB0); + _sum0 = _mm512_add_epi32(_sum0, _s0); + + pA += 1; + pB += 16; + } + + _mm512_storeu_si512((__m512i*)outptr, _sum0); + outptr += 16; + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; 
jj += 8) + { + const short* pA = pAT; + + __m128i _sum0; + __m128i _sum1; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + _sum1 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_loadu_si128((const __m128i*)outptr); + _sum1 = _mm_loadu_si128((const __m128i*)(outptr + 4)); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA = _mm_castps_si128(_mm_load1_ps((const float*)pA)); + __m128i _pB0 = _mm_loadu_si128((const __m128i*)pB); + __m128i _pB1 = _mm_loadu_si128((const __m128i*)(pB + 8)); + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB0)); + _sum1 = _mm_add_epi32(_sum1, _mm_madd_epi16(_pA, _pB1)); + + pA += 2; + pB += 16; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_set1_epi16(pA[0]); + __m128i _pB = _mm_load_si128((const __m128i*)pB); + __m128i _sl = _mm_mullo_epi16(_pA, _pB); + __m128i _sh = _mm_mulhi_epi16(_pA, _pB); + __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh); + __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh); + _sum0 = _mm_add_epi32(_sum0, _s0); + _sum1 = _mm_add_epi32(_sum1, _s1); + pA += 1; + pB += 8; + } + + _mm_storeu_si128((__m128i*)outptr, _sum0); + _mm_storeu_si128((__m128i*)(outptr + 4), _sum1); + outptr += 8; + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + const short* pA = pAT; + + __m128i _sum0; + + if (k == 0) + { + _sum0 = _mm_setzero_si128(); + } + else + { + _sum0 = _mm_loadu_si128((const __m128i*)outptr); + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + __m128i _pA = _mm_castps_si128(_mm_load1_ps((const float*)pA)); + __m128i _pB = _mm_loadu_si128((const __m128i*)pB); + _sum0 = _mm_add_epi32(_sum0, _mm_madd_epi16(_pA, _pB)); + + pA += 2; + pB += 8; + } + for (; kk < max_kk; kk++) + { + __m128i _pA = _mm_set1_epi16(pA[0]); + __m128i _pB = _mm_loadl_epi64((const __m128i*)pB); + __m128i _sl = _mm_mullo_epi16(_pA, _pB); + __m128i _sh = _mm_mulhi_epi16(_pA, _pB); + __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh); + _sum0 = _mm_add_epi32(_sum0, _s0); + pA += 1; + pB += 4; + } + + _mm_storeu_si128((__m128i*)outptr, _sum0); + outptr += 4; + } +#endif // __SSE2__ + for (; jj + 1 < max_jj; jj += 2) + { + const short* pA = pAT; + + int sum0 = 0; + int sum1 = 0; + + if (k == 0) + { + sum0 = 0; + sum1 = 0; + } + else + { + sum0 = outptr[0]; + sum1 = outptr[1]; + } + + int kk = 0; + for (; kk + 1 < max_kk; kk += 2) + { + sum0 += pA[0] * pB[0]; + sum0 += pA[1] * pB[1]; + sum1 += pA[0] * pB[2]; + sum1 += pA[1] * pB[3]; + pA += 2; + pB += 4; + } + for (; kk < max_kk; kk++) + { + sum0 += pA[0] * pB[0]; + sum1 += pA[0] * pB[1]; + pA += 1; + pB += 2; + } + + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + for (; jj < max_jj; jj++) + { + const short* pA = pAT; + + int sum = 0; + + if (k == 0) + { + sum = 0; + } + else + { + sum = outptr[0]; + } + + int kk = 0; + for (; kk < max_kk; kk++) + { + sum += pA[0] * pB[0]; + pA += 1; + pB += 1; + } + + outptr[0] = sum; + outptr += 1; + } + } + } +} + +static void get_optimal_tile_mnk_int8(int M, int N, int K, int& TILE_M, int& TILE_N, int& TILE_K, int nT) +{ + // resolve optimal tile size from cache size + const size_t l2_cache_size_int8 = (int)(get_cpu_level2_cache_size() / sizeof(short)); + + if (nT == 0) + nT = get_physical_big_cpu_count(); + + // solve M + { + int tile_size = (int)sqrt((float)l2_cache_size_int8 / 3); + +#if __AVX512F__ + TILE_M = std::max(16, tile_size / 16 * 16); +#elif __AVX2__ + TILE_M = std::max(8, tile_size / 8 * 8); +#elif __SSE2__ + TILE_M = std::max(4, tile_size / 4 * 4); +#else + TILE_M = std::max(2, tile_size / 2 * 
2); +#endif + + TILE_M *= std::min(nT, get_physical_cpu_count()); + + int nn_M = (M + TILE_M - 1) / TILE_M; +#if __AVX512F__ + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 15) / 16 * 16); +#elif __AVX2__ + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 7) / 8 * 8); +#elif __SSE2__ + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 3) / 4 * 4); +#else + TILE_M = std::min(TILE_M, ((M + nn_M - 1) / nn_M + 1) / 2 * 2); +#endif + + if (nT > 1) + { +#if __AVX512F__ + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 15) / 16 * 16); +#elif __AVX2__ + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 7) / 8 * 8); +#elif __SSE2__ + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 3) / 4 * 4); +#else + TILE_M = std::min(TILE_M, (std::max(1, TILE_M / nT) + 1) / 2 * 2); +#endif + } + } + + // solve K + { + int tile_size = (int)(sqrt((float)l2_cache_size_int8) - TILE_M); + +#if __AVX512F__ + TILE_K = std::max(16, tile_size / 16 * 16); +#elif __AVX2__ + TILE_K = std::max(8, tile_size / 8 * 8); +#elif __SSE2__ + TILE_K = std::max(4, tile_size / 4 * 4); +#else + TILE_K = std::max(2, tile_size / 2 * 2); +#endif + + int nn_K = (K + TILE_K - 1) / TILE_K; +#if __AVX512F__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 15) / 16 * 16); +#elif __AVX2__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 7) / 8 * 8); +#elif __SSE2__ + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 3) / 4 * 4); +#else + TILE_K = std::min(TILE_K, ((K + nn_K - 1) / nn_K + 1) / 2 * 2); +#endif + } + + if (N > 0) + { + int tile_size = (int)((l2_cache_size_int8 - TILE_M * TILE_K) / (TILE_M * 2 + TILE_K)); + +#if __SSE2__ + TILE_N = std::max(4, tile_size / 4 * 4); +#else + TILE_N = std::max(1, tile_size); +#endif + + int nn_N = (N + TILE_N - 1) / TILE_N; +#if __SSE2__ + TILE_N = std::min(TILE_N, ((N + nn_N - 1) / nn_N + 3) / 4 * 4); +#else + TILE_N = std::min(TILE_N, (N + nn_N - 1) / nn_N); +#endif + } +} + +static inline void conv3x3s1_winograd23_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const signed char ktm[4][3] = { + // {2, 0, 0}, + // {1, 1, 1}, + // {1, -1, 1}, + // {0, 0, 2} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[4][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 2; + tmp[1][m] = r0 + r1 + r2; + tmp[2][m] = r0 - r1 + r2; + tmp[3][m] = r2 * 2; + + k0 += 3; + } + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 2; + short z1 = r0 + r1 + r2; + short z2 = r0 - r1 + r2; + short z3 = r2 * 2; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp += 4; + } + } + } +} + +static void conv3x3s1_winograd23_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ +#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + conv3x3s1_winograd23_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt); + return; + } +#endif +#endif + + const int M = outch; + const int K = inch; + const int B = 16; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M 
- 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 2u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 2u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd23_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd23_transform_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const signed char itm[4][4] = { + // {1, 0, -1, 0}, + // {0, 1, 1, 0}, + // {0, -1, 1, 0}, + // {0, -1, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w - 1) / 2; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; +#if __SSE2__ +#if __AVX512F__ + nn_max_kk = max_kk / 16; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = ppkk * 16; + +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + short tmp[4][4][16]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 2) + (tj * 2) * elempack; + + for (int m = 0; m < 4; m++) + { + __m256i _r0 = _mm256_setzero_si256(); + __m256i _r1 = _mm256_setzero_si256(); + __m256i _r2 = _mm256_setzero_si256(); + __m256i _r3 = _mm256_setzero_si256(); + + if (ti * 2 + m < h) + { + if (elempack == 16) + { + _r0 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)r0)); + if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 16))); + if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 32))); + if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 48))); + } + if (elempack == 8) + { + const signed char* r1 = r0 + N; + + _r0 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)r0), _mm_loadl_epi64((const __m128i*)r1))); + if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 8)), _mm_loadl_epi64((const __m128i*)(r1 + 8)))); + if (tj * 2 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 16)), _mm_loadl_epi64((const __m128i*)(r1 + 16)))); + if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 24)), _mm_loadl_epi64((const __m128i*)(r1 + 24)))); + } + if (elempack == 1) + { + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N)); + _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char)))); + if (tj * 2 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char)))); + if (tj * 2 + 2 < w) _r2 = 
_mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 2), sizeof(signed char)))); + if (tj * 2 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char)))); + } + } + + __m256i _tmp0 = _mm256_sub_epi16(_r0, _r2); + __m256i _tmp1 = _mm256_add_epi16(_r1, _r2); + __m256i _tmp2 = _mm256_sub_epi16(_r2, _r1); + __m256i _tmp3 = _mm256_sub_epi16(_r3, _r1); + + _mm256_store_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_store_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_store_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_store_si256((__m256i*)tmp[3][m], _tmp3); + + r0 += w * elempack; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj * 16; + short* p1 = p0 + max_jj * 16; + short* p2 = p0 + max_jj * 16 * 2; + short* p3 = p0 + max_jj * 16 * 3; + + for (int m = 0; m < 4; m++) + { + __m256i _r0 = _mm256_load_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_load_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_load_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_load_si256((const __m256i*)tmp[m][3]); + + __m256i _tmp0 = _mm256_sub_epi16(_r0, _r2); + __m256i _tmp1 = _mm256_add_epi16(_r1, _r2); + __m256i _tmp2 = _mm256_sub_epi16(_r2, _r1); + __m256i _tmp3 = _mm256_sub_epi16(_r3, _r1); + + _mm256_store_si256((__m256i*)p0, _tmp0); + _mm256_store_si256((__m256i*)p1, _tmp1); + _mm256_store_si256((__m256i*)p2, _tmp2); + _mm256_store_si256((__m256i*)p3, _tmp3); + + p0 += max_jj * 4 * 16; + p1 += max_jj * 4 * 16; + p2 += max_jj * 4 * 16; + p3 += max_jj * 4 * 16; + } + } + } + remain_max_kk_start += nn_max_kk * 16; + nn_max_kk = (max_kk - remain_max_kk_start) / 8; +#else // __AVX512F__ + nn_max_kk = (max_kk - remain_max_kk_start) / 8; + #pragma omp parallel for num_threads(nT) +#endif // __AVX512F__ + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 8; + +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + short tmp[4][4][8]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 2) + (tj * 2) * elempack; + + for (int m = 0; m < 4; m++) + { + __m128i _r0 = _mm_setzero_si128(); + __m128i _r1 = _mm_setzero_si128(); + __m128i _r2 = _mm_setzero_si128(); + __m128i _r3 = _mm_setzero_si128(); + + if (ti * 2 + m < h) + { + if (elempack == 8) + { + _r0 = _mm_loadl_epi64((const __m128i*)r0); + _r0 = _mm_unpacklo_epi8(_r0, _mm_cmpgt_epi8(_mm_setzero_si128(), _r0)); + if (tj * 2 + 1 < w) + { + _r1 = _mm_loadl_epi64((const __m128i*)(r0 + 8)); + _r1 = _mm_unpacklo_epi8(_r1, _mm_cmpgt_epi8(_mm_setzero_si128(), _r1)); + } + if (tj * 2 + 2 < w) + { + _r2 = _mm_loadl_epi64((const __m128i*)(r0 + 16)); + _r2 = _mm_unpacklo_epi8(_r2, _mm_cmpgt_epi8(_mm_setzero_si128(), _r2)); + } + if (tj * 2 + 3 < w) + { + _r3 = _mm_loadl_epi64((const __m128i*)(r0 + 24)); + _r3 = _mm_unpacklo_epi8(_r3, _mm_cmpgt_epi8(_mm_setzero_si128(), _r3)); + } + } + if (elempack == 1) + { +#if __AVX2__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N)); +#if __AVX512F__ + _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)))); + if (tj * 2 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)))); + if (tj * 2 + 2 
< w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)))); + if (tj * 2 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)))); +#else + __m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1); + __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88); + _r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1))); + if (tj * 2 + 1 < w) + { + __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88); + _r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1))); + } + if (tj * 2 + 2 < w) + { + __m256i _val2_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88); + _r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1))); + } + if (tj * 2 + 3 < w) + { + __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88); + _r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1))); + } +#endif // __AVX512F__ +#else // __AVX2__ + const signed char* r1 = r0 + N; + const signed char* r2 = r0 + N * 2; + const signed char* r3 = r0 + N * 3; + const signed char* r4 = r0 + N * 4; + const signed char* r5 = r0 + N * 5; + const signed char* r6 = r0 + N * 6; + const signed char* r7 = r0 + N * 7; + + __m128i _t0 = _mm_loadl_epi64((const __m128i*)r0); + __m128i _t1 = _mm_loadl_epi64((const __m128i*)r1); + __m128i _t2 = _mm_loadl_epi64((const __m128i*)r2); + __m128i _t3 = _mm_loadl_epi64((const __m128i*)r3); + __m128i _t4 = _mm_loadl_epi64((const __m128i*)r4); + __m128i _t5 = _mm_loadl_epi64((const __m128i*)r5); + __m128i _t6 = _mm_loadl_epi64((const __m128i*)r6); + __m128i _t7 = _mm_loadl_epi64((const __m128i*)r7); + + __m128i _t01 = _mm_unpacklo_epi8(_t0, _t1); + __m128i _t23 = _mm_unpacklo_epi8(_t2, _t3); + __m128i _t45 = _mm_unpacklo_epi8(_t4, _t5); + __m128i _t67 = _mm_unpacklo_epi8(_t6, _t7); + _t0 = _mm_unpacklo_epi16(_t01, _t23); + _t1 = _mm_unpacklo_epi16(_t45, _t67); + _t2 = _mm_unpacklo_epi32(_t0, _t1); + _t3 = _mm_unpackhi_epi32(_t0, _t1); + + __m128i _extt2 = _mm_cmpgt_epi8(_mm_setzero_si128(), _t2); + __m128i _extt3 = _mm_cmpgt_epi8(_mm_setzero_si128(), _t3); + + _r0 = _mm_unpacklo_epi8(_t2, _extt2); + if (tj * 2 + 1 < w) _r1 = _mm_unpackhi_epi8(_t2, _extt2); + if (tj * 2 + 2 < w) _r2 = _mm_unpacklo_epi8(_t3, _extt3); + if (tj * 2 + 3 < w) _r3 = _mm_unpackhi_epi8(_t3, _extt3); +#endif // __AVX2__ + } + } + + __m128i _tmp0 = _mm_sub_epi16(_r0, _r2); + __m128i _tmp1 = _mm_add_epi16(_r1, _r2); + __m128i _tmp2 = _mm_sub_epi16(_r2, _r1); + __m128i _tmp3 = _mm_sub_epi16(_r3, _r1); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + // old gcc breaks stack variable alignement + // ref https://gcc.gnu.org/bugzilla/show_bug.cgi?id=16660 + _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0); + _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1); + _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2); + _mm_storeu_si128((__m128i*)tmp[3][m], 
_tmp3); +#else + _mm_store_si128((__m128i*)tmp[0][m], _tmp0); + _mm_store_si128((__m128i*)tmp[1][m], _tmp1); + _mm_store_si128((__m128i*)tmp[2][m], _tmp2); + _mm_store_si128((__m128i*)tmp[3][m], _tmp3); +#endif + + r0 += w * elempack; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj * 8; + short* p1 = p0 + max_jj * 8; + short* p2 = p0 + max_jj * 8 * 2; + short* p3 = p0 + max_jj * 8 * 3; + + for (int m = 0; m < 4; m++) + { +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m128i _r0 = _mm_loadu_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_loadu_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_loadu_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_loadu_si128((const __m128i*)tmp[m][3]); +#else + __m128i _r0 = _mm_load_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_load_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_load_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_load_si128((const __m128i*)tmp[m][3]); +#endif + + __m128i _tmp0 = _mm_sub_epi16(_r0, _r2); + __m128i _tmp1 = _mm_add_epi16(_r1, _r2); + __m128i _tmp2 = _mm_sub_epi16(_r2, _r1); + __m128i _tmp3 = _mm_sub_epi16(_r3, _r1); + + _mm_store_si128((__m128i*)p0, _tmp0); + _mm_store_si128((__m128i*)p1, _tmp1); + _mm_store_si128((__m128i*)p2, _tmp2); + _mm_store_si128((__m128i*)p3, _tmp3); + + p0 += max_jj * 4 * 8; + p1 += max_jj * 4 * 8; + p2 += max_jj * 4 * 8; + p3 += max_jj * 4 * 8; + } + } + } + remain_max_kk_start += nn_max_kk * 8; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; +#else // __SSE2__ + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) +#endif // __SSE2__ + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[4][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 2 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 2 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 2 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + } + } + + tmp[0][m][0] = r00 - r20; + tmp[0][m][1] = r01 - r21; + tmp[1][m][0] = r10 + r20; + tmp[1][m][1] = r11 + r21; + tmp[2][m][0] = r20 - r10; + tmp[2][m][1] = r21 - r11; + tmp[3][m][0] = r30 - r10; + tmp[3][m][1] = r31 - r11; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + + p0[0] = r00 - r20; + p0[1] = r01 - r21; + p1[0] = r10 + r20; + p1[1] = r11 + r21; + p2[0] = r20 - r10; + p2[1] = r21 - r11; + p3[0] = r30 - r10; + p3[1] = r31 - r11; + + p0 += max_jj * 4 * 2; + p1 += max_jj * 4 * 2; + p2 += max_jj * 4 * 2; + p3 += max_jj * 4 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short 
tmp[4][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 4; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + + if (ti * 2 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 2 + 1 < w) r1 = r0123[1]; + if (tj * 2 + 2 < w) r2 = r0123[2]; + if (tj * 2 + 3 < w) r3 = r0123[3]; + } + } + + tmp[0][m] = r0 - r2; + tmp[1][m] = r1 + r2; + tmp[2][m] = r2 - r1; + tmp[3][m] = r3 - r1; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 16 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + + p0[0] = r0 - r2; + p1[0] = r1 + r2; + p2[0] = r2 - r1; + p3[0] = r3 - r1; + + p0 += max_jj * 4; + p1 += max_jj * 4; + p2 += max_jj * 4; + p3 += max_jj * 4; + } + } + } +} + +static inline void conv3x3s1_winograd23_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[2][4] = { + // {1, 1, 1, 0}, + // {0, 1, -1, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 1) / 2; + + int ii = 0; +#if __SSE2__ +#if __AVX2__ +#if __AVX512F__ + for (; ii + 15 < max_ii; ii += 16) + { +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + int tmp[2][4][16]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 16; + const int* r1 = r0 + max_jj * 16; + const int* r2 = r0 + max_jj * 16 * 2; + const int* r3 = r0 + max_jj * 16 * 3; + + for (int m = 0; m < 4; m++) + { + __m512i _r0 = _mm512_load_si512((const __m512i*)r0); + __m512i _r1 = _mm512_load_si512((const __m512i*)r1); + __m512i _r2 = _mm512_load_si512((const __m512i*)r2); + __m512i _r3 = _mm512_load_si512((const __m512i*)r3); + + __m512i _tmp0 = _mm512_add_epi32(_mm512_add_epi32(_r0, _r1), _r2); + __m512i _tmp1 = _mm512_add_epi32(_mm512_sub_epi32(_r1, _r2), _r3); + + _mm512_store_si512((__m512i*)tmp[0][m], _tmp0); + _mm512_store_si512((__m512i*)tmp[1][m], _tmp1); + + r0 += max_jj * 4 * 16; + r1 += max_jj * 4 * 16; + r2 += max_jj * 4 * 16; + r3 += max_jj * 4 * 16; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 2) + (tj * 2) * out_elempack; + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + __m512i _r0 = _mm512_load_si512((const __m512i*)tmp[m][0]); + __m512i _r1 = _mm512_load_si512((const __m512i*)tmp[m][1]); + __m512i _r2 = _mm512_load_si512((const __m512i*)tmp[m][2]); + __m512i _r3 = _mm512_load_si512((const __m512i*)tmp[m][3]); + + __m512i _tmp0 = _mm512_add_epi32(_mm512_add_epi32(_r0, _r1), _r2); + __m512i _tmp1 = _mm512_add_epi32(_mm512_sub_epi32(_r1, _r2), _r3); + + _tmp0 = _mm512_srai_epi32(_tmp0, 2); + _tmp1 = _mm512_srai_epi32(_tmp1, 2); + + if (out_elempack == 16) + { + _mm512_store_si512((__m512i*)outptr0, _tmp0); + if (tj * 2 + 1 < outw) + { + _mm512_store_si512((__m512i*)(outptr0 + 16), _tmp1); + } + } + if (out_elempack == 8) + { + int* outptr1 = outptr0 + N; + + _mm256_store_si256((__m256i*)outptr0, 
_mm512_extracti32x8_epi32(_tmp0, 0)); + _mm256_store_si256((__m256i*)outptr1, _mm512_extracti32x8_epi32(_tmp0, 1)); + if (tj * 2 + 1 < outw) + { + _mm256_store_si256((__m256i*)(outptr0 + 8), _mm512_extracti32x8_epi32(_tmp1, 0)); + _mm256_store_si256((__m256i*)(outptr1 + 8), _mm512_extracti32x8_epi32(_tmp1, 1)); + } + } + if (out_elempack == 4) + { + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + + _mm_store_si128((__m128i*)outptr0, _mm512_extracti32x4_epi32(_tmp0, 0)); + _mm_store_si128((__m128i*)outptr1, _mm512_extracti32x4_epi32(_tmp0, 1)); + _mm_store_si128((__m128i*)outptr2, _mm512_extracti32x4_epi32(_tmp0, 2)); + _mm_store_si128((__m128i*)outptr3, _mm512_extracti32x4_epi32(_tmp0, 3)); + if (tj * 2 + 1 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 4), _mm512_extracti32x4_epi32(_tmp1, 0)); + _mm_store_si128((__m128i*)(outptr1 + 4), _mm512_extracti32x4_epi32(_tmp1, 1)); + _mm_store_si128((__m128i*)(outptr2 + 4), _mm512_extracti32x4_epi32(_tmp1, 2)); + _mm_store_si128((__m128i*)(outptr3 + 4), _mm512_extracti32x4_epi32(_tmp1, 3)); + } + } + if (out_elempack == 1) + { + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N)); + _mm512_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 2 + 1 < outw) _mm512_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __AVX512F__ + for (; ii + 7 < max_ii; ii += 8) + { +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + int tmp[2][4][8]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 8; + const int* r1 = r0 + max_jj * 8; + const int* r2 = r0 + max_jj * 8 * 2; + const int* r3 = r0 + max_jj * 8 * 3; + + for (int m = 0; m < 4; m++) + { + __m256i _r0 = _mm256_load_si256((const __m256i*)r0); + __m256i _r1 = _mm256_load_si256((const __m256i*)r1); + __m256i _r2 = _mm256_load_si256((const __m256i*)r2); + __m256i _r3 = _mm256_load_si256((const __m256i*)r3); + + __m256i _tmp0 = _mm256_add_epi32(_mm256_add_epi32(_r0, _r1), _r2); + __m256i _tmp1 = _mm256_add_epi32(_mm256_sub_epi32(_r1, _r2), _r3); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm256_storeu_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_storeu_si256((__m256i*)tmp[1][m], _tmp1); +#else + _mm256_store_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_store_si256((__m256i*)tmp[1][m], _tmp1); +#endif + + r0 += max_jj * 4 * 8; + r1 += max_jj * 4 * 8; + r2 += max_jj * 4 * 8; + r3 += max_jj * 4 * 8; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 2) + (tj * 2) * out_elempack; + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m256i _r0 = _mm256_loadu_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_loadu_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_loadu_si256((const __m256i*)tmp[m][3]); +#else + __m256i _r0 = _mm256_load_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_load_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_load_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_load_si256((const __m256i*)tmp[m][3]); +#endif + + __m256i _tmp0 = 
_mm256_add_epi32(_mm256_add_epi32(_r0, _r1), _r2); + __m256i _tmp1 = _mm256_add_epi32(_mm256_sub_epi32(_r1, _r2), _r3); + + _tmp0 = _mm256_srai_epi32(_tmp0, 2); + _tmp1 = _mm256_srai_epi32(_tmp1, 2); + + if (out_elempack == 8) + { + _mm256_store_si256((__m256i*)outptr0, _tmp0); + if (tj * 2 + 1 < outw) + { + _mm256_store_si256((__m256i*)(outptr0 + 8), _tmp1); + } + } + if (out_elempack == 4) + { + int* outptr1 = outptr0 + N; + + _mm_store_si128((__m128i*)outptr0, _mm256_extracti128_si256(_tmp0, 0)); + _mm_store_si128((__m128i*)outptr1, _mm256_extracti128_si256(_tmp0, 1)); + if (tj * 2 + 1 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 4), _mm256_extracti128_si256(_tmp1, 0)); + _mm_store_si128((__m128i*)(outptr1 + 4), _mm256_extracti128_si256(_tmp1, 1)); + } + } + if (out_elempack == 1) + { +#if __AVX512F__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N)); + _mm256_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 2 + 1 < outw) _mm256_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); +#else + int tmp0[8]; + int tmp1[8]; + _mm256_storeu_si256((__m256i*)tmp0, _tmp0); + _mm256_storeu_si256((__m256i*)tmp1, _tmp1); + + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + int* outptr4 = outptr0 + N * 4; + int* outptr5 = outptr0 + N * 5; + int* outptr6 = outptr0 + N * 6; + int* outptr7 = outptr0 + N * 7; + + outptr0[0] = tmp0[0]; + outptr1[0] = tmp0[1]; + outptr2[0] = tmp0[2]; + outptr3[0] = tmp0[3]; + outptr4[0] = tmp0[4]; + outptr5[0] = tmp0[5]; + outptr6[0] = tmp0[6]; + outptr7[0] = tmp0[7]; + + if (tj * 2 + 1 < outw) + { + outptr0[1] = tmp1[0]; + outptr1[1] = tmp1[1]; + outptr2[1] = tmp1[2]; + outptr3[1] = tmp1[3]; + outptr4[1] = tmp1[4]; + outptr5[1] = tmp1[5]; + outptr6[1] = tmp1[6]; + outptr7[1] = tmp1[7]; + } +#endif // __AVX512F__ + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __AVX2__ + for (; ii + 3 < max_ii; ii += 4) + { +#ifdef _MSC_VER + __declspec(align(16)) +#else + __attribute__((aligned(16))) +#endif + int tmp[2][4][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 4; + const int* r1 = r0 + max_jj * 4; + const int* r2 = r0 + max_jj * 4 * 2; + const int* r3 = r0 + max_jj * 4 * 3; + + for (int m = 0; m < 4; m++) + { + __m128i _r0 = _mm_load_si128((const __m128i*)r0); + __m128i _r1 = _mm_load_si128((const __m128i*)r1); + __m128i _r2 = _mm_load_si128((const __m128i*)r2); + __m128i _r3 = _mm_load_si128((const __m128i*)r3); + + __m128i _tmp0 = _mm_add_epi32(_mm_add_epi32(_r0, _r1), _r2); + __m128i _tmp1 = _mm_add_epi32(_mm_sub_epi32(_r1, _r2), _r3); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0); + _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1); +#else + _mm_store_si128((__m128i*)tmp[0][m], _tmp0); + _mm_store_si128((__m128i*)tmp[1][m], _tmp1); +#endif + + r0 += max_jj * 4 * 4; + r1 += max_jj * 4 * 4; + r2 += max_jj * 4 * 4; + r3 += max_jj * 4 * 4; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 2) + (tj * 2) * out_elempack; + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m128i _r0 = _mm_loadu_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_loadu_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = 
_mm_loadu_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_loadu_si128((const __m128i*)tmp[m][3]); +#else + __m128i _r0 = _mm_load_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_load_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_load_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_load_si128((const __m128i*)tmp[m][3]); +#endif + + __m128i _tmp0 = _mm_add_epi32(_mm_add_epi32(_r0, _r1), _r2); + __m128i _tmp1 = _mm_add_epi32(_mm_sub_epi32(_r1, _r2), _r3); + + _tmp0 = _mm_srai_epi32(_tmp0, 2); + _tmp1 = _mm_srai_epi32(_tmp1, 2); + + if (out_elempack == 4) + { + _mm_store_si128((__m128i*)outptr0, _tmp0); + if (tj * 2 + 1 < outw) _mm_store_si128((__m128i*)(outptr0 + 4), _tmp1); + } + if (out_elempack == 1) + { +#if __AVX512F__ + __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3); + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(N)); + _mm_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 2 + 1 < outw) _mm_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); +#else + int tmp0[4]; + int tmp1[4]; + _mm_storeu_si128((__m128i*)tmp0, _tmp0); + _mm_storeu_si128((__m128i*)tmp1, _tmp1); + + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + + outptr0[0] = tmp0[0]; + outptr1[0] = tmp0[1]; + outptr2[0] = tmp0[2]; + outptr3[0] = tmp0[3]; + + if (tj * 2 + 1 < outw) + { + outptr0[1] = tmp1[0]; + outptr1[1] = tmp1[1]; + outptr2[1] = tmp1[2]; + outptr3[1] = tmp1[3]; + } +#endif // __AVX512F__ + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __SSE2__ + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[2][4][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m][0] = r0[0] + r1[0] + r2[0]; + tmp[0][m][1] = r0[1] + r1[1] + r2[1]; + tmp[1][m][0] = r1[0] - r2[0] + r3[0]; + tmp[1][m][1] = r1[1] - r2[1] + r3[1]; + + r0 += max_jj * 4 * 2; + r1 += max_jj * 4 * 2; + r2 += max_jj * 4 * 2; + r3 += max_jj * 4 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + continue; + + int tmp00 = tmp[m][0][0] + tmp[m][1][0] + tmp[m][2][0]; + int tmp01 = tmp[m][0][1] + tmp[m][1][1] + tmp[m][2][1]; + int tmp10 = tmp[m][1][0] - tmp[m][2][0] + tmp[m][3][0]; + int tmp11 = tmp[m][1][1] - tmp[m][2][1] + tmp[m][3][1]; + + tmp00 = tmp00 >> 2; + tmp01 = tmp01 >> 2; + tmp10 = tmp10 >> 2; + tmp11 = tmp11 >> 2; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 2 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[2][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 16 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; + const int* r3 = r0 + max_jj * 3; + + for (int m = 0; m < 4; m++) + { + tmp[0][m] = r0[0] + r1[0] + r2[0]; + tmp[1][m] = r1[0] - r2[0] + r3[0]; + + r0 += max_jj * 4; + r1 += max_jj * 4; + r2 += max_jj * 4; + r3 += max_jj * 4; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 2) + (tj * 2); + + for (int m = 0; m < 2; m++) + { + if (ti * 2 + m >= outh) + 
continue; + + int tmp0 = tmp[m][0] + tmp[m][1] + tmp[m][2]; + int tmp1 = tmp[m][1] - tmp[m][2] + tmp[m][3]; + + tmp0 = tmp0 >> 2; + tmp1 = tmp1 >> 2; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 2 + 1 < outw) outptr0[1] = tmp1; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd23_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ +#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) +#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ + if (ncnn::cpu_support_x86_avx512_vnni()) + { + conv3x3s1_winograd23_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ + if (ncnn::cpu_support_x86_avx_vnni()) + { + conv3x3s1_winograd23_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + conv3x3s1_winograd23_int8_avx2(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ + if (ncnn::cpu_support_x86_xop()) + { + conv3x3s1_winograd23_int8_xop(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif +#endif + + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 2n+2, winograd F(2,3) + int w_tiles = (outw + 1) / 2; + int h_tiles = (outh + 1) / 2; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 16; + + // NCNN_LOGE("conv3x3s1_winograd23_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 2u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 2u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 2u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd23_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 
4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // transform output + conv3x3s1_winograd23_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} + +static inline void conv3x3s1_winograd43_transform_kernel_tile_int8(const Mat& kernel, Mat& A, int inch, int i, int max_ii, int k, int max_kk) +{ + // const short ktm[6][3] = { + // {6, 0, 0}, + // {-4, -4, -4}, + // {-4, 4, -4}, + // {1, 2, 4}, + // {1, -2, 4}, + // {0, 0, 6} + // }; + + short* ptmp = A; + + int ii = 0; + for (; ii < max_ii; ii++) + { + int kk = 0; + for (; kk < max_kk; kk++) + { + short tmp[6][3]; + + const signed char* k0 = (const signed char*)kernel + (i + ii) * inch * 9 + (k + kk) * 9; + + for (int m = 0; m < 3; m++) + { + signed char r0 = k0[0]; + signed char r1 = k0[1]; + signed char r2 = k0[2]; + + tmp[0][m] = r0 * 6; + tmp[1][m] = -r0 * 4 - r1 * 4 - r2 * 4; + tmp[2][m] = -r0 * 4 + r1 * 4 - r2 * 4; + tmp[3][m] = r0 + r1 * 2 + r2 * 4; + tmp[4][m] = r0 - r1 * 2 + r2 * 4; + tmp[5][m] = r2 * 6; + + k0 += 3; + } + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + + short z0 = r0 * 6; + short z1 = -r0 * 4 - r1 * 4 - r2 * 4; + short z2 = -r0 * 4 + r1 * 4 - r2 * 4; + short z3 = r0 + r1 * 2 + r2 * 4; + short z4 = r0 - r1 * 2 + r2 * 4; + short z5 = r2 * 6; + + ptmp[0] = z0; + ptmp[1] = z1; + ptmp[2] = z2; + ptmp[3] = z3; + ptmp[4] = z4; + ptmp[5] = z5; + ptmp += 6; + } + } + } +} + +static void conv3x3s1_winograd43_transform_kernel_int8(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) +{ +#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + conv3x3s1_winograd43_transform_kernel_int8_avx2(kernel, AT, inch, outch, opt); + return; + } +#endif +#endif + + const int M = outch; + const int K = inch; + const int B = 36; + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, 0, K, TILE_M, TILE_N, TILE_K, opt.num_threads); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + + Mat A_tileX(B * TILE_M * TILE_K, 1, opt.num_threads, 4u, (Allocator*)0); + + AT.create(TILE_K * TILE_M, B, (K + TILE_K - 1) / TILE_K, (M + TILE_M - 1) / TILE_M, 4u, (Allocator*)0); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat A_tile = A_tileX.channel(get_omp_thread_num()); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_ii = std::min((M - i), TILE_M); + const int max_kk = std::min((K - k), TILE_K); + + conv3x3s1_winograd43_transform_kernel_tile_int8(kernel, A_tile, inch, i, max_ii, k, max_kk); + + Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + pack_A_tile_int8(A_tile, AT_tile, B, max_ii, max_kk); + } + } +} + +static inline void conv3x3s1_winograd43_transform_input_tile_int8(const 
Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int nT) +{ + // const float itm[4][4] = { + // {4, 0, -5, 0, 1, 0}, + // {0, -4, -4, 1, 1, 0}, + // {0, 4, -4, -1, 1, 0}, + // {0, -2, -1, 2, 1, 0}, + // {0, 2, -1, -2, 1, 0}, + // {0, 4, 0, -5, 0, 1} + // }; + + const int w = bottom_blob.w; + const int h = bottom_blob.h; + const int elempack = bottom_blob.elempack; + const int N = bottom_blob.cstep * elempack; + + const int w_tiles = (w + 1) / 4; + + int nn_max_kk = 0; + int remain_max_kk_start = 0; +#if __SSE2__ +#if __AVX512F__ + nn_max_kk = max_kk / 16; + #pragma omp parallel for num_threads(nT) + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = ppkk * 16; + +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + short tmp[6][6][16]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 4) + (tj * 4) * elempack; + + __m256i _v2 = _mm256_set1_epi16(2); + __m256i _v4 = _mm256_set1_epi16(4); + __m256i _v5 = _mm256_set1_epi16(5); + + for (int m = 0; m < 6; m++) + { + __m256i _r0 = _mm256_setzero_si256(); + __m256i _r1 = _mm256_setzero_si256(); + __m256i _r2 = _mm256_setzero_si256(); + __m256i _r3 = _mm256_setzero_si256(); + __m256i _r4 = _mm256_setzero_si256(); + __m256i _r5 = _mm256_setzero_si256(); + + if (ti * 4 + m < h) + { + if (elempack == 16) + { + _r0 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)r0)); + if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 16))); + if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 32))); + if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 48))); + if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 64))); + if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*)(r0 + 80))); + } + if (elempack == 8) + { + const signed char* r1 = r0 + N; + + _r0 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)r0), _mm_loadl_epi64((const __m128i*)r1))); + if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 8)), _mm_loadl_epi64((const __m128i*)(r1 + 8)))); + if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 16)), _mm_loadl_epi64((const __m128i*)(r1 + 16)))); + if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 24)), _mm_loadl_epi64((const __m128i*)(r1 + 24)))); + if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 32)), _mm_loadl_epi64((const __m128i*)(r1 + 32)))); + if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i*)(r0 + 40)), _mm_loadl_epi64((const __m128i*)(r1 + 40)))); + } + if (elempack == 1) + { + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N)); + _r0 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)r0, sizeof(signed char)))); + if (tj * 4 + 1 < w) _r1 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 1), sizeof(signed char)))); + if (tj * 4 + 2 < w) _r2 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 
2), sizeof(signed char)))); + if (tj * 4 + 3 < w) _r3 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 3), sizeof(signed char)))); + if (tj * 4 + 4 < w) _r4 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 4), sizeof(signed char)))); + if (tj * 4 + 5 < w) _r5 = _mm256_cvtepi8_epi16(_mm512_cvtepi32_epi8(_mm512_i32gather_epi32(_vindex, (const int*)(r0 + 5), sizeof(signed char)))); + } + } + + __m256i _tmp12a = _mm256_sub_epi16(_r3, _mm256_mullo_epi16(_r1, _v4)); + __m256i _tmp12b = _mm256_sub_epi16(_r4, _mm256_mullo_epi16(_r2, _v4)); + __m256i _tmp34a = _mm256_mullo_epi16(_mm256_sub_epi16(_r3, _r1), _v2); + __m256i _tmp34b = _mm256_sub_epi16(_r4, _r2); + + __m256i _tmp0 = _mm256_add_epi16(_r4, _mm256_sub_epi16(_mm256_mullo_epi16(_r0, _v4), _mm256_mullo_epi16(_r2, _v5))); + __m256i _tmp1 = _mm256_add_epi16(_tmp12b, _tmp12a); + __m256i _tmp2 = _mm256_sub_epi16(_tmp12b, _tmp12a); + __m256i _tmp3 = _mm256_add_epi16(_tmp34b, _tmp34a); + __m256i _tmp4 = _mm256_sub_epi16(_tmp34b, _tmp34a); + __m256i _tmp5 = _mm256_add_epi16(_r5, _mm256_sub_epi16(_mm256_mullo_epi16(_r1, _v4), _mm256_mullo_epi16(_r3, _v5))); + + _mm256_store_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_store_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_store_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_store_si256((__m256i*)tmp[3][m], _tmp3); + _mm256_store_si256((__m256i*)tmp[4][m], _tmp4); + _mm256_store_si256((__m256i*)tmp[5][m], _tmp5); + + r0 += w * elempack; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj * 16; + short* p1 = p0 + max_jj * 16; + short* p2 = p0 + max_jj * 16 * 2; + short* p3 = p0 + max_jj * 16 * 3; + short* p4 = p0 + max_jj * 16 * 4; + short* p5 = p0 + max_jj * 16 * 5; + + for (int m = 0; m < 6; m++) + { + __m256i _r0 = _mm256_load_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_load_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_load_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_load_si256((const __m256i*)tmp[m][3]); + __m256i _r4 = _mm256_load_si256((const __m256i*)tmp[m][4]); + __m256i _r5 = _mm256_load_si256((const __m256i*)tmp[m][5]); + + __m256i _tmp12a = _mm256_sub_epi16(_r3, _mm256_mullo_epi16(_r1, _v4)); + __m256i _tmp12b = _mm256_sub_epi16(_r4, _mm256_mullo_epi16(_r2, _v4)); + __m256i _tmp34a = _mm256_mullo_epi16(_mm256_sub_epi16(_r3, _r1), _v2); + __m256i _tmp34b = _mm256_sub_epi16(_r4, _r2); + + __m256i _tmp0 = _mm256_add_epi16(_r4, _mm256_sub_epi16(_mm256_mullo_epi16(_r0, _v4), _mm256_mullo_epi16(_r2, _v5))); + __m256i _tmp1 = _mm256_add_epi16(_tmp12b, _tmp12a); + __m256i _tmp2 = _mm256_sub_epi16(_tmp12b, _tmp12a); + __m256i _tmp3 = _mm256_add_epi16(_tmp34b, _tmp34a); + __m256i _tmp4 = _mm256_sub_epi16(_tmp34b, _tmp34a); + __m256i _tmp5 = _mm256_add_epi16(_r5, _mm256_sub_epi16(_mm256_mullo_epi16(_r1, _v4), _mm256_mullo_epi16(_r3, _v5))); + + _mm256_store_si256((__m256i*)p0, _tmp0); + _mm256_store_si256((__m256i*)p1, _tmp1); + _mm256_store_si256((__m256i*)p2, _tmp2); + _mm256_store_si256((__m256i*)p3, _tmp3); + _mm256_store_si256((__m256i*)p4, _tmp4); + _mm256_store_si256((__m256i*)p5, _tmp5); + + p0 += max_jj * 6 * 16; + p1 += max_jj * 6 * 16; + p2 += max_jj * 6 * 16; + p3 += max_jj * 6 * 16; + p4 += max_jj * 6 * 16; + p5 += max_jj * 6 * 16; + } + } + } + remain_max_kk_start += nn_max_kk * 16; + nn_max_kk = (max_kk - remain_max_kk_start) / 8; +#else // __AVX512F__ + nn_max_kk = (max_kk - remain_max_kk_start) / 8; + #pragma omp parallel for num_threads(nT) +#endif // __AVX512F__ + 
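The AVX-512 path above and the SSE2, two-channel and single-channel paths below all apply the same F(4,3) input transform; only the register width changes. As a reading aid, here is a minimal scalar sketch of one 6-point row pass, using the same `tmp12a`/`tmp34a` factoring as the SIMD code (the function and array names are illustrative, not part of the patch):

```cpp
// Scalar sketch of the B^T row transform that every lane-width path of
// conv3x3s1_winograd43_transform_input_tile_int8 implements (illustrative only).
static inline void winograd43_input_row_sketch(const short r[6], short z[6])
{
    short tmp12a = r[3] - r[1] * 4;   // shared by output rows 1 and 2
    short tmp12b = r[4] - r[2] * 4;
    short tmp34a = (r[3] - r[1]) * 2; // shared by output rows 3 and 4
    short tmp34b = r[4] - r[2];

    z[0] = r[4] + r[0] * 4 - r[2] * 5; // row {4, 0, -5, 0, 1, 0} of B^T
    z[1] = tmp12b + tmp12a;            // row {0, -4, -4, 1, 1, 0}
    z[2] = tmp12b - tmp12a;            // row {0, 4, -4, -1, 1, 0}
    z[3] = tmp34b + tmp34a;            // row {0, -2, -1, 2, 1, 0}
    z[4] = tmp34b - tmp34a;            // row {0, 2, -1, -2, 1, 0}
    z[5] = r[5] + r[1] * 4 - r[3] * 5; // row {0, 4, 0, -5, 0, 1}
}
```

The pass runs twice per tile, first over the six source rows into the staging `tmp` array and then over its columns into the packed B layout, which is why the same arithmetic appears twice in each branch.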
for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 8; + +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + short tmp[6][6][8]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel((k + kk) / elempack).row(ti * 4) + (tj * 4) * elempack; + + __m128i _v2 = _mm_set1_epi16(2); + __m128i _v4 = _mm_set1_epi16(4); + __m128i _v5 = _mm_set1_epi16(5); + + for (int m = 0; m < 6; m++) + { + __m128i _r0 = _mm_setzero_si128(); + __m128i _r1 = _mm_setzero_si128(); + __m128i _r2 = _mm_setzero_si128(); + __m128i _r3 = _mm_setzero_si128(); + __m128i _r4 = _mm_setzero_si128(); + __m128i _r5 = _mm_setzero_si128(); + + if (ti * 4 + m < h) + { + if (elempack == 8) + { + _r0 = _mm_loadl_epi64((const __m128i*)r0); + _r0 = _mm_unpacklo_epi8(_r0, _mm_cmpgt_epi8(_mm_setzero_si128(), _r0)); + if (tj * 4 + 1 < w) + { + _r1 = _mm_loadl_epi64((const __m128i*)(r0 + 8)); + _r1 = _mm_unpacklo_epi8(_r1, _mm_cmpgt_epi8(_mm_setzero_si128(), _r1)); + } + if (tj * 4 + 2 < w) + { + _r2 = _mm_loadl_epi64((const __m128i*)(r0 + 16)); + _r2 = _mm_unpacklo_epi8(_r2, _mm_cmpgt_epi8(_mm_setzero_si128(), _r2)); + } + if (tj * 4 + 3 < w) + { + _r3 = _mm_loadl_epi64((const __m128i*)(r0 + 24)); + _r3 = _mm_unpacklo_epi8(_r3, _mm_cmpgt_epi8(_mm_setzero_si128(), _r3)); + } + if (tj * 4 + 4 < w) + { + _r4 = _mm_loadl_epi64((const __m128i*)(r0 + 32)); + _r4 = _mm_unpacklo_epi8(_r4, _mm_cmpgt_epi8(_mm_setzero_si128(), _r4)); + } + if (tj * 4 + 5 < w) + { + _r5 = _mm_loadl_epi64((const __m128i*)(r0 + 40)); + _r5 = _mm_unpacklo_epi8(_r5, _mm_cmpgt_epi8(_mm_setzero_si128(), _r5)); + } + } + if (elempack == 1) + { +#if __AVX2__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N)); +#if __AVX512F__ + _r0 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)))); + if (tj * 4 + 1 < w) _r1 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)))); + if (tj * 4 + 2 < w) _r2 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)))); + if (tj * 4 + 3 < w) _r3 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)))); + if (tj * 4 + 4 < w) _r4 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char)))); + if (tj * 4 + 5 < w) _r5 = _mm_cvtepi8_epi16(_mm256_cvtepi32_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char)))); +#else + __m128i _sindex8 = _mm_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); + __m256i _sindex88 = _mm256_inserti128_si256(_mm256_castsi128_si256(_sindex8), _sindex8, 1); + __m256i _val0_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)r0, _vindex, sizeof(signed char)), _sindex88); + _r0 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val0_32, 0), _mm256_extracti128_si256(_val0_32, 1))); + if (tj * 4 + 1 < w) + { + __m256i _val1_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 1), _vindex, sizeof(signed char)), _sindex88); + _r1 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val1_32, 0), _mm256_extracti128_si256(_val1_32, 1))); + } + if (tj * 4 + 2 < w) + { + __m256i _val2_32 = 
_mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 2), _vindex, sizeof(signed char)), _sindex88); + _r2 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val2_32, 0), _mm256_extracti128_si256(_val2_32, 1))); + } + if (tj * 4 + 3 < w) + { + __m256i _val3_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 3), _vindex, sizeof(signed char)), _sindex88); + _r3 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val3_32, 0), _mm256_extracti128_si256(_val3_32, 1))); + } + if (tj * 4 + 4 < w) + { + __m256i _val4_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 4), _vindex, sizeof(signed char)), _sindex88); + _r4 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val4_32, 0), _mm256_extracti128_si256(_val4_32, 1))); + } + if (tj * 4 + 5 < w) + { + __m256i _val5_32 = _mm256_shuffle_epi8(_mm256_i32gather_epi32((const int*)(r0 + 5), _vindex, sizeof(signed char)), _sindex88); + _r5 = _mm_cvtepi8_epi16(_mm_unpacklo_epi32(_mm256_extracti128_si256(_val5_32, 0), _mm256_extracti128_si256(_val5_32, 1))); + } +#endif // __AVX512F__ +#else // __AVX2__ + const signed char* r1 = r0 + N; + const signed char* r2 = r0 + N * 2; + const signed char* r3 = r0 + N * 3; + const signed char* r4 = r0 + N * 4; + const signed char* r5 = r0 + N * 5; + const signed char* r6 = r0 + N * 6; + const signed char* r7 = r0 + N * 7; + + __m128i _t0 = _mm_loadl_epi64((const __m128i*)r0); + __m128i _t1 = _mm_loadl_epi64((const __m128i*)r1); + __m128i _t2 = _mm_loadl_epi64((const __m128i*)r2); + __m128i _t3 = _mm_loadl_epi64((const __m128i*)r3); + __m128i _t4 = _mm_loadl_epi64((const __m128i*)r4); + __m128i _t5 = _mm_loadl_epi64((const __m128i*)r5); + __m128i _t6 = _mm_loadl_epi64((const __m128i*)r6); + __m128i _t7 = _mm_loadl_epi64((const __m128i*)r7); + + __m128i _t01 = _mm_unpacklo_epi8(_t0, _t1); + __m128i _t23 = _mm_unpacklo_epi8(_t2, _t3); + __m128i _t45 = _mm_unpacklo_epi8(_t4, _t5); + __m128i _t67 = _mm_unpacklo_epi8(_t6, _t7); + _t0 = _mm_unpacklo_epi16(_t01, _t23); + _t1 = _mm_unpacklo_epi16(_t45, _t67); + _t2 = _mm_unpacklo_epi32(_t0, _t1); + _t3 = _mm_unpackhi_epi32(_t0, _t1); + + __m128i _extt2 = _mm_cmpgt_epi8(_mm_setzero_si128(), _t2); + __m128i _extt3 = _mm_cmpgt_epi8(_mm_setzero_si128(), _t3); + + _r0 = _mm_unpacklo_epi8(_t2, _extt2); + if (tj * 4 + 1 < w) _r1 = _mm_unpackhi_epi8(_t2, _extt2); + if (tj * 4 + 2 < w) _r2 = _mm_unpacklo_epi8(_t3, _extt3); + if (tj * 4 + 3 < w) _r3 = _mm_unpackhi_epi8(_t3, _extt3); + if (tj * 4 + 4 < w) _r4 = _mm_setr_epi16(r0[4], r1[4], r2[4], r3[4], r4[4], r5[4], r6[4], r7[4]); + if (tj * 4 + 5 < w) _r5 = _mm_setr_epi16(r0[5], r1[5], r2[5], r3[5], r4[5], r5[5], r6[5], r7[5]); +#endif // __AVX2__ + } + } + + __m128i _tmp12a = _mm_sub_epi16(_r3, _mm_mullo_epi16(_r1, _v4)); + __m128i _tmp12b = _mm_sub_epi16(_r4, _mm_mullo_epi16(_r2, _v4)); + __m128i _tmp34a = _mm_mullo_epi16(_mm_sub_epi16(_r3, _r1), _v2); + __m128i _tmp34b = _mm_sub_epi16(_r4, _r2); + + __m128i _tmp0 = _mm_add_epi16(_r4, _mm_sub_epi16(_mm_mullo_epi16(_r0, _v4), _mm_mullo_epi16(_r2, _v5))); + __m128i _tmp1 = _mm_add_epi16(_tmp12b, _tmp12a); + __m128i _tmp2 = _mm_sub_epi16(_tmp12b, _tmp12a); + __m128i _tmp3 = _mm_add_epi16(_tmp34b, _tmp34a); + __m128i _tmp4 = _mm_sub_epi16(_tmp34b, _tmp34a); + __m128i _tmp5 = _mm_add_epi16(_r5, _mm_sub_epi16(_mm_mullo_epi16(_r1, _v4), _mm_mullo_epi16(_r3, _v5))); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0); + 
_mm_storeu_si128((__m128i*)tmp[1][m], _tmp1); + _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2); + _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3); + _mm_storeu_si128((__m128i*)tmp[4][m], _tmp4); + _mm_storeu_si128((__m128i*)tmp[5][m], _tmp5); +#else + _mm_store_si128((__m128i*)tmp[0][m], _tmp0); + _mm_store_si128((__m128i*)tmp[1][m], _tmp1); + _mm_store_si128((__m128i*)tmp[2][m], _tmp2); + _mm_store_si128((__m128i*)tmp[3][m], _tmp3); + _mm_store_si128((__m128i*)tmp[4][m], _tmp4); + _mm_store_si128((__m128i*)tmp[5][m], _tmp5); +#endif + + r0 += w * elempack; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj * 8; + short* p1 = p0 + max_jj * 8; + short* p2 = p0 + max_jj * 8 * 2; + short* p3 = p0 + max_jj * 8 * 3; + short* p4 = p0 + max_jj * 8 * 4; + short* p5 = p0 + max_jj * 8 * 5; + + for (int m = 0; m < 6; m++) + { +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m128i _r0 = _mm_loadu_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_loadu_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_loadu_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_loadu_si128((const __m128i*)tmp[m][3]); + __m128i _r4 = _mm_loadu_si128((const __m128i*)tmp[m][4]); + __m128i _r5 = _mm_loadu_si128((const __m128i*)tmp[m][5]); +#else + __m128i _r0 = _mm_load_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_load_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_load_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_load_si128((const __m128i*)tmp[m][3]); + __m128i _r4 = _mm_load_si128((const __m128i*)tmp[m][4]); + __m128i _r5 = _mm_load_si128((const __m128i*)tmp[m][5]); +#endif + + __m128i _tmp12a = _mm_sub_epi16(_r3, _mm_mullo_epi16(_r1, _v4)); + __m128i _tmp12b = _mm_sub_epi16(_r4, _mm_mullo_epi16(_r2, _v4)); + __m128i _tmp34a = _mm_mullo_epi16(_mm_sub_epi16(_r3, _r1), _v2); + __m128i _tmp34b = _mm_sub_epi16(_r4, _r2); + + __m128i _tmp0 = _mm_add_epi16(_r4, _mm_sub_epi16(_mm_mullo_epi16(_r0, _v4), _mm_mullo_epi16(_r2, _v5))); + __m128i _tmp1 = _mm_add_epi16(_tmp12b, _tmp12a); + __m128i _tmp2 = _mm_sub_epi16(_tmp12b, _tmp12a); + __m128i _tmp3 = _mm_add_epi16(_tmp34b, _tmp34a); + __m128i _tmp4 = _mm_sub_epi16(_tmp34b, _tmp34a); + __m128i _tmp5 = _mm_add_epi16(_r5, _mm_sub_epi16(_mm_mullo_epi16(_r1, _v4), _mm_mullo_epi16(_r3, _v5))); + + _mm_store_si128((__m128i*)p0, _tmp0); + _mm_store_si128((__m128i*)p1, _tmp1); + _mm_store_si128((__m128i*)p2, _tmp2); + _mm_store_si128((__m128i*)p3, _tmp3); + _mm_store_si128((__m128i*)p4, _tmp4); + _mm_store_si128((__m128i*)p5, _tmp5); + + p0 += max_jj * 6 * 8; + p1 += max_jj * 6 * 8; + p2 += max_jj * 6 * 8; + p3 += max_jj * 6 * 8; + p4 += max_jj * 6 * 8; + p5 += max_jj * 6 * 8; + } + } + } + remain_max_kk_start += nn_max_kk * 8; + nn_max_kk = (max_kk - remain_max_kk_start) / 2; +#else // __SSE2__ + nn_max_kk = (max_kk - remain_max_kk_start) / 2; + #pragma omp parallel for num_threads(nT) +#endif // __SSE2__ + for (int ppkk = 0; ppkk < nn_max_kk; ppkk++) + { + const int kk = remain_max_kk_start + ppkk * 2; + + short tmp[6][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r00 = 0; + signed char r01 = 0; + signed char r10 = 0; + signed char r11 = 0; + signed char r20 = 0; + signed char r21 = 0; + signed char r30 = 0; + signed char r31 = 0; + signed char r40 = 0; + signed char r41 = 0; + signed char r50 = 0; + signed char r51 = 0; + + if (ti * 4 + m 
< h) + { + // if (elempack == 1) + { + const signed char* r1 = r0 + N; + + r00 = r0[0]; + r01 = r1[0]; + if (tj * 4 + 1 < w) + { + r10 = r0[1]; + r11 = r1[1]; + } + if (tj * 4 + 2 < w) + { + r20 = r0[2]; + r21 = r1[2]; + } + if (tj * 4 + 3 < w) + { + r30 = r0[3]; + r31 = r1[3]; + } + if (tj * 4 + 4 < w) + { + r40 = r0[4]; + r41 = r1[4]; + } + if (tj * 4 + 5 < w) + { + r50 = r0[5]; + r51 = r1[5]; + } + } + } + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + tmp[0][m][0] = r40 + r00 * 4 - r20 * 5; + tmp[0][m][1] = r41 + r01 * 4 - r21 * 5; + tmp[1][m][0] = tmp120b + tmp120a; + tmp[1][m][1] = tmp121b + tmp121a; + tmp[2][m][0] = tmp120b - tmp120a; + tmp[2][m][1] = tmp121b - tmp121a; + tmp[3][m][0] = tmp340b + tmp340a; + tmp[3][m][1] = tmp341b + tmp341a; + tmp[4][m][0] = tmp340b - tmp340a; + tmp[4][m][1] = tmp341b - tmp341a; + tmp[5][m][0] = r50 + r10 * 4 - r30 * 5; + tmp[5][m][1] = r51 + r11 * 4 - r31 * 5; + + r0 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj * 2; + short* p1 = p0 + max_jj * 2; + short* p2 = p0 + max_jj * 2 * 2; + short* p3 = p0 + max_jj * 2 * 3; + short* p4 = p0 + max_jj * 2 * 4; + short* p5 = p0 + max_jj * 2 * 5; + + for (int m = 0; m < 6; m++) + { + short r00 = tmp[m][0][0]; + short r01 = tmp[m][0][1]; + short r10 = tmp[m][1][0]; + short r11 = tmp[m][1][1]; + short r20 = tmp[m][2][0]; + short r21 = tmp[m][2][1]; + short r30 = tmp[m][3][0]; + short r31 = tmp[m][3][1]; + short r40 = tmp[m][4][0]; + short r41 = tmp[m][4][1]; + short r50 = tmp[m][5][0]; + short r51 = tmp[m][5][1]; + + short tmp120a = r30 - r10 * 4; + short tmp121a = r31 - r11 * 4; + short tmp120b = r40 - r20 * 4; + short tmp121b = r41 - r21 * 4; + short tmp340a = (r30 - r10) * 2; + short tmp341a = (r31 - r11) * 2; + short tmp340b = r40 - r20; + short tmp341b = r41 - r21; + + p0[0] = r40 + r00 * 4 - r20 * 5; + p0[1] = r41 + r01 * 4 - r21 * 5; + p1[0] = tmp120b + tmp120a; + p1[1] = tmp121b + tmp121a; + p2[0] = tmp120b - tmp120a; + p2[1] = tmp121b - tmp121a; + p3[0] = tmp340b + tmp340a; + p3[1] = tmp341b + tmp341a; + p4[0] = tmp340b - tmp340a; + p4[1] = tmp341b - tmp341a; + p5[0] = r50 + r10 * 4 - r30 * 5; + p5[1] = r51 + r11 * 4 - r31 * 5; + + p0 += max_jj * 6 * 2; + p1 += max_jj * 6 * 2; + p2 += max_jj * 6 * 2; + p3 += max_jj * 6 * 2; + p4 += max_jj * 6 * 2; + p5 += max_jj * 6 * 2; + } + } + } + remain_max_kk_start += nn_max_kk * 2; + for (int kk = remain_max_kk_start; kk < max_kk; kk++) + { + short tmp[6][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const signed char* r0123 = bottom_blob.channel(k + kk).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 6; m++) + { + signed char r0 = 0; + signed char r1 = 0; + signed char r2 = 0; + signed char r3 = 0; + signed char r4 = 0; + signed char r5 = 0; + + if (ti * 4 + m < h) + { + // if (elempack == 1) + { + r0 = r0123[0]; + if (tj * 4 + 1 < w) r1 = r0123[1]; + if (tj * 4 + 2 < w) r2 = r0123[2]; + if (tj * 4 + 3 < w) r3 = r0123[3]; + if (tj * 4 + 4 < w) r4 = r0123[4]; + if (tj * 4 + 5 < w) r5 = r0123[5]; + } + } + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + tmp[0][m] = r4 + r0 * 4 - r2 * 5; + tmp[1][m] = tmp12b + tmp12a; + tmp[2][m] = tmp12b - tmp12a; + tmp[3][m] = tmp34b + tmp34a; + tmp[4][m] = 
tmp34b - tmp34a; + tmp[5][m] = r5 + r1 * 4 - r3 * 5; + + r0123 += w; + } + + short* p0 = (short*)B + kk * max_jj * 36 + jj; + short* p1 = p0 + max_jj; + short* p2 = p0 + max_jj * 2; + short* p3 = p0 + max_jj * 3; + short* p4 = p0 + max_jj * 4; + short* p5 = p0 + max_jj * 5; + + for (int m = 0; m < 6; m++) + { + short r0 = tmp[m][0]; + short r1 = tmp[m][1]; + short r2 = tmp[m][2]; + short r3 = tmp[m][3]; + short r4 = tmp[m][4]; + short r5 = tmp[m][5]; + + short tmp12a = r3 - r1 * 4; + short tmp12b = r4 - r2 * 4; + short tmp34a = (r3 - r1) * 2; + short tmp34b = r4 - r2; + + p0[0] = r4 + r0 * 4 - r2 * 5; + p1[0] = tmp12b + tmp12a; + p2[0] = tmp12b - tmp12a; + p3[0] = tmp34b + tmp34a; + p4[0] = tmp34b - tmp34a; + p5[0] = r5 + r1 * 4 - r3 * 5; + + p0 += max_jj * 6; + p1 += max_jj * 6; + p2 += max_jj * 6; + p3 += max_jj * 6; + p4 += max_jj * 6; + p5 += max_jj * 6; + } + } + } +} + +static inline void conv3x3s1_winograd43_transform_output_tile_int8(const Mat& top_tile, Mat& top_blob, int i, int max_ii, int j, int max_jj) +{ + // const int otm[4][6] = { + // {1, 1, 1, 1, 1, 0}, + // {0, 1, -1, 2, -2, 0}, + // {0, 1, 1, 4, 4, 0}, + // {0, 1, -1, 8, -8, 1} + // }; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int N = top_blob.cstep * out_elempack; + + const int w_tiles = (outw + 3) / 4; + + int ii = 0; +#if __SSE2__ +#if __AVX2__ +#if __AVX512F__ + for (; ii + 15 < max_ii; ii += 16) + { +#ifdef _MSC_VER + __declspec(align(64)) +#else + __attribute__((aligned(64))) +#endif + int tmp[4][6][16]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 16; + const int* r1 = r0 + max_jj * 16; + const int* r2 = r0 + max_jj * 16 * 2; + const int* r3 = r0 + max_jj * 16 * 3; + const int* r4 = r0 + max_jj * 16 * 4; + const int* r5 = r0 + max_jj * 16 * 5; + + for (int m = 0; m < 5; m++) + { + __m512i _r0 = _mm512_load_si512((const __m512i*)r0); + __m512i _r1 = _mm512_load_si512((const __m512i*)r1); + __m512i _r2 = _mm512_load_si512((const __m512i*)r2); + __m512i _r3 = _mm512_load_si512((const __m512i*)r3); + __m512i _r4 = _mm512_load_si512((const __m512i*)r4); + __m512i _r5 = _mm512_load_si512((const __m512i*)r5); + + __m512i _tmp02a = _mm512_add_epi32(_r1, _r2); + __m512i _tmp02b = _mm512_add_epi32(_r3, _r4); + __m512i _tmp13a = _mm512_sub_epi32(_r1, _r2); + __m512i _tmp13b = _mm512_sub_epi32(_r3, _r4); + + __m512i _tmp0 = _mm512_add_epi32(_mm512_add_epi32(_tmp02a, _tmp02b), _r0); + __m512i _tmp1 = _mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 1)); + __m512i _tmp2 = _mm512_add_epi32(_tmp02a, _mm512_slli_epi32(_tmp02b, 2)); + __m512i _tmp3 = _mm512_add_epi32(_mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 3)), _mm512_slli_epi32(_r5, 2)); + + _mm512_store_si512((__m512i*)tmp[0][m], _tmp0); + _mm512_store_si512((__m512i*)tmp[1][m], _tmp1); + _mm512_store_si512((__m512i*)tmp[2][m], _tmp2); + _mm512_store_si512((__m512i*)tmp[3][m], _tmp3); + + r0 += max_jj * 6 * 16; + r1 += max_jj * 6 * 16; + r2 += max_jj * 6 * 16; + r3 += max_jj * 6 * 16; + r4 += max_jj * 6 * 16; + r5 += max_jj * 6 * 16; + } + for (int m = 5; m < 6; m++) + { + __m512i _r0 = _mm512_load_si512((const __m512i*)r0); + __m512i _r1 = _mm512_load_si512((const __m512i*)r1); + __m512i _r2 = _mm512_load_si512((const __m512i*)r2); + __m512i _r3 = _mm512_load_si512((const __m512i*)r3); + __m512i _r4 = _mm512_load_si512((const __m512i*)r4); + __m512i _r5 = 
_mm512_load_si512((const __m512i*)r5); + + __m512i _tmp02a = _mm512_add_epi32(_r1, _r2); + __m512i _tmp02b = _mm512_add_epi32(_r3, _r4); + __m512i _tmp13a = _mm512_sub_epi32(_r1, _r2); + __m512i _tmp13b = _mm512_sub_epi32(_r3, _r4); + + __m512i _tmp0 = _mm512_add_epi32(_mm512_add_epi32(_tmp02a, _tmp02b), _r0); + __m512i _tmp1 = _mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 1)); + __m512i _tmp2 = _mm512_add_epi32(_tmp02a, _mm512_slli_epi32(_tmp02b, 2)); + __m512i _tmp3 = _mm512_add_epi32(_mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 3)), _mm512_slli_epi32(_r5, 2)); + + _tmp0 = _mm512_slli_epi32(_tmp0, 2); + _tmp1 = _mm512_slli_epi32(_tmp1, 2); + _tmp2 = _mm512_slli_epi32(_tmp2, 2); + _tmp3 = _mm512_slli_epi32(_tmp3, 2); + + _mm512_store_si512((__m512i*)tmp[0][m], _tmp0); + _mm512_store_si512((__m512i*)tmp[1][m], _tmp1); + _mm512_store_si512((__m512i*)tmp[2][m], _tmp2); + _mm512_store_si512((__m512i*)tmp[3][m], _tmp3); + + r0 += max_jj * 6 * 16; + r1 += max_jj * 6 * 16; + r2 += max_jj * 6 * 16; + r3 += max_jj * 6 * 16; + r4 += max_jj * 6 * 16; + r5 += max_jj * 6 * 16; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 4) + (tj * 4) * out_elempack; + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + __m512i _r0 = _mm512_load_si512((const __m512i*)tmp[m][0]); + __m512i _r1 = _mm512_load_si512((const __m512i*)tmp[m][1]); + __m512i _r2 = _mm512_load_si512((const __m512i*)tmp[m][2]); + __m512i _r3 = _mm512_load_si512((const __m512i*)tmp[m][3]); + __m512i _r4 = _mm512_load_si512((const __m512i*)tmp[m][4]); + __m512i _r5 = _mm512_load_si512((const __m512i*)tmp[m][5]); + + __m512i _tmp02a = _mm512_add_epi32(_r1, _r2); + __m512i _tmp02b = _mm512_add_epi32(_r3, _r4); + __m512i _tmp13a = _mm512_sub_epi32(_r1, _r2); + __m512i _tmp13b = _mm512_sub_epi32(_r3, _r4); + + __m512i _tmp0 = _mm512_add_epi32(_mm512_add_epi32(_tmp02a, _tmp02b), _r0); + __m512i _tmp1 = _mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 1)); + __m512i _tmp2 = _mm512_add_epi32(_tmp02a, _mm512_slli_epi32(_tmp02b, 2)); + __m512i _tmp3 = _mm512_add_epi32(_mm512_add_epi32(_tmp13a, _mm512_slli_epi32(_tmp13b, 3)), _r5); + + // TODO use integer trick for division by 576 + __m512 _v576 = _mm512_set1_ps(1.0 / 576); + _tmp0 = _mm512_cvttps_epi32(_mm512_mul_ps(_mm512_cvtepi32_ps(_tmp0), _v576)); + _tmp1 = _mm512_cvttps_epi32(_mm512_mul_ps(_mm512_cvtepi32_ps(_tmp1), _v576)); + _tmp2 = _mm512_cvttps_epi32(_mm512_mul_ps(_mm512_cvtepi32_ps(_tmp2), _v576)); + _tmp3 = _mm512_cvttps_epi32(_mm512_mul_ps(_mm512_cvtepi32_ps(_tmp3), _v576)); + + if (out_elempack == 16) + { + _mm512_store_si512((__m512i*)outptr0, _tmp0); + if (tj * 4 + 1 < outw) _mm512_store_si512((__m512i*)(outptr0 + 16), _tmp1); + if (tj * 4 + 2 < outw) _mm512_store_si512((__m512i*)(outptr0 + 32), _tmp2); + if (tj * 4 + 3 < outw) _mm512_store_si512((__m512i*)(outptr0 + 48), _tmp3); + } + if (out_elempack == 8) + { + int* outptr1 = outptr0 + N; + + _mm256_store_si256((__m256i*)outptr0, _mm512_extracti32x8_epi32(_tmp0, 0)); + _mm256_store_si256((__m256i*)outptr1, _mm512_extracti32x8_epi32(_tmp0, 1)); + if (tj * 4 + 1 < outw) + { + _mm256_store_si256((__m256i*)(outptr0 + 8), _mm512_extracti32x8_epi32(_tmp1, 0)); + _mm256_store_si256((__m256i*)(outptr1 + 8), _mm512_extracti32x8_epi32(_tmp1, 1)); + } + if (tj * 4 + 2 < outw) + { + _mm256_store_si256((__m256i*)(outptr0 + 16), _mm512_extracti32x8_epi32(_tmp2, 0)); + _mm256_store_si256((__m256i*)(outptr1 + 16), _mm512_extracti32x8_epi32(_tmp2, 1)); + } + if (tj * 4 + 3 < outw) + { 
+ _mm256_store_si256((__m256i*)(outptr0 + 24), _mm512_extracti32x8_epi32(_tmp3, 0)); + _mm256_store_si256((__m256i*)(outptr1 + 24), _mm512_extracti32x8_epi32(_tmp3, 1)); + } + } + if (out_elempack == 4) + { + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + + _mm_store_si128((__m128i*)outptr0, _mm512_extracti32x4_epi32(_tmp0, 0)); + _mm_store_si128((__m128i*)outptr1, _mm512_extracti32x4_epi32(_tmp0, 1)); + _mm_store_si128((__m128i*)outptr2, _mm512_extracti32x4_epi32(_tmp0, 2)); + _mm_store_si128((__m128i*)outptr3, _mm512_extracti32x4_epi32(_tmp0, 3)); + if (tj * 4 + 1 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 4), _mm512_extracti32x4_epi32(_tmp1, 0)); + _mm_store_si128((__m128i*)(outptr1 + 4), _mm512_extracti32x4_epi32(_tmp1, 1)); + _mm_store_si128((__m128i*)(outptr2 + 4), _mm512_extracti32x4_epi32(_tmp1, 2)); + _mm_store_si128((__m128i*)(outptr3 + 4), _mm512_extracti32x4_epi32(_tmp1, 3)); + } + if (tj * 4 + 2 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 8), _mm512_extracti32x4_epi32(_tmp2, 0)); + _mm_store_si128((__m128i*)(outptr1 + 8), _mm512_extracti32x4_epi32(_tmp2, 1)); + _mm_store_si128((__m128i*)(outptr2 + 8), _mm512_extracti32x4_epi32(_tmp2, 2)); + _mm_store_si128((__m128i*)(outptr3 + 8), _mm512_extracti32x4_epi32(_tmp2, 3)); + } + if (tj * 4 + 3 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 12), _mm512_extracti32x4_epi32(_tmp3, 0)); + _mm_store_si128((__m128i*)(outptr1 + 12), _mm512_extracti32x4_epi32(_tmp3, 1)); + _mm_store_si128((__m128i*)(outptr2 + 12), _mm512_extracti32x4_epi32(_tmp3, 2)); + _mm_store_si128((__m128i*)(outptr3 + 12), _mm512_extracti32x4_epi32(_tmp3, 3)); + } + } + if (out_elempack == 1) + { + __m512i _vindex = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + _vindex = _mm512_mullo_epi32(_vindex, _mm512_set1_epi32(N)); + _mm512_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 4 + 1 < outw) _mm512_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); + if (tj * 4 + 2 < outw) _mm512_i32scatter_epi32(outptr0 + 2, _vindex, _tmp2, sizeof(int)); + if (tj * 4 + 3 < outw) _mm512_i32scatter_epi32(outptr0 + 3, _vindex, _tmp3, sizeof(int)); + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __AVX512F__ + for (; ii + 7 < max_ii; ii += 8) + { +#ifdef _MSC_VER + __declspec(align(32)) +#else + __attribute__((aligned(32))) +#endif + int tmp[4][6][8]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 8; + const int* r1 = r0 + max_jj * 8; + const int* r2 = r0 + max_jj * 8 * 2; + const int* r3 = r0 + max_jj * 8 * 3; + const int* r4 = r0 + max_jj * 8 * 4; + const int* r5 = r0 + max_jj * 8 * 5; + + for (int m = 0; m < 5; m++) + { + __m256i _r0 = _mm256_load_si256((const __m256i*)r0); + __m256i _r1 = _mm256_load_si256((const __m256i*)r1); + __m256i _r2 = _mm256_load_si256((const __m256i*)r2); + __m256i _r3 = _mm256_load_si256((const __m256i*)r3); + __m256i _r4 = _mm256_load_si256((const __m256i*)r4); + __m256i _r5 = _mm256_load_si256((const __m256i*)r5); + + __m256i _tmp02a = _mm256_add_epi32(_r1, _r2); + __m256i _tmp02b = _mm256_add_epi32(_r3, _r4); + __m256i _tmp13a = _mm256_sub_epi32(_r1, _r2); + __m256i _tmp13b = _mm256_sub_epi32(_r3, _r4); + + __m256i _tmp0 = _mm256_add_epi32(_mm256_add_epi32(_tmp02a, _tmp02b), _r0); + __m256i _tmp1 = _mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 1)); + __m256i _tmp2 = 
_mm256_add_epi32(_tmp02a, _mm256_slli_epi32(_tmp02b, 2)); + __m256i _tmp3 = _mm256_add_epi32(_mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 3)), _mm256_slli_epi32(_r5, 2)); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm256_storeu_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_storeu_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_storeu_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_storeu_si256((__m256i*)tmp[3][m], _tmp3); +#else + _mm256_store_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_store_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_store_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_store_si256((__m256i*)tmp[3][m], _tmp3); +#endif + + r0 += max_jj * 6 * 8; + r1 += max_jj * 6 * 8; + r2 += max_jj * 6 * 8; + r3 += max_jj * 6 * 8; + r4 += max_jj * 6 * 8; + r5 += max_jj * 6 * 8; + } + for (int m = 5; m < 6; m++) + { + __m256i _r0 = _mm256_load_si256((const __m256i*)r0); + __m256i _r1 = _mm256_load_si256((const __m256i*)r1); + __m256i _r2 = _mm256_load_si256((const __m256i*)r2); + __m256i _r3 = _mm256_load_si256((const __m256i*)r3); + __m256i _r4 = _mm256_load_si256((const __m256i*)r4); + __m256i _r5 = _mm256_load_si256((const __m256i*)r5); + + __m256i _tmp02a = _mm256_add_epi32(_r1, _r2); + __m256i _tmp02b = _mm256_add_epi32(_r3, _r4); + __m256i _tmp13a = _mm256_sub_epi32(_r1, _r2); + __m256i _tmp13b = _mm256_sub_epi32(_r3, _r4); + + __m256i _tmp0 = _mm256_add_epi32(_mm256_add_epi32(_tmp02a, _tmp02b), _r0); + __m256i _tmp1 = _mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 1)); + __m256i _tmp2 = _mm256_add_epi32(_tmp02a, _mm256_slli_epi32(_tmp02b, 2)); + __m256i _tmp3 = _mm256_add_epi32(_mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 3)), _mm256_slli_epi32(_r5, 2)); + + _tmp0 = _mm256_slli_epi32(_tmp0, 2); + _tmp1 = _mm256_slli_epi32(_tmp1, 2); + _tmp2 = _mm256_slli_epi32(_tmp2, 2); + _tmp3 = _mm256_slli_epi32(_tmp3, 2); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm256_storeu_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_storeu_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_storeu_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_storeu_si256((__m256i*)tmp[3][m], _tmp3); +#else + _mm256_store_si256((__m256i*)tmp[0][m], _tmp0); + _mm256_store_si256((__m256i*)tmp[1][m], _tmp1); + _mm256_store_si256((__m256i*)tmp[2][m], _tmp2); + _mm256_store_si256((__m256i*)tmp[3][m], _tmp3); +#endif + + r0 += max_jj * 6 * 8; + r1 += max_jj * 6 * 8; + r2 += max_jj * 6 * 8; + r3 += max_jj * 6 * 8; + r4 += max_jj * 6 * 8; + r5 += max_jj * 6 * 8; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 4) + (tj * 4) * out_elempack; + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m256i _r0 = _mm256_loadu_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_loadu_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_loadu_si256((const __m256i*)tmp[m][3]); + __m256i _r4 = _mm256_loadu_si256((const __m256i*)tmp[m][4]); + __m256i _r5 = _mm256_loadu_si256((const __m256i*)tmp[m][5]); +#else + __m256i _r0 = _mm256_load_si256((const __m256i*)tmp[m][0]); + __m256i _r1 = _mm256_load_si256((const __m256i*)tmp[m][1]); + __m256i _r2 = _mm256_load_si256((const __m256i*)tmp[m][2]); + __m256i _r3 = _mm256_load_si256((const __m256i*)tmp[m][3]); + __m256i _r4 = _mm256_load_si256((const __m256i*)tmp[m][4]); + __m256i _r5 = _mm256_load_si256((const __m256i*)tmp[m][5]); +#endif + + __m256i _tmp02a = 
_mm256_add_epi32(_r1, _r2); + __m256i _tmp02b = _mm256_add_epi32(_r3, _r4); + __m256i _tmp13a = _mm256_sub_epi32(_r1, _r2); + __m256i _tmp13b = _mm256_sub_epi32(_r3, _r4); + + __m256i _tmp0 = _mm256_add_epi32(_mm256_add_epi32(_tmp02a, _tmp02b), _r0); + __m256i _tmp1 = _mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 1)); + __m256i _tmp2 = _mm256_add_epi32(_tmp02a, _mm256_slli_epi32(_tmp02b, 2)); + __m256i _tmp3 = _mm256_add_epi32(_mm256_add_epi32(_tmp13a, _mm256_slli_epi32(_tmp13b, 3)), _r5); + + // TODO use integer trick for division by 576 + __m256 _v576 = _mm256_set1_ps(1.0 / 576); + _tmp0 = _mm256_cvttps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_tmp0), _v576)); + _tmp1 = _mm256_cvttps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_tmp1), _v576)); + _tmp2 = _mm256_cvttps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_tmp2), _v576)); + _tmp3 = _mm256_cvttps_epi32(_mm256_mul_ps(_mm256_cvtepi32_ps(_tmp3), _v576)); + + if (out_elempack == 8) + { + _mm256_store_si256((__m256i*)outptr0, _tmp0); + if (tj * 4 + 1 < outw) _mm256_store_si256((__m256i*)(outptr0 + 8), _tmp1); + if (tj * 4 + 2 < outw) _mm256_store_si256((__m256i*)(outptr0 + 16), _tmp2); + if (tj * 4 + 3 < outw) _mm256_store_si256((__m256i*)(outptr0 + 24), _tmp3); + } + if (out_elempack == 4) + { + int* outptr1 = outptr0 + N; + + _mm_store_si128((__m128i*)(outptr0), _mm256_extracti128_si256(_tmp0, 0)); + _mm_store_si128((__m128i*)(outptr1), _mm256_extracti128_si256(_tmp0, 1)); + if (tj * 4 + 1 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 4), _mm256_extracti128_si256(_tmp1, 0)); + _mm_store_si128((__m128i*)(outptr1 + 4), _mm256_extracti128_si256(_tmp1, 1)); + } + if (tj * 4 + 2 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 8), _mm256_extracti128_si256(_tmp2, 0)); + _mm_store_si128((__m128i*)(outptr1 + 8), _mm256_extracti128_si256(_tmp2, 1)); + } + if (tj * 4 + 3 < outw) + { + _mm_store_si128((__m128i*)(outptr0 + 12), _mm256_extracti128_si256(_tmp3, 0)); + _mm_store_si128((__m128i*)(outptr1 + 12), _mm256_extracti128_si256(_tmp3, 1)); + } + } + if (out_elempack == 1) + { +#if __AVX512F__ + __m256i _vindex = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + _vindex = _mm256_mullo_epi32(_vindex, _mm256_set1_epi32(N)); + _mm256_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 4 + 1 < outw) _mm256_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); + if (tj * 4 + 2 < outw) _mm256_i32scatter_epi32(outptr0 + 2, _vindex, _tmp2, sizeof(int)); + if (tj * 4 + 3 < outw) _mm256_i32scatter_epi32(outptr0 + 3, _vindex, _tmp3, sizeof(int)); +#else + int tmp0[8]; + int tmp1[8]; + int tmp2[8]; + int tmp3[8]; + _mm256_storeu_si256((__m256i*)tmp0, _tmp0); + _mm256_storeu_si256((__m256i*)tmp1, _tmp1); + _mm256_storeu_si256((__m256i*)tmp2, _tmp2); + _mm256_storeu_si256((__m256i*)tmp3, _tmp3); + + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + int* outptr4 = outptr0 + N * 4; + int* outptr5 = outptr0 + N * 5; + int* outptr6 = outptr0 + N * 6; + int* outptr7 = outptr0 + N * 7; + + outptr0[0] = tmp0[0]; + outptr1[0] = tmp0[1]; + outptr2[0] = tmp0[2]; + outptr3[0] = tmp0[3]; + outptr4[0] = tmp0[4]; + outptr5[0] = tmp0[5]; + outptr6[0] = tmp0[6]; + outptr7[0] = tmp0[7]; + if (tj * 4 + 1 < outw) + { + outptr0[1] = tmp1[0]; + outptr1[1] = tmp1[1]; + outptr2[1] = tmp1[2]; + outptr3[1] = tmp1[3]; + outptr4[1] = tmp1[4]; + outptr5[1] = tmp1[5]; + outptr6[1] = tmp1[6]; + outptr7[1] = tmp1[7]; + } + if (tj * 4 + 2 < outw) + { + outptr0[2] = tmp2[0]; + outptr1[2] = tmp2[1]; + outptr2[2] = tmp2[2]; + 
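A note on the recurring division by 576 and the `TODO use integer trick for division by 576` comments, as far as can be read from the constants: the kernel transform `ktm` earlier in the file is the standard F(4,3) G matrix scaled to integers (a factor of 24 per dimension, with the last row kept at 6 and made up for by the `* 4` terms in this output transform), while B^T (the `itm` comment in the input transform) and A^T (the `otm` comment above) are already integral, so the accumulated GEMM result is 24 * 24 = 576 times the true convolution. In the usual Winograd notation, with d the input tile and g the 3x3 kernel:

$$
Y \;=\; \frac{1}{576}\, A^\mathsf{T}\Big[(G\,g\,G^\mathsf{T}) \odot (B^\mathsf{T} d\, B)\Big] A,
\qquad 576 = 24 \times 24
$$

The SIMD paths realize the 1/576 as a truncating float multiply (hence the TODO), while the scalar tails use plain integer division; the `>> 2` in the F(2,3) output transform earlier in the file appears to follow the same bookkeeping with a factor of 2 per dimension.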
outptr3[2] = tmp2[3]; + outptr4[2] = tmp2[4]; + outptr5[2] = tmp2[5]; + outptr6[2] = tmp2[6]; + outptr7[2] = tmp2[7]; + } + if (tj * 4 + 3 < outw) + { + outptr0[3] = tmp3[0]; + outptr1[3] = tmp3[1]; + outptr2[3] = tmp3[2]; + outptr3[3] = tmp3[3]; + outptr4[3] = tmp3[4]; + outptr5[3] = tmp3[5]; + outptr6[3] = tmp3[6]; + outptr7[3] = tmp3[7]; + } +#endif // __AVX512F__ + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __AVX2__ + for (; ii + 3 < max_ii; ii += 4) + { +#ifdef _MSC_VER + __declspec(align(16)) +#else + __attribute__((aligned(16))) +#endif + int tmp[4][6][4]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 4; + const int* r1 = r0 + max_jj * 4; + const int* r2 = r0 + max_jj * 4 * 2; + const int* r3 = r0 + max_jj * 4 * 3; + const int* r4 = r0 + max_jj * 4 * 4; + const int* r5 = r0 + max_jj * 4 * 5; + + for (int m = 0; m < 5; m++) + { + __m128i _r0 = _mm_load_si128((const __m128i*)r0); + __m128i _r1 = _mm_load_si128((const __m128i*)r1); + __m128i _r2 = _mm_load_si128((const __m128i*)r2); + __m128i _r3 = _mm_load_si128((const __m128i*)r3); + __m128i _r4 = _mm_load_si128((const __m128i*)r4); + __m128i _r5 = _mm_load_si128((const __m128i*)r5); + + __m128i _tmp02a = _mm_add_epi32(_r1, _r2); + __m128i _tmp02b = _mm_add_epi32(_r3, _r4); + __m128i _tmp13a = _mm_sub_epi32(_r1, _r2); + __m128i _tmp13b = _mm_sub_epi32(_r3, _r4); + + __m128i _tmp0 = _mm_add_epi32(_mm_add_epi32(_tmp02a, _tmp02b), _r0); + __m128i _tmp1 = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); + __m128i _tmp2 = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); + __m128i _tmp3 = _mm_add_epi32(_mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 3)), _mm_slli_epi32(_r5, 2)); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0); + _mm_storeu_si128((__m128i*)tmp[1][m], _tmp1); + _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2); + _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3); +#else + _mm_store_si128((__m128i*)tmp[0][m], _tmp0); + _mm_store_si128((__m128i*)tmp[1][m], _tmp1); + _mm_store_si128((__m128i*)tmp[2][m], _tmp2); + _mm_store_si128((__m128i*)tmp[3][m], _tmp3); +#endif + + r0 += max_jj * 6 * 4; + r1 += max_jj * 6 * 4; + r2 += max_jj * 6 * 4; + r3 += max_jj * 6 * 4; + r4 += max_jj * 6 * 4; + r5 += max_jj * 6 * 4; + } + for (int m = 5; m < 6; m++) + { + __m128i _r0 = _mm_load_si128((const __m128i*)r0); + __m128i _r1 = _mm_load_si128((const __m128i*)r1); + __m128i _r2 = _mm_load_si128((const __m128i*)r2); + __m128i _r3 = _mm_load_si128((const __m128i*)r3); + __m128i _r4 = _mm_load_si128((const __m128i*)r4); + __m128i _r5 = _mm_load_si128((const __m128i*)r5); + + __m128i _tmp02a = _mm_add_epi32(_r1, _r2); + __m128i _tmp02b = _mm_add_epi32(_r3, _r4); + __m128i _tmp13a = _mm_sub_epi32(_r1, _r2); + __m128i _tmp13b = _mm_sub_epi32(_r3, _r4); + + __m128i _tmp0 = _mm_add_epi32(_mm_add_epi32(_tmp02a, _tmp02b), _r0); + __m128i _tmp1 = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); + __m128i _tmp2 = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); + __m128i _tmp3 = _mm_add_epi32(_mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 3)), _mm_slli_epi32(_r5, 2)); + + _tmp0 = _mm_slli_epi32(_tmp0, 2); + _tmp1 = _mm_slli_epi32(_tmp1, 2); + _tmp2 = _mm_slli_epi32(_tmp2, 2); + _tmp3 = _mm_slli_epi32(_tmp3, 2); + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + _mm_storeu_si128((__m128i*)tmp[0][m], _tmp0); + 
_mm_storeu_si128((__m128i*)tmp[1][m], _tmp1); + _mm_storeu_si128((__m128i*)tmp[2][m], _tmp2); + _mm_storeu_si128((__m128i*)tmp[3][m], _tmp3); +#else + _mm_store_si128((__m128i*)tmp[0][m], _tmp0); + _mm_store_si128((__m128i*)tmp[1][m], _tmp1); + _mm_store_si128((__m128i*)tmp[2][m], _tmp2); + _mm_store_si128((__m128i*)tmp[3][m], _tmp3); +#endif + + r0 += max_jj * 6 * 4; + r1 += max_jj * 6 * 4; + r2 += max_jj * 6 * 4; + r3 += max_jj * 6 * 4; + r4 += max_jj * 6 * 4; + r5 += max_jj * 6 * 4; + } + + int* outptr0 = top_blob.channel((i + ii) / out_elempack).row(ti * 4) + (tj * 4) * out_elempack; + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + +#if defined(__GNUC__) && (__GNUC__ <= 4) && (__GNUC_MINOR__ < 6) + __m128i _r0 = _mm_loadu_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_loadu_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_loadu_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_loadu_si128((const __m128i*)tmp[m][3]); + __m128i _r4 = _mm_loadu_si128((const __m128i*)tmp[m][4]); + __m128i _r5 = _mm_loadu_si128((const __m128i*)tmp[m][5]); +#else + __m128i _r0 = _mm_load_si128((const __m128i*)tmp[m][0]); + __m128i _r1 = _mm_load_si128((const __m128i*)tmp[m][1]); + __m128i _r2 = _mm_load_si128((const __m128i*)tmp[m][2]); + __m128i _r3 = _mm_load_si128((const __m128i*)tmp[m][3]); + __m128i _r4 = _mm_load_si128((const __m128i*)tmp[m][4]); + __m128i _r5 = _mm_load_si128((const __m128i*)tmp[m][5]); +#endif + + __m128i _tmp02a = _mm_add_epi32(_r1, _r2); + __m128i _tmp02b = _mm_add_epi32(_r3, _r4); + __m128i _tmp13a = _mm_sub_epi32(_r1, _r2); + __m128i _tmp13b = _mm_sub_epi32(_r3, _r4); + + __m128i _tmp0 = _mm_add_epi32(_mm_add_epi32(_tmp02a, _tmp02b), _r0); + __m128i _tmp1 = _mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 1)); + __m128i _tmp2 = _mm_add_epi32(_tmp02a, _mm_slli_epi32(_tmp02b, 2)); + __m128i _tmp3 = _mm_add_epi32(_mm_add_epi32(_tmp13a, _mm_slli_epi32(_tmp13b, 3)), _r5); + + // TODO use integer trick for division by 576 + __m128 _v576 = _mm_set1_ps(1.0 / 576); + _tmp0 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_tmp0), _v576)); + _tmp1 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_tmp1), _v576)); + _tmp2 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_tmp2), _v576)); + _tmp3 = _mm_cvttps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(_tmp3), _v576)); + + if (out_elempack == 4) + { + _mm_store_si128((__m128i*)outptr0, _tmp0); + if (tj * 4 + 1 < outw) _mm_store_si128((__m128i*)(outptr0 + 4), _tmp1); + if (tj * 4 + 2 < outw) _mm_store_si128((__m128i*)(outptr0 + 8), _tmp2); + if (tj * 4 + 3 < outw) _mm_store_si128((__m128i*)(outptr0 + 12), _tmp3); + } + if (out_elempack == 1) + { +#if __AVX512F__ + __m128i _vindex = _mm_setr_epi32(0, 1, 2, 3); + _vindex = _mm_mullo_epi32(_vindex, _mm_set1_epi32(N)); + _mm_i32scatter_epi32(outptr0, _vindex, _tmp0, sizeof(int)); + if (tj * 4 + 1 < outw) _mm_i32scatter_epi32(outptr0 + 1, _vindex, _tmp1, sizeof(int)); + if (tj * 4 + 2 < outw) _mm_i32scatter_epi32(outptr0 + 2, _vindex, _tmp2, sizeof(int)); + if (tj * 4 + 3 < outw) _mm_i32scatter_epi32(outptr0 + 3, _vindex, _tmp3, sizeof(int)); +#else + int tmp0[4]; + int tmp1[4]; + int tmp2[4]; + int tmp3[4]; + _mm_storeu_si128((__m128i*)tmp0, _tmp0); + _mm_storeu_si128((__m128i*)tmp1, _tmp1); + _mm_storeu_si128((__m128i*)tmp2, _tmp2); + _mm_storeu_si128((__m128i*)tmp3, _tmp3); + + int* outptr1 = outptr0 + N; + int* outptr2 = outptr0 + N * 2; + int* outptr3 = outptr0 + N * 3; + + outptr0[0] = tmp0[0]; + outptr1[0] = tmp0[1]; + outptr2[0] = tmp0[2]; + outptr3[0] = tmp0[3]; + if (tj 
* 4 + 1 < outw) + { + outptr0[1] = tmp1[0]; + outptr1[1] = tmp1[1]; + outptr2[1] = tmp1[2]; + outptr3[1] = tmp1[3]; + } + if (tj * 4 + 2 < outw) + { + outptr0[2] = tmp2[0]; + outptr1[2] = tmp2[1]; + outptr2[2] = tmp2[2]; + outptr3[2] = tmp2[3]; + } + if (tj * 4 + 3 < outw) + { + outptr0[3] = tmp3[0]; + outptr1[3] = tmp3[1]; + outptr2[3] = tmp3[2]; + outptr3[3] = tmp3[3]; + } +#endif // __AVX512F__ + } + + outptr0 += outw * out_elempack; + } + } + } +#endif // __SSE2__ + for (; ii + 1 < max_ii; ii += 2) + { + int tmp[4][6][2]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj * 2; + const int* r1 = r0 + max_jj * 2; + const int* r2 = r0 + max_jj * 2 * 2; + const int* r3 = r0 + max_jj * 2 * 3; + const int* r4 = r0 + max_jj * 2 * 4; + const int* r5 = r0 + max_jj * 2 * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + for (int m = 5; m < 6; m++) + { + int tmp02a0 = r1[0] + r2[0]; + int tmp02a1 = r1[1] + r2[1]; + int tmp02b0 = r3[0] + r4[0]; + int tmp02b1 = r3[1] + r4[1]; + int tmp13a0 = r1[0] - r2[0]; + int tmp13a1 = r1[1] - r2[1]; + int tmp13b0 = r3[0] - r4[0]; + int tmp13b1 = r3[1] - r4[1]; + + int tmp00 = tmp02a0 + tmp02b0 + r0[0]; + int tmp01 = tmp02a1 + tmp02b1 + r0[1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + r5[0] * 4; + int tmp31 = tmp13a1 + tmp13b1 * 8 + r5[1] * 4; + + tmp00 = tmp00 * 4; + tmp01 = tmp01 * 4; + tmp10 = tmp10 * 4; + tmp11 = tmp11 * 4; + tmp20 = tmp20 * 4; + tmp21 = tmp21 * 4; + tmp30 = tmp30 * 4; + tmp31 = tmp31 * 4; + + tmp[0][m][0] = tmp00; + tmp[0][m][1] = tmp01; + tmp[1][m][0] = tmp10; + tmp[1][m][1] = tmp11; + tmp[2][m][0] = tmp20; + tmp[2][m][1] = tmp21; + tmp[3][m][0] = tmp30; + tmp[3][m][1] = tmp31; + + r0 += max_jj * 6 * 2; + r1 += max_jj * 6 * 2; + r2 += max_jj * 6 * 2; + r3 += max_jj * 6 * 2; + r4 += max_jj * 6 * 2; + r5 += max_jj * 6 * 2; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a0 = tmp[m][1][0] + tmp[m][2][0]; + int tmp02a1 = tmp[m][1][1] + tmp[m][2][1]; + int tmp02b0 = tmp[m][3][0] + tmp[m][4][0]; + int tmp02b1 = tmp[m][3][1] + tmp[m][4][1]; + int tmp13a0 = tmp[m][1][0] - tmp[m][2][0]; + int tmp13a1 = tmp[m][1][1] - tmp[m][2][1]; + int tmp13b0 = tmp[m][3][0] - tmp[m][4][0]; + int tmp13b1 = tmp[m][3][1] - tmp[m][4][1]; + + int tmp00 = tmp02a0 + tmp02b0 + tmp[m][0][0]; + int tmp01 = 
tmp02a1 + tmp02b1 + tmp[m][0][1]; + int tmp10 = tmp13a0 + tmp13b0 * 2; + int tmp11 = tmp13a1 + tmp13b1 * 2; + int tmp20 = tmp02a0 + tmp02b0 * 4; + int tmp21 = tmp02a1 + tmp02b1 * 4; + int tmp30 = tmp13a0 + tmp13b0 * 8 + tmp[m][5][0]; + int tmp31 = tmp13a1 + tmp13b1 * 8 + tmp[m][5][1]; + + tmp00 = tmp00 / 576; + tmp01 = tmp01 / 576; + tmp10 = tmp10 / 576; + tmp11 = tmp11 / 576; + tmp20 = tmp20 / 576; + tmp21 = tmp21 / 576; + tmp30 = tmp30 / 576; + tmp31 = tmp31 / 576; + + // if (out_elempack == 1) + { + int* outptr1 = outptr0 + N; + + outptr0[0] = tmp00; + outptr1[0] = tmp01; + if (tj * 4 + 1 < outw) + { + outptr0[1] = tmp10; + outptr1[1] = tmp11; + } + if (tj * 4 + 2 < outw) + { + outptr0[2] = tmp20; + outptr1[2] = tmp21; + } + if (tj * 4 + 3 < outw) + { + outptr0[3] = tmp30; + outptr1[3] = tmp31; + } + } + + outptr0 += outw; + } + } + } + for (; ii < max_ii; ii++) + { + int tmp[4][6]; + + int jj = 0; + for (; jj < max_jj; jj++) + { + int ti = (j + jj) / w_tiles; + int tj = (j + jj) % w_tiles; + + const int* r0 = (const int*)top_tile + ii * max_jj * 36 + jj; + const int* r1 = r0 + max_jj; + const int* r2 = r0 + max_jj * 2; + const int* r3 = r0 + max_jj * 3; + const int* r4 = r0 + max_jj * 4; + const int* r5 = r0 + max_jj * 5; + + for (int m = 0; m < 5; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + for (int m = 5; m < 6; m++) + { + int tmp02a = r1[0] + r2[0]; + int tmp02b = r3[0] + r4[0]; + int tmp13a = r1[0] - r2[0]; + int tmp13b = r3[0] - r4[0]; + + int tmp0 = tmp02a + tmp02b + r0[0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + r5[0] * 4; + + tmp0 = tmp0 * 4; + tmp1 = tmp1 * 4; + tmp2 = tmp2 * 4; + tmp3 = tmp3 * 4; + + tmp[0][m] = tmp0; + tmp[1][m] = tmp1; + tmp[2][m] = tmp2; + tmp[3][m] = tmp3; + + r0 += max_jj * 6; + r1 += max_jj * 6; + r2 += max_jj * 6; + r3 += max_jj * 6; + r4 += max_jj * 6; + r5 += max_jj * 6; + } + + int* outptr0 = top_blob.channel(i + ii).row(ti * 4) + (tj * 4); + + for (int m = 0; m < 4; m++) + { + if (ti * 4 + m >= outh) + continue; + + int tmp02a = tmp[m][1] + tmp[m][2]; + int tmp02b = tmp[m][3] + tmp[m][4]; + int tmp13a = tmp[m][1] - tmp[m][2]; + int tmp13b = tmp[m][3] - tmp[m][4]; + + int tmp0 = tmp02a + tmp02b + tmp[m][0]; + int tmp1 = tmp13a + tmp13b * 2; + int tmp2 = tmp02a + tmp02b * 4; + int tmp3 = tmp13a + tmp13b * 8 + tmp[m][5]; + + tmp0 = tmp0 / 576; + tmp1 = tmp1 / 576; + tmp2 = tmp2 / 576; + tmp3 = tmp3 / 576; + + // if (out_elempack == 1) + { + outptr0[0] = tmp0; + if (tj * 4 + 1 < outw) outptr0[1] = tmp1; + if (tj * 4 + 2 < outw) outptr0[2] = tmp2; + if (tj * 4 + 3 < outw) outptr0[3] = tmp3; + } + + outptr0 += outw; + } + } + } +} + +static void conv3x3s1_winograd43_int8(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) +{ +#if !(__AVX512VNNI__ || __AVXVNNI__ || __AVX2__ || __XOP__) +#if NCNN_RUNTIME_CPU && NCNN_AVX512VNNI && __AVX512F__ && !__AVX512VNNI__ + if (ncnn::cpu_support_x86_avx512_vnni()) + { + conv3x3s1_winograd43_int8_avx512vnni(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU 
&& NCNN_AVXVNNI && __AVX2__ && !__AVXVNNI__ + if (ncnn::cpu_support_x86_avx_vnni()) + { + conv3x3s1_winograd43_int8_avxvnni(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_AVX2 && __AVX__ && !__AVX2__ + if (ncnn::cpu_support_x86_avx2()) + { + conv3x3s1_winograd43_int8_avx2(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif + +#if NCNN_RUNTIME_CPU && NCNN_XOP && __SSE2__ && !__XOP__ + if (ncnn::cpu_support_x86_xop()) + { + conv3x3s1_winograd43_int8_xop(bottom_blob, top_blob, AT, nT, opt); + return; + } +#endif +#endif + + int outw = top_blob.w; + int outh = top_blob.h; + + // pad to 4n+2, winograd F(4,3) + int w_tiles = (outw + 3) / 4; + int h_tiles = (outh + 3) / 4; + int tiles = w_tiles * h_tiles; + + const int M = top_blob.c * top_blob.elempack; + const int N = tiles; + const int K = bottom_blob.c * bottom_blob.elempack; + const int B = 36; + + // NCNN_LOGE("conv3x3s1_winograd43_int8 %d %d %d", M, N, K); + + int TILE_M, TILE_N, TILE_K; + get_optimal_tile_mnk_int8(M, N, K, TILE_M, TILE_N, TILE_K, nT); + + const int nn_M = (M + TILE_M - 1) / TILE_M; + const int nn_N = (N + TILE_N - 1) / TILE_N; + const int nn_K = (K + TILE_K - 1) / TILE_K; + + // NCNN_LOGE("TILE M/N/K = %d %d %d -> %d %d %d", M, N, K, TILE_M, TILE_N, TILE_K); + + Mat BT(TILE_K * TILE_N, B, (K + TILE_K - 1) / TILE_K, (N + TILE_N - 1) / TILE_N, 4u, opt.workspace_allocator); + + const int nn_NK = nn_N * nn_K; + + if (nT > 1 && nn_NK < nT) + { + Mat B_tile(TILE_N * B * TILE_K, 4u, opt.workspace_allocator); + + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, nT); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, nT); + } + } + else + { + Mat B_tileX(TILE_N * B * TILE_K, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppjk = 0; ppjk < nn_NK; ppjk++) + { + const int ppj = ppjk / nn_K; + const int ppk = ppjk % nn_K; + + const int j = ppj * TILE_N; + const int k = ppk * TILE_K; + + const int max_jj = std::min((N - j), TILE_N); + const int max_kk = std::min((K - k), TILE_K); + + Mat B_tile = B_tileX.channel(get_omp_thread_num()); + + // transform input + conv3x3s1_winograd43_transform_input_tile_int8(bottom_blob, B_tile, j, max_jj, k, max_kk, 1); + + Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + transpose_pack_B_tile_int8(B_tile, BT_tile, B, max_jj, max_kk, 1); + } + } + + Mat top_tileX(TILE_N * B * TILE_M, 1, nT, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(nT) + for (int ppj = 0; ppj < nn_M; ppj++) + { + const int i = ppj * TILE_M; + + Mat top_tile = top_tileX.channel(get_omp_thread_num()); + + const int max_ii = std::min((M - i), TILE_M); + + for (int j = 0; j < N; j += TILE_N) + { + const int max_jj = std::min((N - j), TILE_N); + + for (int k = 0; k < K; k += TILE_K) + { + const int max_kk = std::min((K - k), TILE_K); + + const Mat AT_tile = AT.channel(i / TILE_M).depth(k / TILE_K); + + const Mat BT_tile = BT.channel(j / TILE_N).depth(k / TILE_K); + + bool k_end = k + TILE_K >= K; + + gemm_transB_packed_tile_int8(AT_tile, BT_tile, top_tile, B, max_ii, max_jj, k, max_kk, k_end); + } + + // 
transform output + conv3x3s1_winograd43_transform_output_tile_int8(top_tile, top_blob, i, max_ii, j, max_jj); + } + } +} diff --git a/src/layer/x86/convolution_im2col_gemm_int8.h b/src/layer/x86/convolution_im2col_gemm_int8.h index 56a4aa4763af..da504677a68f 100644 --- a/src/layer/x86/convolution_im2col_gemm_int8.h +++ b/src/layer/x86/convolution_im2col_gemm_int8.h @@ -934,7 +934,6 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M _sumf = _mm512_shuffle_epi32(_sumf, _MM_PERM_CBAD); } - // TODO __m512i _tmp0 = _mm512_shuffle_i32x4(_sum0, _sumc, _MM_SHUFFLE(2, 0, 2, 0)); __m512i _tmp1 = _mm512_shuffle_i32x4(_sum4, _sum0, _MM_SHUFFLE(3, 1, 3, 1)); __m512i _tmp2 = _mm512_shuffle_i32x4(_sum8, _sum4, _MM_SHUFFLE(0, 2, 0, 2)); @@ -2547,12 +2546,12 @@ static void convolution_gemm_transB_packed_tile_int8(const Mat& AT_tile, const M __m512i _tmp7 = _mm512_shuffle_i32x4(_sum3, _sum7, _MM_SHUFFLE(3, 2, 3, 2)); _sum0 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(2, 0, 2, 0)); _sum1 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(2, 0, 2, 0)); - _sum2 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(3, 1, 3, 1)); - _sum3 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(3, 1, 3, 1)); + _sum2 = _mm512_shuffle_i32x4(_tmp0, _tmp2, _MM_SHUFFLE(1, 3, 1, 3)); + _sum3 = _mm512_shuffle_i32x4(_tmp4, _tmp6, _MM_SHUFFLE(1, 3, 1, 3)); _sum4 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(2, 0, 2, 0)); _sum5 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(2, 0, 2, 0)); - _sum6 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(3, 1, 3, 1)); - _sum7 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(3, 1, 3, 1)); + _sum6 = _mm512_shuffle_i32x4(_tmp1, _tmp3, _MM_SHUFFLE(1, 3, 1, 3)); + _sum7 = _mm512_shuffle_i32x4(_tmp5, _tmp7, _MM_SHUFFLE(1, 3, 1, 3)); _mm512_storeu_si512((__m512i*)outptr0, _sum0); _mm512_storeu_si512((__m512i*)(outptr0 + 16), _sum1); @@ -6142,14 +6141,13 @@ static void convolution_im2col_input_tile_conv1x1s1d1_int8(const Mat& bottom_blo } } -static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +template +#if __AVX512F__ +void convolution_im2col_input_tile_int8_avx512(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +#else // __AVX512F__ +void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk) +#endif // __AVX512F__ { - if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { - convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); - return; - } - const int w = bottom_blob.w; // const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; @@ -6206,288 +6204,468 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dxe = (j + jj + 14) % outw; int dxf = (j + jj + 15) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dyf) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - 
int x02 = stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int x04 = stride_w * dx4 + dilation_w * v0; - int x05 = stride_w * dx5 + dilation_w * v0; - int x06 = stride_w * dx6 + dilation_w * v0; - int x07 = stride_w * dx7 + dilation_w * v0; - int x08 = stride_w * dx8 + dilation_w * v0; - int x09 = stride_w * dx9 + dilation_w * v0; - int x0a = stride_w * dxa + dilation_w * v0; - int x0b = stride_w * dxb + dilation_w * v0; - int x0c = stride_w * dxc + dilation_w * v0; - int x0d = stride_w * dxd + dilation_w * v0; - int x0e = stride_w * dxe + dilation_w * v0; - int x0f = stride_w * dxf + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - int y04 = stride_h * dy4 + dilation_h * u0; - int y05 = stride_h * dy5 + dilation_h * u0; - int y06 = stride_h * dy6 + dilation_h * u0; - int y07 = stride_h * dy7 + dilation_h * u0; - int y08 = stride_h * dy8 + dilation_h * u0; - int y09 = stride_h * dy9 + dilation_h * u0; - int y0a = stride_h * dya + dilation_h * u0; - int y0b = stride_h * dyb + dilation_h * u0; - int y0c = stride_h * dyc + dilation_h * u0; - int y0d = stride_h * dyd + dilation_h * u0; - int y0e = stride_h * dye + dilation_h * u0; - int y0f = stride_h * dyf + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int x14 = stride_w * dx4 + dilation_w * v1; - int x15 = stride_w * dx5 + dilation_w * v1; - int x16 = stride_w * dx6 + dilation_w * v1; - int x17 = stride_w * dx7 + dilation_w * v1; - int x18 = stride_w * dx8 + dilation_w * v1; - int x19 = stride_w * dx9 + dilation_w * v1; - int x1a = stride_w * dxa + dilation_w * v1; - int x1b = stride_w * dxb + dilation_w * v1; - int x1c = stride_w * dxc + dilation_w * v1; - int x1d = stride_w * dxd + dilation_w * v1; - int x1e = stride_w * dxe + dilation_w * v1; - int x1f = stride_w * dxf + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - int y14 = stride_h * dy4 + dilation_h * u1; - int y15 = stride_h * dy5 + dilation_h * u1; - int y16 = stride_h * dy6 + dilation_h * u1; - int y17 = stride_h * dy7 + dilation_h * u1; - int y18 = stride_h * dy8 + dilation_h * u1; - int y19 = stride_h * dy9 + dilation_h * u1; - int y1a = stride_h * dya + dilation_h * u1; - int y1b = stride_h * dyb + dilation_h * u1; - int y1c = stride_h * dyc + dilation_h * u1; - int y1d = stride_h * dyd + dilation_h * u1; - int y1e = stride_h * dye + dilation_h * u1; - int y1f = stride_h * dyf + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - const signed char* sptr04 = img0.row(y04) + x04; - const signed char* sptr05 = img0.row(y05) + x05; - const signed char* sptr06 = img0.row(y06) + x06; - const signed char* sptr07 = img0.row(y07) + x07; - const signed char* sptr08 = img0.row(y08) + x08; - const signed char* sptr09 = img0.row(y09) + x09; - const signed char* sptr0a = img0.row(y0a) + x0a; - const signed char* sptr0b = img0.row(y0b) + x0b; - const signed char* sptr0c = img0.row(y0c) + x0c; - const signed char* sptr0d = 
img0.row(y0d) + x0d; - const signed char* sptr0e = img0.row(y0e) + x0e; - const signed char* sptr0f = img0.row(y0f) + x0f; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - const signed char* sptr14 = img1.row(y14) + x14; - const signed char* sptr15 = img1.row(y15) + x15; - const signed char* sptr16 = img1.row(y16) + x16; - const signed char* sptr17 = img1.row(y17) + x17; - const signed char* sptr18 = img1.row(y18) + x18; - const signed char* sptr19 = img1.row(y19) + x19; - const signed char* sptr1a = img1.row(y1a) + x1a; - const signed char* sptr1b = img1.row(y1b) + x1b; - const signed char* sptr1c = img1.row(y1c) + x1c; - const signed char* sptr1d = img1.row(y1d) + x1d; - const signed char* sptr1e = img1.row(y1e) + x1e; - const signed char* sptr1f = img1.row(y1f) + x1f; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp[8] = sptr04[0]; - pp[9] = sptr14[0]; - pp[10] = sptr05[0]; - pp[11] = sptr15[0]; - pp[12] = sptr06[0]; - pp[13] = sptr16[0]; - pp[14] = sptr07[0]; - pp[15] = sptr17[0]; - pp[16 + 0] = sptr08[0]; - pp[16 + 1] = sptr18[0]; - pp[16 + 2] = sptr09[0]; - pp[16 + 3] = sptr19[0]; - pp[16 + 4] = sptr0a[0]; - pp[16 + 5] = sptr1a[0]; - pp[16 + 6] = sptr0b[0]; - pp[16 + 7] = sptr1b[0]; - pp[16 + 8] = sptr0c[0]; - pp[16 + 9] = sptr1c[0]; - pp[16 + 10] = sptr0d[0]; - pp[16 + 11] = sptr1d[0]; - pp[16 + 12] = sptr0e[0]; - pp[16 + 13] = sptr1e[0]; - pp[16 + 14] = sptr0f[0]; - pp[16 + 15] = sptr1f[0]; - pp += 32; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _tmp0); + _mm_store_si128((__m128i*)(pp + 16), _tmp1); + pp += 32; + } + else if (stride_w == 2) + { + __m256i _r0 = _mm256_loadu_si256((const __m256i*)sptr0); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)sptr1); + __m256i _tmp0 = _mm256_unpacklo_epi8(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm256_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i _r01 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _mm256_storeu_si256((__m256i*)pp, _r01); + pp += 32; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 
2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp[16 + 0] = sptr0[stride_w * 8]; + pp[16 + 1] = sptr1[stride_w * 8]; + pp[16 + 2] = sptr0[stride_w * 9]; + pp[16 + 3] = sptr1[stride_w * 9]; + pp[16 + 4] = sptr0[stride_w * 10]; + pp[16 + 5] = sptr1[stride_w * 10]; + pp[16 + 6] = sptr0[stride_w * 11]; + pp[16 + 7] = sptr1[stride_w * 11]; + pp[16 + 8] = sptr0[stride_w * 12]; + pp[16 + 9] = sptr1[stride_w * 12]; + pp[16 + 10] = sptr0[stride_w * 13]; + pp[16 + 11] = sptr1[stride_w * 13]; + pp[16 + 12] = sptr0[stride_w * 14]; + pp[16 + 13] = sptr1[stride_w * 14]; + pp[16 + 14] = sptr0[stride_w * 15]; + pp[16 + 15] = sptr1[stride_w * 15]; + pp += 32; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int x4 = stride_w * dx4 + dilation_w * v; - int x5 = stride_w * dx5 + dilation_w * v; - int x6 = stride_w * dx6 + dilation_w * v; - int x7 = stride_w * dx7 + dilation_w * v; - int x8 = stride_w * dx8 + dilation_w * v; - int x9 = stride_w * dx9 + dilation_w * v; - int xa = stride_w * dxa + dilation_w * v; - int xb = stride_w * dxb + dilation_w * v; - int xc = stride_w * dxc + dilation_w * v; - int xd = stride_w * dxd + dilation_w * v; - int xe = stride_w * dxe + dilation_w * v; - int xf = stride_w * dxf + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = stride_h * dy3 + dilation_h * u; - int y4 = stride_h * dy4 + dilation_h * u; - int y5 = stride_h * dy5 + dilation_h * u; - int y6 = stride_h * dy6 + dilation_h * u; - int y7 = stride_h * dy7 + dilation_h * u; - int y8 = stride_h * dy8 + dilation_h * u; - int y9 = stride_h * dy9 + dilation_h * u; - int ya = stride_h * dya + dilation_h * u; - int yb = stride_h * dyb + dilation_h * u; - int yc = stride_h * dyc + dilation_h * u; - int yd = stride_h * dyd + dilation_h * u; - int ye = stride_h * dye + dilation_h * u; - int yf = stride_h * dyf + dilation_h * u; - - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; - const signed char* sptr4 = img.row(y4) + x4 * elempack; - const signed char* sptr5 = img.row(y5) + x5 * elempack; - const signed char* sptr6 = img.row(y6) + x6 * elempack; - const signed char* sptr7 = img.row(y7) + x7 * elempack; - const signed char* sptr8 = img.row(y8) + x8 * elempack; - const signed char* sptr9 = img.row(y9) + x9 * elempack; - const signed char* sptra = img.row(ya) + xa * elempack; - const signed char* sptrb = img.row(yb) + xb * elempack; - const signed char* sptrc = img.row(yc) + xc * 
elempack; - const signed char* sptrd = img.row(yd) + xd * elempack; - const signed char* sptre = img.row(ye) + xe * elempack; - const signed char* sptrf = img.row(yf) + xf * elempack; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); - __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); - __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); - __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); - __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); - __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); - __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); - __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); - __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); - __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); - __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); - __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); - __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); - __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); - __m128i _ref = _mm_unpacklo_epi16(_re, _rf); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 = _mm_unpackhi_epi32(_r01, _r23); - _r2 = _mm_unpacklo_epi32(_r45, _r67); - _r3 = _mm_unpackhi_epi32(_r45, _r67); - _r4 = _mm_unpacklo_epi32(_r89, _rab); - _r5 = _mm_unpackhi_epi32(_r89, _rab); - _r6 = _mm_unpacklo_epi32(_rcd, _ref); - _r7 = _mm_unpackhi_epi32(_rcd, _ref); - _r8 = _mm_unpacklo_epi64(_r0, _r2); - _r9 = _mm_unpacklo_epi64(_r4, _r6); - _ra = _mm_unpackhi_epi64(_r0, _r2); - _rb = _mm_unpackhi_epi64(_r4, _r6); - _rc = _mm_unpacklo_epi64(_r1, _r3); - _rd = _mm_unpacklo_epi64(_r5, _r7); - _re = _mm_unpackhi_epi64(_r1, _r3); - _rf = _mm_unpackhi_epi64(_r5, _r7); - _mm_storeu_si128((__m128i*)pp, _r8); - _mm_storeu_si128((__m128i*)(pp + 16), _r9); - _mm_storeu_si128((__m128i*)(pp + 32), _ra); - _mm_storeu_si128((__m128i*)(pp + 48), _rb); - _mm_storeu_si128((__m128i*)(pp + 64), _rc); - _mm_storeu_si128((__m128i*)(pp + 80), _rd); - _mm_storeu_si128((__m128i*)(pp + 96), _re); - _mm_storeu_si128((__m128i*)(pp + 112), _rf); - pp += 128; + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 64)); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 72)); + __m128i _ra = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 80)); + __m128i _rb = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 88)); + __m128i _rc = _mm_loadl_epi64((const 
__m128i*)(sptr + stride_w * 96)); + __m128i _rd = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 104)); + __m128i _re = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 112)); + __m128i _rf = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 120)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_store_si128((__m128i*)pp, _r8); + _mm_store_si128((__m128i*)(pp + 16), _r9); + _mm_store_si128((__m128i*)(pp + 32), _ra); + _mm_store_si128((__m128i*)(pp + 48), _rb); + _mm_store_si128((__m128i*)(pp + 64), _rc); + _mm_store_si128((__m128i*)(pp + 80), _rd); + _mm_store_si128((__m128i*)(pp + 96), _re); + _mm_store_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp[8] = sptr[stride_w * 8]; + pp[9] = sptr[stride_w * 9]; + pp[10] = sptr[stride_w * 10]; + pp[11] = sptr[stride_w * 11]; + pp[12] = sptr[stride_w * 12]; + pp[13] = sptr[stride_w * 13]; + pp[14] = sptr[stride_w * 14]; + pp[15] = sptr[stride_w * 15]; + pp += 16; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp[4] = sptr4[0]; - pp[5] = sptr5[0]; - pp[6] = sptr6[0]; - pp[7] = sptr7[0]; - pp[8] = sptr8[0]; - pp[9] = sptr9[0]; - pp[10] = sptra[0]; - pp[11] = sptrb[0]; - pp[12] = sptrc[0]; - pp[13] = sptrd[0]; - pp[14] = sptre[0]; - pp[15] = sptrf[0]; - pp += 16; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int x08 = stride_w * dx8 + dilation_w * v0; + int x09 = stride_w * dx9 + dilation_w * v0; + int x0a = stride_w * dxa + dilation_w * v0; + int x0b = stride_w * dxb + dilation_w * v0; + int x0c = stride_w * dxc + dilation_w * v0; + int x0d = stride_w * dxd + dilation_w * v0; + int x0e = 
stride_w * dxe + dilation_w * v0; + int x0f = stride_w * dxf + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + int y08 = stride_h * dy8 + dilation_h * u0; + int y09 = stride_h * dy9 + dilation_h * u0; + int y0a = stride_h * dya + dilation_h * u0; + int y0b = stride_h * dyb + dilation_h * u0; + int y0c = stride_h * dyc + dilation_h * u0; + int y0d = stride_h * dyd + dilation_h * u0; + int y0e = stride_h * dye + dilation_h * u0; + int y0f = stride_h * dyf + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int x18 = stride_w * dx8 + dilation_w * v1; + int x19 = stride_w * dx9 + dilation_w * v1; + int x1a = stride_w * dxa + dilation_w * v1; + int x1b = stride_w * dxb + dilation_w * v1; + int x1c = stride_w * dxc + dilation_w * v1; + int x1d = stride_w * dxd + dilation_w * v1; + int x1e = stride_w * dxe + dilation_w * v1; + int x1f = stride_w * dxf + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + int y18 = stride_h * dy8 + dilation_h * u1; + int y19 = stride_h * dy9 + dilation_h * u1; + int y1a = stride_h * dya + dilation_h * u1; + int y1b = stride_h * dyb + dilation_h * u1; + int y1c = stride_h * dyc + dilation_h * u1; + int y1d = stride_h * dyd + dilation_h * u1; + int y1e = stride_h * dye + dilation_h * u1; + int y1f = stride_h * dyf + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + const signed char* sptr08 = img0.row(y08) + x08; + const signed char* sptr09 = img0.row(y09) + x09; + const signed char* sptr0a = img0.row(y0a) + x0a; + const signed char* sptr0b = img0.row(y0b) + x0b; + const signed char* sptr0c = img0.row(y0c) + x0c; + const signed char* sptr0d = img0.row(y0d) + x0d; + const signed char* sptr0e = img0.row(y0e) + x0e; + const signed char* sptr0f = img0.row(y0f) + x0f; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + const signed char* sptr18 = 
img1.row(y18) + x18; + const signed char* sptr19 = img1.row(y19) + x19; + const signed char* sptr1a = img1.row(y1a) + x1a; + const signed char* sptr1b = img1.row(y1b) + x1b; + const signed char* sptr1c = img1.row(y1c) + x1c; + const signed char* sptr1d = img1.row(y1d) + x1d; + const signed char* sptr1e = img1.row(y1e) + x1e; + const signed char* sptr1f = img1.row(y1f) + x1f; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp[16 + 0] = sptr08[0]; + pp[16 + 1] = sptr18[0]; + pp[16 + 2] = sptr09[0]; + pp[16 + 3] = sptr19[0]; + pp[16 + 4] = sptr0a[0]; + pp[16 + 5] = sptr1a[0]; + pp[16 + 6] = sptr0b[0]; + pp[16 + 7] = sptr1b[0]; + pp[16 + 8] = sptr0c[0]; + pp[16 + 9] = sptr1c[0]; + pp[16 + 10] = sptr0d[0]; + pp[16 + 11] = sptr1d[0]; + pp[16 + 12] = sptr0e[0]; + pp[16 + 13] = sptr1e[0]; + pp[16 + 14] = sptr0f[0]; + pp[16 + 15] = sptr1f[0]; + pp += 32; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int x8 = stride_w * dx8 + dilation_w * v; + int x9 = stride_w * dx9 + dilation_w * v; + int xa = stride_w * dxa + dilation_w * v; + int xb = stride_w * dxb + dilation_w * v; + int xc = stride_w * dxc + dilation_w * v; + int xd = stride_w * dxd + dilation_w * v; + int xe = stride_w * dxe + dilation_w * v; + int xf = stride_w * dxf + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + int y8 = stride_h * dy8 + dilation_h * u; + int y9 = stride_h * dy9 + dilation_h * u; + int ya = stride_h * dya + dilation_h * u; + int yb = stride_h * dyb + dilation_h * u; + int yc = stride_h * dyc + dilation_h * u; + int yd = stride_h * dyd + dilation_h * u; + int ye = stride_h * dye + dilation_h * u; + int yf = stride_h * dyf + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + const signed char* sptr8 = img.row(y8) + x8 * elempack; + const signed char* sptr9 = img.row(y9) + x9 * elempack; + const signed char* sptra = img.row(ya) + xa * elempack; + const signed char* sptrb = img.row(yb) + xb * elempack; + const signed char* sptrc = img.row(yc) + xc * elempack; + const 
signed char* sptrd = img.row(yd) + xd * elempack; + const signed char* sptre = img.row(ye) + xe * elempack; + const signed char* sptrf = img.row(yf) + xf * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); + __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); + __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); + __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); + __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); + __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); + __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_store_si128((__m128i*)pp, _r8); + _mm_store_si128((__m128i*)(pp + 16), _r9); + _mm_store_si128((__m128i*)(pp + 32), _ra); + _mm_store_si128((__m128i*)(pp + 48), _rb); + _mm_store_si128((__m128i*)(pp + 64), _rc); + _mm_store_si128((__m128i*)(pp + 80), _rd); + _mm_store_si128((__m128i*)(pp + 96), _re); + _mm_store_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp[8] = sptr8[0]; + pp[9] = sptr9[0]; + pp[10] = sptra[0]; + pp[11] = sptrb[0]; + pp[12] = sptrc[0]; + pp[13] = sptrd[0]; + pp[14] = sptre[0]; + pp[15] = sptrf[0]; + pp += 16; + } } } } @@ -6511,168 +6689,298 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx6 = (j + jj + 6) % outw; int dx7 = (j + jj + 7) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy7) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int x02 = 
stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int x04 = stride_w * dx4 + dilation_w * v0; - int x05 = stride_w * dx5 + dilation_w * v0; - int x06 = stride_w * dx6 + dilation_w * v0; - int x07 = stride_w * dx7 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - int y04 = stride_h * dy4 + dilation_h * u0; - int y05 = stride_h * dy5 + dilation_h * u0; - int y06 = stride_h * dy6 + dilation_h * u0; - int y07 = stride_h * dy7 + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int x14 = stride_w * dx4 + dilation_w * v1; - int x15 = stride_w * dx5 + dilation_w * v1; - int x16 = stride_w * dx6 + dilation_w * v1; - int x17 = stride_w * dx7 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - int y14 = stride_h * dy4 + dilation_h * u1; - int y15 = stride_h * dy5 + dilation_h * u1; - int y16 = stride_h * dy6 + dilation_h * u1; - int y17 = stride_h * dy7 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - const signed char* sptr04 = img0.row(y04) + x04; - const signed char* sptr05 = img0.row(y05) + x05; - const signed char* sptr06 = img0.row(y06) + x06; - const signed char* sptr07 = img0.row(y07) + x07; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - const signed char* sptr14 = img1.row(y14) + x14; - const signed char* sptr15 = img1.row(y15) + x15; - const signed char* sptr16 = img1.row(y16) + x16; - const signed char* sptr17 = img1.row(y17) + x17; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp[8] = sptr04[0]; - pp[9] = sptr14[0]; - pp[10] = sptr05[0]; - pp[11] = sptr15[0]; - pp[12] = sptr06[0]; - pp[13] = sptr16[0]; - pp[14] = sptr07[0]; - pp[15] = sptr17[0]; - pp += 16; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = 
_mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0))); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp += 16; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int x4 = stride_w * dx4 + dilation_w * v; - int x5 = stride_w * dx5 + dilation_w * v; - int x6 = stride_w * dx6 + dilation_w * v; - int x7 = stride_w * dx7 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = stride_h * dy3 + dilation_h * u; - int y4 = stride_h * dy4 + dilation_h * u; - int y5 = stride_h * dy5 + dilation_h * u; - int y6 = stride_h * dy6 + dilation_h * u; - int y7 = stride_h * dy7 + dilation_h * u; - - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; - const signed char* sptr4 = img.row(y4) + x4 * elempack; - const signed char* sptr5 = img.row(y5) + x5 * elempack; - const signed char* sptr6 = img.row(y6) + x6 * elempack; - const signed char* sptr7 = img.row(y7) + x7 * elempack; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); - __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); - __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); - __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 
= _mm_unpackhi_epi32(_r01, _r23); - _r2 = _mm_unpacklo_epi32(_r45, _r67); - _r3 = _mm_unpackhi_epi32(_r45, _r67); - _r4 = _mm_unpacklo_epi64(_r0, _r2); - _r5 = _mm_unpackhi_epi64(_r0, _r2); - _r6 = _mm_unpacklo_epi64(_r1, _r3); - _r7 = _mm_unpackhi_epi64(_r1, _r3); - _mm_storeu_si128((__m128i*)pp, _r4); - _mm_storeu_si128((__m128i*)(pp + 16), _r5); - _mm_storeu_si128((__m128i*)(pp + 32), _r6); - _mm_storeu_si128((__m128i*)(pp + 48), _r7); - pp += 64; + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp += 8; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp[4] = sptr4[0]; - pp[5] = sptr5[0]; - pp[6] = sptr6[0]; - pp[7] = sptr7[0]; - pp += 8; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + 
dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp += 16; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const 
__m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp += 8; + } } } } @@ -6688,106 +6996,206 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx2 = (j + jj + 2) % outw; int dx3 = (j + jj + 3) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy3) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; - int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int x02 = stride_w * dx2 + dilation_w * v0; - int x03 = stride_w * dx3 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int y02 = stride_h * dy2 + dilation_h * u0; - int y03 = stride_h * dy3 + dilation_h * u0; - - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int x12 = stride_w * dx2 + dilation_w * v1; - int x13 = stride_w * dx3 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - int y12 = stride_h * dy2 + dilation_h * u1; - int y13 = stride_h * dy3 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr02 = img0.row(y02) + x02; - const signed char* sptr03 = img0.row(y03) + x03; - - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - const signed char* sptr12 = img1.row(y12) + x12; - const signed char* sptr13 = img1.row(y13) + x13; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp[4] = sptr02[0]; - pp[5] = sptr12[0]; - pp[6] = sptr03[0]; - pp[7] = sptr13[0]; - pp += 8; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = 
img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _r01 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_r01, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _r01 = _mm_shuffle_epi32(_r01, _MM_SHUFFLE(3, 1, 2, 0)); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp += 8; + } + } } - } - for (; kk < max_kk / elempack; kk++) - { - int p = (k / elempack + kk) / maxk; - int uv = (k / elempack + kk) % maxk; - int u = uv / kernel_w; - int v = uv % kernel_w; + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; - const Mat img = bottom_blob.channel(p); + const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int x2 = stride_w * dx2 + dilation_w * v; - int x3 = stride_w * dx3 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; - int y2 = stride_h * dy2 + dilation_h * u; - int y3 = stride_h * dy3 + dilation_h * u; + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; - const signed char* sptr2 = img.row(y2) + x2 * elempack; - const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr = img.row(y0) + x0 * elempack; - if (elempack == 8) - { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); - __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); - _r0 = _mm_unpacklo_epi32(_r01, _r23); - _r1 = _mm_unpackhi_epi32(_r01, _r23); - _mm_storeu_si128((__m128i*)pp, _r0); - _mm_storeu_si128((__m128i*)(pp + 16), _r1); - pp += 32; + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp += 4; + } } + } + else + { + int kk = 0; if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp[2] = sptr2[0]; - pp[3] = sptr3[0]; - pp += 4; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / 
maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp += 8; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp += 4; + } } } } @@ -6799,44 +7207,154 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i int dx0 = (j + jj) % outw; int dx1 = (j + jj + 1) % outw; - int kk = 0; - if (elempack == 1) + if (dy0 == dy1) { - for (; kk + 1 < max_kk; kk += 2) + int kk = 0; + if (elempack == 1) { - int p0 = (k + kk) / maxk; - int p1 = (k + kk + 1) / maxk; - int uv0 = (k + kk) % maxk; - int uv1 = (k + kk + 1) % maxk; - int u0 = uv0 / kernel_w; - int u1 = uv1 / kernel_w; - int v0 = uv0 % kernel_w; 
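// Editorial note (not part of the patch): the rewritten input-tile loaders above and below
// special-case the situation where every output column of the current micro-tile lies on the
// same output row (dy0 == dyf / dy0 == dy7 / dy0 == dy3 / dy0 == dy1). In that case the input
// row y is identical for all columns, and consecutive columns differ only by stride_w in x, so
// the gather collapses to strided reads from a single row pointer (sptr[0], sptr[stride_w],
// sptr[stride_w * 2], ...), with SSE/AVX deinterleave shuffles for stride_w == 1 and 2.
// The general path that follows keeps the original per-column row/column computation for
// tiles that straddle an output-row boundary.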
- int v1 = uv1 % kernel_w; - - const Mat img0 = bottom_blob.channel(p0); - const Mat img1 = bottom_blob.channel(p1); - - int x00 = stride_w * dx0 + dilation_w * v0; - int x01 = stride_w * dx1 + dilation_w * v0; - int y00 = stride_h * dy0 + dilation_h * u0; - int y01 = stride_h * dy1 + dilation_h * u0; - int x10 = stride_w * dx0 + dilation_w * v1; - int x11 = stride_w * dx1 + dilation_w * v1; - int y10 = stride_h * dy0 + dilation_h * u1; - int y11 = stride_h * dy1 + dilation_h * u1; - - const signed char* sptr00 = img0.row(y00) + x00; - const signed char* sptr01 = img0.row(y01) + x01; - const signed char* sptr10 = img1.row(y10) + x10; - const signed char* sptr11 = img1.row(y11) + x11; - - pp[0] = sptr00[0]; - pp[1] = sptr10[0]; - pp[2] = sptr01[0]; - pp[3] = sptr11[0]; - pp += 4; + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % 
kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } } } + } + for (; jj < max_jj; jj++) + { + int dy = (j + jj) / outw; + int dx = (j + jj) % outw; + + int kk = 0; for (; kk < max_kk / elempack; kk++) { int p = (k / elempack + kk) / maxk; @@ -6846,29 +7364,1309 @@ static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, i const Mat img = bottom_blob.channel(p); - int x0 = stride_w * dx0 + dilation_w * v; - int x1 = stride_w * dx1 + dilation_w * v; - int y0 = stride_h * dy0 + dilation_h * u; - int y1 = stride_h * dy1 + dilation_h * u; + int x = stride_w * dx + dilation_w * v; + int y = stride_h * dy + dilation_h * u; - const signed char* sptr0 = img.row(y0) + x0 * elempack; - const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr = img.row(y) + x * elempack; #if __SSE2__ if (elempack == 8) { - __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); - __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); - __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); - _mm_storeu_si128((__m128i*)pp, _r01); - pp += 16; + _mm_storel_epi64((__m128i*)pp, _mm_loadl_epi64((const __m128i*)sptr)); + pp += 8; } #endif // __SSE2__ if (elempack == 1) { - pp[0] = sptr0[0]; - pp[1] = sptr1[0]; - pp += 2; + pp[0] = sptr[0]; + pp += 1; + } + } + } +} + +#if __AVX512F__ +template void convolution_im2col_input_tile_int8_avx512<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8_avx512<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +#else // __AVX512F__ +template void convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +template void convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); 
+template void convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk); +#endif // __AVX512F__ + +static void convolution_im2col_input_tile_int8(const Mat& bottom_blob, Mat& B, int j, int max_jj, int k, int max_kk, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h) +{ + if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + convolution_im2col_input_tile_conv1x1s1d1_int8(bottom_blob, B, j, max_jj, k, max_kk); + return; + } + + if (kernel_w == 1 && kernel_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<1, 1, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<3, 3, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<3, 3, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<5, 5, 1, 1, 1, 1>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<5, 5, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + if (kernel_w == 7 && kernel_h == 7 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { +#if __AVX512F__ + convolution_im2col_input_tile_int8_avx512<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#else // __AVX512F__ + convolution_im2col_input_tile_int8<7, 7, 1, 1, 2, 2>(bottom_blob, B, j, max_jj, k, max_kk); +#endif // __AVX512F__ + return; + } + + const int w = bottom_blob.w; + // const int channels = bottom_blob.c; + const int elempack = bottom_blob.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int outw = (w - kernel_extent_w) / stride_w + 1; + + // j max_jj outw*outh split w and h + + // k max_kk pa*maxk*(inch/pa) split inch + + // k/max_kk shall be multiple of maxk + + const int maxk = kernel_w * kernel_h; + + signed char* pp = B; + + int jj = 0; +#if __SSE2__ +#if defined(__x86_64__) || defined(_M_X64) +#if __AVX512F__ + for (; jj + 15 < max_jj; jj += 16) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj 
+ 3) / outw; + int dy4 = (j + jj + 4) / outw; + int dy5 = (j + jj + 5) / outw; + int dy6 = (j + jj + 6) / outw; + int dy7 = (j + jj + 7) / outw; + int dy8 = (j + jj + 8) / outw; + int dy9 = (j + jj + 9) / outw; + int dya = (j + jj + 10) / outw; + int dyb = (j + jj + 11) / outw; + int dyc = (j + jj + 12) / outw; + int dyd = (j + jj + 13) / outw; + int dye = (j + jj + 14) / outw; + int dyf = (j + jj + 15) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + int dx4 = (j + jj + 4) % outw; + int dx5 = (j + jj + 5) % outw; + int dx6 = (j + jj + 6) % outw; + int dx7 = (j + jj + 7) % outw; + int dx8 = (j + jj + 8) % outw; + int dx9 = (j + jj + 9) % outw; + int dxa = (j + jj + 10) % outw; + int dxb = (j + jj + 11) % outw; + int dxc = (j + jj + 12) % outw; + int dxd = (j + jj + 13) % outw; + int dxe = (j + jj + 14) % outw; + int dxf = (j + jj + 15) % outw; + + if (dy0 == dyf) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _mm_store_si128((__m128i*)pp, _tmp0); + _mm_store_si128((__m128i*)(pp + 16), _tmp1); + pp += 32; + } + else if (stride_w == 2) + { + __m256i _r0 = _mm256_loadu_si256((const __m256i*)sptr0); + __m256i _r1 = _mm256_loadu_si256((const __m256i*)sptr1); + __m256i _tmp0 = _mm256_unpacklo_epi8(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shufflehi_epi16(_mm256_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm256_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm256_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i _r01 = _mm256_unpacklo_epi64(_tmp0, _tmp1); + _mm256_storeu_si256((__m256i*)pp, _r01); + pp += 32; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp[16 + 0] = sptr0[stride_w * 8]; + pp[16 + 1] = sptr1[stride_w * 8]; + pp[16 + 2] = sptr0[stride_w * 9]; + pp[16 + 3] = sptr1[stride_w * 9]; + pp[16 + 4] = sptr0[stride_w * 10]; + pp[16 + 5] = sptr1[stride_w * 10]; + pp[16 + 6] = sptr0[stride_w * 11]; + pp[16 + 7] = sptr1[stride_w * 11]; + pp[16 + 8] = sptr0[stride_w * 12]; + pp[16 + 9] = sptr1[stride_w * 
12]; + pp[16 + 10] = sptr0[stride_w * 13]; + pp[16 + 11] = sptr1[stride_w * 13]; + pp[16 + 12] = sptr0[stride_w * 14]; + pp[16 + 13] = sptr1[stride_w * 14]; + pp[16 + 14] = sptr0[stride_w * 15]; + pp[16 + 15] = sptr1[stride_w * 15]; + pp += 32; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 64)); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 72)); + __m128i _ra = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 80)); + __m128i _rb = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 88)); + __m128i _rc = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 96)); + __m128i _rd = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 104)); + __m128i _re = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 112)); + __m128i _rf = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 120)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_store_si128((__m128i*)pp, _r8); + _mm_store_si128((__m128i*)(pp + 16), _r9); + _mm_store_si128((__m128i*)(pp + 32), _ra); + _mm_store_si128((__m128i*)(pp + 48), _rb); + _mm_store_si128((__m128i*)(pp + 64), _rc); + _mm_store_si128((__m128i*)(pp + 80), _rd); + _mm_store_si128((__m128i*)(pp + 96), _re); + _mm_store_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp[8] = sptr[stride_w * 8]; + pp[9] = sptr[stride_w * 9]; + pp[10] = sptr[stride_w * 10]; + pp[11] = sptr[stride_w * 11]; + pp[12] = 
sptr[stride_w * 12]; + pp[13] = sptr[stride_w * 13]; + pp[14] = sptr[stride_w * 14]; + pp[15] = sptr[stride_w * 15]; + pp += 16; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int x08 = stride_w * dx8 + dilation_w * v0; + int x09 = stride_w * dx9 + dilation_w * v0; + int x0a = stride_w * dxa + dilation_w * v0; + int x0b = stride_w * dxb + dilation_w * v0; + int x0c = stride_w * dxc + dilation_w * v0; + int x0d = stride_w * dxd + dilation_w * v0; + int x0e = stride_w * dxe + dilation_w * v0; + int x0f = stride_w * dxf + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + int y08 = stride_h * dy8 + dilation_h * u0; + int y09 = stride_h * dy9 + dilation_h * u0; + int y0a = stride_h * dya + dilation_h * u0; + int y0b = stride_h * dyb + dilation_h * u0; + int y0c = stride_h * dyc + dilation_h * u0; + int y0d = stride_h * dyd + dilation_h * u0; + int y0e = stride_h * dye + dilation_h * u0; + int y0f = stride_h * dyf + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int x18 = stride_w * dx8 + dilation_w * v1; + int x19 = stride_w * dx9 + dilation_w * v1; + int x1a = stride_w * dxa + dilation_w * v1; + int x1b = stride_w * dxb + dilation_w * v1; + int x1c = stride_w * dxc + dilation_w * v1; + int x1d = stride_w * dxd + dilation_w * v1; + int x1e = stride_w * dxe + dilation_w * v1; + int x1f = stride_w * dxf + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + int y18 = stride_h * dy8 + dilation_h * u1; + int y19 = stride_h * dy9 + dilation_h * u1; + int y1a = stride_h * dya + dilation_h * u1; + int y1b = stride_h * dyb + dilation_h * u1; + int y1c = stride_h * dyc + dilation_h * u1; + int y1d = stride_h * dyd + dilation_h * u1; + int y1e = stride_h * dye + dilation_h * u1; + int y1f = stride_h * dyf + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + 
const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + const signed char* sptr08 = img0.row(y08) + x08; + const signed char* sptr09 = img0.row(y09) + x09; + const signed char* sptr0a = img0.row(y0a) + x0a; + const signed char* sptr0b = img0.row(y0b) + x0b; + const signed char* sptr0c = img0.row(y0c) + x0c; + const signed char* sptr0d = img0.row(y0d) + x0d; + const signed char* sptr0e = img0.row(y0e) + x0e; + const signed char* sptr0f = img0.row(y0f) + x0f; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + const signed char* sptr18 = img1.row(y18) + x18; + const signed char* sptr19 = img1.row(y19) + x19; + const signed char* sptr1a = img1.row(y1a) + x1a; + const signed char* sptr1b = img1.row(y1b) + x1b; + const signed char* sptr1c = img1.row(y1c) + x1c; + const signed char* sptr1d = img1.row(y1d) + x1d; + const signed char* sptr1e = img1.row(y1e) + x1e; + const signed char* sptr1f = img1.row(y1f) + x1f; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp[16 + 0] = sptr08[0]; + pp[16 + 1] = sptr18[0]; + pp[16 + 2] = sptr09[0]; + pp[16 + 3] = sptr19[0]; + pp[16 + 4] = sptr0a[0]; + pp[16 + 5] = sptr1a[0]; + pp[16 + 6] = sptr0b[0]; + pp[16 + 7] = sptr1b[0]; + pp[16 + 8] = sptr0c[0]; + pp[16 + 9] = sptr1c[0]; + pp[16 + 10] = sptr0d[0]; + pp[16 + 11] = sptr1d[0]; + pp[16 + 12] = sptr0e[0]; + pp[16 + 13] = sptr1e[0]; + pp[16 + 14] = sptr0f[0]; + pp[16 + 15] = sptr1f[0]; + pp += 32; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int x8 = stride_w * dx8 + dilation_w * v; + int x9 = stride_w * dx9 + dilation_w * v; + int xa = stride_w * dxa + dilation_w * v; + int xb = stride_w * dxb + dilation_w * v; + int xc = stride_w * dxc + dilation_w * v; + int xd = stride_w * dxd + dilation_w * v; + int xe = stride_w * dxe + dilation_w * v; + int xf = stride_w * dxf + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + 
int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + int y8 = stride_h * dy8 + dilation_h * u; + int y9 = stride_h * dy9 + dilation_h * u; + int ya = stride_h * dya + dilation_h * u; + int yb = stride_h * dyb + dilation_h * u; + int yc = stride_h * dyc + dilation_h * u; + int yd = stride_h * dyd + dilation_h * u; + int ye = stride_h * dye + dilation_h * u; + int yf = stride_h * dyf + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + const signed char* sptr8 = img.row(y8) + x8 * elempack; + const signed char* sptr9 = img.row(y9) + x9 * elempack; + const signed char* sptra = img.row(ya) + xa * elempack; + const signed char* sptrb = img.row(yb) + xb * elempack; + const signed char* sptrc = img.row(yc) + xc * elempack; + const signed char* sptrd = img.row(yd) + xd * elempack; + const signed char* sptre = img.row(ye) + xe * elempack; + const signed char* sptrf = img.row(yf) + xf * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r8 = _mm_loadl_epi64((const __m128i*)sptr8); + __m128i _r9 = _mm_loadl_epi64((const __m128i*)sptr9); + __m128i _ra = _mm_loadl_epi64((const __m128i*)sptra); + __m128i _rb = _mm_loadl_epi64((const __m128i*)sptrb); + __m128i _rc = _mm_loadl_epi64((const __m128i*)sptrc); + __m128i _rd = _mm_loadl_epi64((const __m128i*)sptrd); + __m128i _re = _mm_loadl_epi64((const __m128i*)sptre); + __m128i _rf = _mm_loadl_epi64((const __m128i*)sptrf); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _r89 = _mm_unpacklo_epi16(_r8, _r9); + __m128i _rab = _mm_unpacklo_epi16(_ra, _rb); + __m128i _rcd = _mm_unpacklo_epi16(_rc, _rd); + __m128i _ref = _mm_unpacklo_epi16(_re, _rf); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi32(_r89, _rab); + _r5 = _mm_unpackhi_epi32(_r89, _rab); + _r6 = _mm_unpacklo_epi32(_rcd, _ref); + _r7 = _mm_unpackhi_epi32(_rcd, _ref); + _r8 = _mm_unpacklo_epi64(_r0, _r2); + _r9 = _mm_unpacklo_epi64(_r4, _r6); + _ra = _mm_unpackhi_epi64(_r0, _r2); + _rb = _mm_unpackhi_epi64(_r4, _r6); + _rc = _mm_unpacklo_epi64(_r1, _r3); + _rd = _mm_unpacklo_epi64(_r5, _r7); + _re = _mm_unpackhi_epi64(_r1, _r3); + _rf = _mm_unpackhi_epi64(_r5, _r7); + _mm_store_si128((__m128i*)pp, _r8); + _mm_store_si128((__m128i*)(pp + 16), _r9); + _mm_store_si128((__m128i*)(pp + 32), _ra); + _mm_store_si128((__m128i*)(pp + 48), _rb); + _mm_store_si128((__m128i*)(pp + 64), _rc); + _mm_store_si128((__m128i*)(pp + 80), _rd); 
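+                        // editor note (comment not in upstream ncnn, added for readability): _r8.._rf above are the
+                        // transposed 16x8 int8 tile; the two stores below finish writing it, so pp ends up holding,
+                        // for each consecutive pair of packed int8 input channels, the values of all 16 output
+                        // positions (pixels 0-7 first, then pixels 8-15), which is the B-packing order the int8
+                        // gemm microkernel consumes.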
+ _mm_store_si128((__m128i*)(pp + 96), _re); + _mm_store_si128((__m128i*)(pp + 112), _rf); + pp += 128; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp[8] = sptr8[0]; + pp[9] = sptr9[0]; + pp[10] = sptra[0]; + pp[11] = sptrb[0]; + pp[12] = sptrc[0]; + pp[13] = sptrd[0]; + pp[14] = sptre[0]; + pp[15] = sptrf[0]; + pp += 16; + } + } + } + } +#endif // __AVX512F__ + for (; jj + 7 < max_jj; jj += 8) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj + 3) / outw; + int dy4 = (j + jj + 4) / outw; + int dy5 = (j + jj + 5) / outw; + int dy6 = (j + jj + 6) / outw; + int dy7 = (j + jj + 7) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + int dx4 = (j + jj + 4) % outw; + int dx5 = (j + jj + 5) % outw; + int dx6 = (j + jj + 6) % outw; + int dx7 = (j + jj + 7) % outw; + + if (dy0 == dy7) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadu_si128((const __m128i*)sptr0); + __m128i _r1 = _mm_loadu_si128((const __m128i*)sptr1); + __m128i _tmp0 = _mm_unpacklo_epi8(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi8(_r0, _r1); + _tmp0 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _tmp0 = _mm_shuffle_epi32(_tmp0, _MM_SHUFFLE(3, 1, 2, 0)); + _tmp1 = _mm_shuffle_epi32(_tmp1, _MM_SHUFFLE(3, 1, 2, 0)); + __m128i _r01 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(_tmp0), _mm_castsi128_ps(_tmp1), _MM_SHUFFLE(1, 0, 1, 0))); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp[8] = sptr0[stride_w * 4]; + pp[9] = sptr1[stride_w * 4]; + pp[10] = sptr0[stride_w * 5]; + pp[11] = sptr1[stride_w * 5]; + pp[12] = sptr0[stride_w * 6]; + pp[13] = sptr1[stride_w * 6]; + pp[14] = sptr0[stride_w * 7]; + pp[15] = sptr1[stride_w * 7]; + pp += 16; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 
= stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 32)); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 40)); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 48)); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 56)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + _mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp[4] = sptr[stride_w * 4]; + pp[5] = sptr[stride_w * 5]; + pp[6] = sptr[stride_w * 6]; + pp[7] = sptr[stride_w * 7]; + pp += 8; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int x04 = stride_w * dx4 + dilation_w * v0; + int x05 = stride_w * dx5 + dilation_w * v0; + int x06 = stride_w * dx6 + dilation_w * v0; + int x07 = stride_w * dx7 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + int y04 = stride_h * dy4 + dilation_h * u0; + int y05 = stride_h * dy5 + dilation_h * u0; + int y06 = stride_h * dy6 + dilation_h * u0; + int y07 = stride_h * dy7 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int x14 = stride_w * dx4 + dilation_w * v1; + int x15 = stride_w * dx5 + dilation_w * v1; + int x16 = stride_w * dx6 + dilation_w * v1; + int x17 = stride_w * dx7 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + int y14 = stride_h * dy4 + dilation_h * u1; + int y15 = stride_h * dy5 + dilation_h * u1; + int y16 = stride_h * dy6 + dilation_h * u1; + int y17 = stride_h * dy7 + dilation_h * u1; + + const signed 
char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + const signed char* sptr04 = img0.row(y04) + x04; + const signed char* sptr05 = img0.row(y05) + x05; + const signed char* sptr06 = img0.row(y06) + x06; + const signed char* sptr07 = img0.row(y07) + x07; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + const signed char* sptr14 = img1.row(y14) + x14; + const signed char* sptr15 = img1.row(y15) + x15; + const signed char* sptr16 = img1.row(y16) + x16; + const signed char* sptr17 = img1.row(y17) + x17; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp[8] = sptr04[0]; + pp[9] = sptr14[0]; + pp[10] = sptr05[0]; + pp[11] = sptr15[0]; + pp[12] = sptr06[0]; + pp[13] = sptr16[0]; + pp[14] = sptr07[0]; + pp[15] = sptr17[0]; + pp += 16; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int x4 = stride_w * dx4 + dilation_w * v; + int x5 = stride_w * dx5 + dilation_w * v; + int x6 = stride_w * dx6 + dilation_w * v; + int x7 = stride_w * dx7 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + int y4 = stride_h * dy4 + dilation_h * u; + int y5 = stride_h * dy5 + dilation_h * u; + int y6 = stride_h * dy6 + dilation_h * u; + int y7 = stride_h * dy7 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + const signed char* sptr4 = img.row(y4) + x4 * elempack; + const signed char* sptr5 = img.row(y5) + x5 * elempack; + const signed char* sptr6 = img.row(y6) + x6 * elempack; + const signed char* sptr7 = img.row(y7) + x7 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r4 = _mm_loadl_epi64((const __m128i*)sptr4); + __m128i _r5 = _mm_loadl_epi64((const __m128i*)sptr5); + __m128i _r6 = _mm_loadl_epi64((const __m128i*)sptr6); + __m128i _r7 = _mm_loadl_epi64((const __m128i*)sptr7); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _r45 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _r67 = _mm_unpacklo_epi16(_r6, _r7); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _r2 = _mm_unpacklo_epi32(_r45, _r67); + _r3 = _mm_unpackhi_epi32(_r45, _r67); + _r4 = _mm_unpacklo_epi64(_r0, _r2); + _r5 = _mm_unpackhi_epi64(_r0, _r2); + _r6 = _mm_unpacklo_epi64(_r1, _r3); + _r7 = _mm_unpackhi_epi64(_r1, _r3); + 
_mm_storeu_si128((__m128i*)pp, _r4); + _mm_storeu_si128((__m128i*)(pp + 16), _r5); + _mm_storeu_si128((__m128i*)(pp + 32), _r6); + _mm_storeu_si128((__m128i*)(pp + 48), _r7); + pp += 64; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp[4] = sptr4[0]; + pp[5] = sptr5[0]; + pp[6] = sptr6[0]; + pp[7] = sptr7[0]; + pp += 8; + } + } + } + } +#endif // defined(__x86_64__) || defined(_M_X64) + for (; jj + 3 < max_jj; jj += 4) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dy2 = (j + jj + 2) / outw; + int dy3 = (j + jj + 3) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + int dx2 = (j + jj + 2) % outw; + int dx3 = (j + jj + 3) % outw; + + if (dy0 == dy3) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + if (stride_w == 1) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else if (stride_w == 2) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi8(_r0, _r1); + _r01 = _mm_shufflehi_epi16(_mm_shufflelo_epi16(_r01, _MM_SHUFFLE(3, 1, 2, 0)), _MM_SHUFFLE(3, 1, 2, 0)); + _r01 = _mm_shuffle_epi32(_r01, _MM_SHUFFLE(3, 1, 2, 0)); + _mm_storel_epi64((__m128i*)pp, _r01); + pp += 8; + } + else + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp[4] = sptr0[stride_w * 2]; + pp[5] = sptr1[stride_w * 2]; + pp[6] = sptr0[stride_w * 3]; + pp[7] = sptr1[stride_w * 3]; + pp += 8; + } + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 16)); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 24)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp[2] = sptr[stride_w * 2]; + pp[3] = sptr[stride_w * 3]; + pp += 4; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + 
kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int x02 = stride_w * dx2 + dilation_w * v0; + int x03 = stride_w * dx3 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int y02 = stride_h * dy2 + dilation_h * u0; + int y03 = stride_h * dy3 + dilation_h * u0; + + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int x12 = stride_w * dx2 + dilation_w * v1; + int x13 = stride_w * dx3 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + int y12 = stride_h * dy2 + dilation_h * u1; + int y13 = stride_h * dy3 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr02 = img0.row(y02) + x02; + const signed char* sptr03 = img0.row(y03) + x03; + + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + const signed char* sptr12 = img1.row(y12) + x12; + const signed char* sptr13 = img1.row(y13) + x13; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp[4] = sptr02[0]; + pp[5] = sptr12[0]; + pp[6] = sptr03[0]; + pp[7] = sptr13[0]; + pp += 8; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int x2 = stride_w * dx2 + dilation_w * v; + int x3 = stride_w * dx3 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + int y2 = stride_h * dy2 + dilation_h * u; + int y3 = stride_h * dy3 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + const signed char* sptr2 = img.row(y2) + x2 * elempack; + const signed char* sptr3 = img.row(y3) + x3 * elempack; + + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r2 = _mm_loadl_epi64((const __m128i*)sptr2); + __m128i _r3 = _mm_loadl_epi64((const __m128i*)sptr3); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _r23 = _mm_unpacklo_epi16(_r2, _r3); + _r0 = _mm_unpacklo_epi32(_r01, _r23); + _r1 = _mm_unpackhi_epi32(_r01, _r23); + _mm_storeu_si128((__m128i*)pp, _r0); + _mm_storeu_si128((__m128i*)(pp + 16), _r1); + pp += 32; + } + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr2[0]; + pp[3] = sptr3[0]; + pp += 4; + } + } + } + } +#endif // __SSE2__ + for (; jj + 1 < max_jj; jj += 2) + { + int dy0 = (j + jj) / outw; + int dy1 = (j + jj + 1) / outw; + int dx0 = (j + jj) % outw; + int dx1 = (j + jj + 1) % outw; + + if (dy0 == dy1) + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 
/ kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + + const signed char* sptr0 = img0.row(y00) + x00; + const signed char* sptr1 = img1.row(y10) + x10; + + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp[2] = sptr0[stride_w]; + pp[3] = sptr1[stride_w]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + + const signed char* sptr = img.row(y0) + x0 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)(sptr + stride_w * 8)); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr[0]; + pp[1] = sptr[stride_w]; + pp += 2; + } + } + } + else + { + int kk = 0; + if (elempack == 1) + { + for (; kk + 1 < max_kk; kk += 2) + { + int p0 = (k + kk) / maxk; + int p1 = (k + kk + 1) / maxk; + int uv0 = (k + kk) % maxk; + int uv1 = (k + kk + 1) % maxk; + int u0 = uv0 / kernel_w; + int u1 = uv1 / kernel_w; + int v0 = uv0 % kernel_w; + int v1 = uv1 % kernel_w; + + const Mat img0 = bottom_blob.channel(p0); + const Mat img1 = bottom_blob.channel(p1); + + int x00 = stride_w * dx0 + dilation_w * v0; + int x01 = stride_w * dx1 + dilation_w * v0; + int y00 = stride_h * dy0 + dilation_h * u0; + int y01 = stride_h * dy1 + dilation_h * u0; + int x10 = stride_w * dx0 + dilation_w * v1; + int x11 = stride_w * dx1 + dilation_w * v1; + int y10 = stride_h * dy0 + dilation_h * u1; + int y11 = stride_h * dy1 + dilation_h * u1; + + const signed char* sptr00 = img0.row(y00) + x00; + const signed char* sptr01 = img0.row(y01) + x01; + const signed char* sptr10 = img1.row(y10) + x10; + const signed char* sptr11 = img1.row(y11) + x11; + + pp[0] = sptr00[0]; + pp[1] = sptr10[0]; + pp[2] = sptr01[0]; + pp[3] = sptr11[0]; + pp += 4; + } + } + for (; kk < max_kk / elempack; kk++) + { + int p = (k / elempack + kk) / maxk; + int uv = (k / elempack + kk) % maxk; + int u = uv / kernel_w; + int v = uv % kernel_w; + + const Mat img = bottom_blob.channel(p); + + int x0 = stride_w * dx0 + dilation_w * v; + int x1 = stride_w * dx1 + dilation_w * v; + int y0 = stride_h * dy0 + dilation_h * u; + int y1 = stride_h * dy1 + dilation_h * u; + + const signed char* sptr0 = img.row(y0) + x0 * elempack; + const signed char* sptr1 = img.row(y1) + x1 * elempack; + +#if __SSE2__ + if (elempack == 8) + { + __m128i _r0 = _mm_loadl_epi64((const __m128i*)sptr0); + __m128i _r1 = _mm_loadl_epi64((const __m128i*)sptr1); + __m128i _r01 = _mm_unpacklo_epi16(_r0, _r1); + _mm_storeu_si128((__m128i*)pp, _r01); + pp += 16; + } +#endif // __SSE2__ + if (elempack == 1) + { + pp[0] = sptr0[0]; + pp[1] = sptr1[0]; + pp += 2; + } } } } diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp index f870a8847462..09008985f121 100644 --- a/src/layer/x86/convolution_x86.cpp +++ b/src/layer/x86/convolution_x86.cpp @@ -46,16 +46,13 @@ namespace ncnn { #include "convolution_packed_int8.h" #include 
"convolution_im2col_gemm_int8.h" + +#include "convolution_3x3_winograd_int8.h" #endif // NCNN_INT8 #if __SSE2__ #include "convolution_3x3_pack1to4.h" -#if NCNN_INT8 -#include "convolution_3x3_pack8to4_int8.h" -#include "convolution_3x3_pack8to1_int8.h" -#endif // NCNN_INT8 - #if __AVX__ #include "convolution_3x3_pack1to8.h" #include "convolution_3x3_pack8to1.h" @@ -1231,32 +1228,14 @@ int Convolution_x86::create_pipeline_int8_x86(const Option& opt) const int maxk = kernel_w * kernel_h; const int num_input = weight_data_size / maxk / num_output; - int elempack = 1; - int out_elempack_int32 = 1; -#if __SSE2__ - if (opt.use_packing_layout) - { - elempack = num_input % 8 == 0 ? 8 : 1; - out_elempack_int32 = num_output % 4 == 0 ? 4 : 1; - } -#endif // __SSE2__ + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); - if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { -#if __SSE2__ - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt); -#endif // __SSE2__ - } - else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { -#if __SSE2__ - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt); -#endif // __SSE2__ - } - else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16) + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_winograd23_data, num_input, num_output, opt); - // conv3x3s1_winograd43_transform_kernel_int8_sse(weight_data, weight_winograd43_data, num_input, num_output, opt); + if (opt.use_winograd43_convolution) + conv3x3s1_winograd43_transform_kernel_int8(weight_data, weight_winograd43_data, num_input, num_output, opt); + else + conv3x3s1_winograd23_transform_kernel_int8(weight_data, weight_winograd23_data, num_input, num_output, opt); } else if (opt.use_sgemm_convolution) { @@ -1352,6 +1331,8 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con if (top_blob_int32.empty()) return -100; + bool prefer_winograd = (opt.use_winograd23_convolution || opt.use_winograd43_convolution) && (num_input > 8 || num_output > 8); + int _nT = nT ? 
nT : opt.num_threads; if (nT != 0 && opt.num_threads != nT) { @@ -1360,22 +1341,12 @@ int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, con NCNN_LOGE("opt.num_threads %d changed, convolution gemm will use load-time value %d", opt.num_threads, nT); } - if (elempack == 8 && out_elempack_int32 == 4 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { -#if __SSE2__ - conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); -#endif // __SSE2__ - } - else if (elempack == 8 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd43_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) - { -#if __SSE2__ - conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); -#endif // __SSE2__ - } - else if (elempack == 1 && out_elempack_int32 == 1 && opt.use_winograd_convolution && opt.use_winograd23_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16) + if (opt.use_winograd_convolution && prefer_winograd && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { - conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, opt); - // conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, opt); + if (opt.use_winograd43_convolution && !weight_winograd43_data.empty()) + conv3x3s1_winograd43_int8(bottom_blob_bordered, top_blob_int32, weight_winograd43_data, _nT, opt); + else + conv3x3s1_winograd23_int8(bottom_blob_bordered, top_blob_int32, weight_winograd23_data, _nT, opt); } else if (opt.use_sgemm_convolution) { diff --git a/src/layer/x86/convolution_x86_avx2.cpp b/src/layer/x86/convolution_x86_avx2.cpp index 38f107ee0865..49cded702137 100644 --- a/src/layer/x86/convolution_x86_avx2.cpp +++ b/src/layer/x86/convolution_x86_avx2.cpp @@ -20,8 +20,7 @@ namespace ncnn { #include "convolution_packed_int8.h" #include "convolution_im2col_gemm_int8.h" -#include "convolution_3x3_pack8to1_int8.h" -#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_winograd_int8.h" // packed void convolution_transform_kernel_packed_int8_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) @@ -46,24 +45,24 @@ void convolution_im2col_gemm_int8_avx2(const Mat& bottom_blob, Mat& top_blob, co } // winograd -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) +void conv3x3s1_winograd23_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt); + conv3x3s1_winograd23_transform_kernel_int8(kernel, AT, inch, outch, opt); } -void conv3x3s1_winograd43_pack8to1_int8_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +void conv3x3s1_winograd23_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt); + conv3x3s1_winograd23_int8(bottom_blob, top_blob, AT, nT, opt); } -void 
conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx2(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) +void conv3x3s1_winograd43_transform_kernel_int8_avx2(const Mat& kernel, Mat& AT, int inch, int outch, const Option& opt) { - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt); + conv3x3s1_winograd43_transform_kernel_int8(kernel, AT, inch, outch, opt); } -void conv3x3s1_winograd43_pack8to4_int8_sse_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +void conv3x3s1_winograd43_int8_avx2(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob, top_blob, kernel, opt); + conv3x3s1_winograd43_int8(bottom_blob, top_blob, AT, nT, opt); } } // namespace ncnn diff --git a/src/layer/x86/convolution_x86_avx512vnni.cpp b/src/layer/x86/convolution_x86_avx512vnni.cpp index f0ac51bbf856..8e34bb61309f 100644 --- a/src/layer/x86/convolution_x86_avx512vnni.cpp +++ b/src/layer/x86/convolution_x86_avx512vnni.cpp @@ -20,8 +20,7 @@ namespace ncnn { #include "convolution_packed_int8.h" #include "convolution_im2col_gemm_int8.h" -#include "convolution_3x3_pack8to1_int8.h" -#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_winograd_int8.h" // packed void convolution_packed_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) @@ -36,24 +35,14 @@ void convolution_im2col_gemm_int8_avx512vnni(const Mat& bottom_blob, Mat& top_bl } // winograd -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) +void conv3x3s1_winograd23_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt); + conv3x3s1_winograd23_int8(bottom_blob, top_blob, AT, nT, opt); } -void conv3x3s1_winograd43_pack8to1_int8_sse_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +void conv3x3s1_winograd43_int8_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt); -} - -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avx512vnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) -{ - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt); -} - -void conv3x3s1_winograd43_pack8to4_int8_sse_avx512vnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) -{ - conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob, top_blob, kernel, opt); + conv3x3s1_winograd43_int8(bottom_blob, top_blob, AT, nT, opt); } } // namespace ncnn diff --git a/src/layer/x86/convolution_x86_avxvnni.cpp b/src/layer/x86/convolution_x86_avxvnni.cpp index a8ef75bb968b..aa1ba401856c 100644 --- a/src/layer/x86/convolution_x86_avxvnni.cpp +++ b/src/layer/x86/convolution_x86_avxvnni.cpp @@ -20,8 +20,7 @@ namespace ncnn { #include "convolution_packed_int8.h" #include "convolution_im2col_gemm_int8.h" -#include "convolution_3x3_pack8to1_int8.h" -#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_winograd_int8.h" // packed void convolution_packed_int8_avxvnni(const Mat& 
bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) @@ -36,24 +35,14 @@ void convolution_im2col_gemm_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, } // winograd -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) +void conv3x3s1_winograd23_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt); + conv3x3s1_winograd23_int8(bottom_blob, top_blob, AT, nT, opt); } -void conv3x3s1_winograd43_pack8to1_int8_sse_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +void conv3x3s1_winograd43_int8_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt); -} - -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_avxvnni(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) -{ - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt); -} - -void conv3x3s1_winograd43_pack8to4_int8_sse_avxvnni(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) -{ - conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob, top_blob, kernel, opt); + conv3x3s1_winograd43_int8(bottom_blob, top_blob, AT, nT, opt); } } // namespace ncnn diff --git a/src/layer/x86/convolution_x86_xop.cpp b/src/layer/x86/convolution_x86_xop.cpp index d954f5545655..cacba8f07cdd 100644 --- a/src/layer/x86/convolution_x86_xop.cpp +++ b/src/layer/x86/convolution_x86_xop.cpp @@ -20,8 +20,7 @@ namespace ncnn { #include "convolution_packed_int8.h" #include "convolution_im2col_gemm_int8.h" -#include "convolution_3x3_pack8to1_int8.h" -#include "convolution_3x3_pack8to4_int8.h" +#include "convolution_3x3_winograd_int8.h" // packed void convolution_packed_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) @@ -36,24 +35,14 @@ void convolution_im2col_gemm_int8_xop(const Mat& bottom_blob, Mat& top_blob, con } // winograd -void conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse_xop(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) +void conv3x3s1_winograd23_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_transform_kernel_pack8to1_int8_sse(kernel, kernel_tm, inch, outch, opt); + conv3x3s1_winograd23_int8(bottom_blob, top_blob, AT, nT, opt); } -void conv3x3s1_winograd43_pack8to1_int8_sse_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) +void conv3x3s1_winograd43_int8_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& AT, int nT, const Option& opt) { - conv3x3s1_winograd43_pack8to1_int8_sse(bottom_blob, top_blob, kernel, opt); -} - -void conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse_xop(const Mat& kernel, Mat& kernel_tm, int inch, int outch, const Option& opt) -{ - conv3x3s1_winograd43_transform_kernel_pack8to4_int8_sse(kernel, kernel_tm, inch, outch, opt); -} - -void conv3x3s1_winograd43_pack8to4_int8_sse_xop(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) -{ - 
conv3x3s1_winograd43_pack8to4_int8_sse(bottom_blob, top_blob, kernel, opt); + conv3x3s1_winograd43_int8(bottom_blob, top_blob, AT, nT, opt); } } // namespace ncnn diff --git a/src/layer/x86/gridsample_bicubic_apply_interpolation.h b/src/layer/x86/gridsample_bicubic_apply_interpolation.h new file mode 100644 index 000000000000..0b7be771d3b7 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_apply_interpolation.h @@ -0,0 +1,288 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void cubic_interp1d_p16(__m512& coeffs0, __m512& coeffs1, __m512& coeffs2, __m512& coeffs3, const __m512& tx) +{ + const __m512 A = _mm512_set1_ps(-0.75f); + + const __m512 x0 = _mm512_add_ps(tx, _mm512_set1_ps(1.0f)); + const __m512& x1 = tx; + const __m512 x2 = _mm512_sub_ps(_mm512_set1_ps(1.0f), tx); + + coeffs0 = _mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(A, x0), _mm512_mul_ps(_mm512_set1_ps(5.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(8.0f), A)), x0), _mm512_mul_ps(_mm512_set1_ps(4), A)); + coeffs1 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x1), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x1), x1), _mm512_set1_ps(1.0f)); + coeffs2 = _mm512_add_ps(_mm512_mul_ps(_mm512_mul_ps(_mm512_sub_ps(_mm512_mul_ps(_mm512_add_ps(A, _mm512_set1_ps(2.0f)), x2), _mm512_add_ps(A, _mm512_set1_ps(3.0f))), x2), x2), _mm512_set1_ps(1.0f)); + coeffs3 = _mm512_sub_ps(_mm512_sub_ps(_mm512_sub_ps(_mm512_set1_ps(1.0f), coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p16(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m512 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m512 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m512 value_f[4]; + cubic_interp1d_p16(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm512_set1_ps(offset_value_ptr[0])); + cubic_interp1d_p16(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm512_set1_ps(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int ii = 0; ii < 4; ii++) + { + __m512 x0_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 x1_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 x2_val = offset_ptr[2] >= 0 ? 
_mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 x3_val = offset_ptr[3] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + + value_f[ii] = _mm512_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm512_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm512_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; + } + + __m512 _v = _mm512_mul_ps(y_coeffs0, value_f[0]); + _v = _mm512_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm512_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm512_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm512_storeu_ps(dstptr, _v); + + dstptr += 16; + offset_value_ptr += 18; + } + } +} + +#endif // __AVX512F__ +static void cubic_interp1d_p8(__m256& coeffs0, __m256& coeffs1, __m256& coeffs2, __m256& coeffs3, const __m256& tx) +{ + const __m256 A = _mm256_set1_ps(-0.75f); + + const __m256 x0 = _mm256_add_ps(tx, _mm256_set1_ps(1)); + const __m256& x1 = tx; + const __m256 x2 = _mm256_sub_ps(_mm256_set1_ps(1), tx); + //const __m256 x3 = _mm256_add_ps(x2, _mm256_set1_ps(1)); + + coeffs0 = _mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(A, x0), _mm256_mul_ps(_mm256_set1_ps(5.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(8.0f), A)), x0), _mm256_mul_ps(_mm256_set1_ps(4), A)); + coeffs1 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x1), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x1), x1), _mm256_set1_ps(1)); + coeffs2 = _mm256_add_ps(_mm256_mul_ps(_mm256_mul_ps(_mm256_sub_ps(_mm256_mul_ps(_mm256_add_ps(A, _mm256_set1_ps(2.0f)), x2), _mm256_add_ps(A, _mm256_set1_ps(3.0f))), x2), x2), _mm256_set1_ps(1)); + coeffs3 = _mm256_sub_ps(_mm256_sub_ps(_mm256_sub_ps(_mm256_set1_ps(1), coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p8(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m256 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m256 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m256 value_f[4]; + cubic_interp1d_p8(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm256_set1_ps(offset_value_ptr[0])); + cubic_interp1d_p8(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm256_set1_ps(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int ii = 0; ii < 4; ii++) + { + __m256 x0_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 x1_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 x2_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 x3_val = offset_ptr[3] >= 0 ? 
_mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + + value_f[ii] = _mm256_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm256_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; + } + + __m256 _v = _mm256_mul_ps(y_coeffs0, value_f[0]); + _v = _mm256_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm256_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm256_storeu_ps(dstptr, _v); + + dstptr += 8; + offset_value_ptr += 18; + } + } +} + +#endif // __AVX__ +static void cubic_interp1d_p4(__m128& coeffs0, __m128& coeffs1, __m128& coeffs2, __m128& coeffs3, const __m128& tx) +{ + const __m128 A = _mm_set_ps1(-0.75f); + + const __m128 x0 = _mm_add_ps(tx, _mm_set_ps1(1.0f)); + const __m128& x1 = tx; + const __m128 x2 = _mm_sub_ps(_mm_set_ps1(1.0f), tx); + //const __m128 x3 = _mm_add_ps(x2, _mm_set_ps1(1.0f)); + + coeffs0 = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(A, x0), _mm_mul_ps(_mm_set_ps1(5.0f), A)), x0), _mm_mul_ps(_mm_set_ps1(8.0f), A)), x0), _mm_mul_ps(_mm_set_ps1(4), A)); + coeffs1 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set_ps1(2.0f)), x1), _mm_add_ps(A, _mm_set_ps1(3.0f))), x1), x1), _mm_set_ps1(1.0f)); + coeffs2 = _mm_add_ps(_mm_mul_ps(_mm_mul_ps(_mm_sub_ps(_mm_mul_ps(_mm_add_ps(A, _mm_set_ps1(2.0f)), x2), _mm_add_ps(A, _mm_set_ps1(3.0f))), x2), x2), _mm_set_ps1(1.0f)); + coeffs3 = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(_mm_set_ps1(1.0f), coeffs0), coeffs1), coeffs2); +} + +static void gridsample_2d_bicubic_apply_interpolation_p4(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + __m128 x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + __m128 y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + __m128 value_f[4]; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + cubic_interp1d_p4(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, _mm_set_ps1(offset_value_ptr[0])); + cubic_interp1d_p4(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, _mm_set_ps1(offset_value_ptr[1])); + + const int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int ii = 0; ii < 4; ii++) + { + __m128 x0_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 x1_val = offset_ptr[1] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 x2_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 x3_val = offset_ptr[3] >= 0 ? 
_mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + + value_f[ii] = _mm_mul_ps(x_coeffs0, x0_val); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs1, x1_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs2, x2_val, value_f[ii]); + value_f[ii] = _mm_comp_fmadd_ps(x_coeffs3, x3_val, value_f[ii]); + + offset_ptr += 4; + } + + __m128 _v = _mm_mul_ps(y_coeffs0, value_f[0]); + _v = _mm_comp_fmadd_ps(y_coeffs1, value_f[1], _v); + _v = _mm_comp_fmadd_ps(y_coeffs2, value_f[2], _v); + _v = _mm_comp_fmadd_ps(y_coeffs3, value_f[3], _v); + _mm_storeu_ps(dstptr, _v); + + dstptr += 4; + offset_value_ptr += 18; + } + } +} + +#endif // __SSE2__ + +static inline void cubic_interp1d(float& coeffs0, float& coeffs1, float& coeffs2, float& coeffs3, float fx) +{ + const float A = -0.75f; + + float fx0 = fx + 1; + float fx1 = fx; + float fx2 = 1 - fx; + // float fx3 = 2 - fx; + + coeffs0 = A * fx0 * fx0 * fx0 - 5 * A * fx0 * fx0 + 8 * A * fx0 - 4 * A; + coeffs1 = (A + 2) * fx1 * fx1 * fx1 - (A + 3) * fx1 * fx1 + 1; + coeffs2 = (A + 2) * fx2 * fx2 * fx2 - (A + 3) * fx2 * fx2 + 1; + coeffs3 = 1.f - coeffs0 - coeffs1 - coeffs2; +} + +static void gridsample_2d_bicubic_apply_interpolation_p1(const Mat& src, Mat& dst, Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int x = 0; x < grid_size; x++) + { + float x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3; + float y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3; + float value_f[4]; + cubic_interp1d(x_coeffs0, x_coeffs1, x_coeffs2, x_coeffs3, offset_value_ptr[0]); + cubic_interp1d(y_coeffs0, y_coeffs1, y_coeffs2, y_coeffs3, offset_value_ptr[1]); + + const int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int ii = 0; ii < 4; ii++) + { + float x0_val = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float x1_val = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float x2_val = offset_ptr[2] >= 0 ? *(srcptr + offset_ptr[2]) : 0; + float x3_val = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; + + value_f[ii] = x_coeffs0 * x0_val; + value_f[ii] = x_coeffs1 * x1_val + value_f[ii]; + value_f[ii] = x_coeffs2 * x2_val + value_f[ii]; + value_f[ii] = x_coeffs3 * x3_val + value_f[ii]; + + offset_ptr += 4; + } + + float _v = y_coeffs0 * value_f[0]; + _v = y_coeffs1 * value_f[1] + _v; + _v = y_coeffs2 * value_f[2] + _v; + _v = y_coeffs3 * value_f[3] + _v; + *dstptr = _v; + + dstptr++; + offset_value_ptr += 18; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bicubic_compute_blob.h b/src/layer/x86/gridsample_bicubic_compute_blob.h new file mode 100644 index 000000000000..9006153d9d26 --- /dev/null +++ b/src/layer/x86/gridsample_bicubic_compute_blob.h @@ -0,0 +1,299 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +template<GridSample::PaddingMode pd, bool align_corner> +void gridsample_2d_bicubic_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) +{ + const int grid_size = grid.w * grid.h; + + float* offset_value_ptr = offset_value.channel(0); + + grid_sample_unormalize<align_corner> unormalize; + compute_coord<pd, align_corner> get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int x = 0; +#if __AVX__ + for (; x + 15 < grid_size; x += 16) + { + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + transpose2x8_ps(gx, gy); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gy = unormalize(_mm256_set1_ps(src.h), gy); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + __m256 tx = _mm256_sub_ps(gx, gx_floor); + __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); + __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); + + gx0 = get_coord(_mm256_set1_ps(src.w), gx0); + gx1 = get_coord(_mm256_set1_ps(src.w), gx1); + gx2 = get_coord(_mm256_set1_ps(src.w), gx2); + gx3 = get_coord(_mm256_set1_ps(src.w), gx3); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); + __m256 v0_offset_f[4], v1_offset_f[4], v2_offset_f[4], v3_offset_f[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); + + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); + + v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); + v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); + v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); + v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); + + v0_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f[i], _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], _mm256_and_ps(x3_in_range, y_in_range)); + + v0_offset_f[i] = 
_mm256_castsi256_ps(_mm256_cvtps_epi32(v0_offset_f[i])); + v1_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v1_offset_f[i])); + v2_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v2_offset_f[i])); + v3_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v3_offset_f[i])); + } + + transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); + + _mm256_storeu_ps(offset_value_ptr, tx); + _mm256_storeu_ps(offset_value_ptr + 8, ty); + offset_value_ptr += 16; + for (int i = 0; i < 4; i++) + { + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + offset_value_ptr += 32; + } + gridptr += 16; + } + +#endif // __AVX__ + + for (; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + sample_x = unormalize(src.w, sample_x); + sample_y = unormalize(src.h, sample_y); + + int x1 = floorf(sample_x); + int y1 = floorf(sample_y); + int x0 = x1 - 1; + int x2 = x1 + 1; + int x3 = x1 + 2; + + offset_value_ptr[0] = sample_x - static_cast<float>(x1); + offset_value_ptr[1] = sample_y - static_cast<float>(y1); + + x1 = get_coord(src.w, x1); + x0 = get_coord(src.w, x0); + x2 = get_coord(src.w, x2); + x3 = get_coord(src.w, x3); + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + + int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int i = 0; i < 4; i++) + { + int gy = y1 + i - 1; + gy = get_coord(src.h, gy); + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); + + bool v0_in_bound = (x0_in_range & y_in_range); + bool v1_in_bound = (x1_in_range & y_in_range); + bool v2_in_bound = (x2_in_range & y_in_range); + bool v3_in_bound = (x3_in_range & y_in_range); + + offset_ptr[0] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0; + offset_ptr[1] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0; + offset_ptr[2] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0; + offset_ptr[3] = v3_in_bound ? 
(offset_y + x3) * src.elempack : -1.0; + + offset_ptr += 4; + } + + gridptr += 2; + offset_value_ptr += 18; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gy = unormalize(_mm256_set1_ps(src.h), gy); + + __m256 gx_floor = _mm256_floor_ps(gx); + __m256 gy_floor = _mm256_floor_ps(gy); + + __m256 tx = _mm256_sub_ps(gx, gx_floor); + __m256 ty = _mm256_sub_ps(gy, gy_floor); + + __m256 gx0 = _mm256_add_ps(gx_floor, _mm256_set1_ps(-1)); + __m256 gx1 = gx_floor; + __m256 gx2 = _mm256_add_ps(gx_floor, _mm256_set1_ps(1)); + __m256 gx3 = _mm256_add_ps(gx2, _mm256_set1_ps(1)); + + gx0 = get_coord(_mm256_set1_ps(src.w), gx0); + gx1 = get_coord(_mm256_set1_ps(src.w), gx1); + gx2 = get_coord(_mm256_set1_ps(src.w), gx2); + gx3 = get_coord(_mm256_set1_ps(src.w), gx3); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(gx0, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx0, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(gx1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx1, _CMP_GT_OS)); + __m256 x2_in_range = _mm256_and_ps(_mm256_cmp_ps(gx2, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx2, _CMP_GT_OS)); + __m256 x3_in_range = _mm256_and_ps(_mm256_cmp_ps(gx3, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx3, _CMP_GT_OS)); + + __m256 v0_offset_f[4], v1_offset_f[4], v2_offset_f[4], v3_offset_f[4]; + for (int i = 0; i < 4; i++) + { + gy = _mm256_add_ps(gy_floor, _mm256_set1_ps(-1.0f + i)); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + __m256 y_in_range = _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS)); + + __m256 gy_offset = _mm256_mul_ps(gy, _mm256_set1_ps(src.w)); + + v0_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx0), _mm256_set1_ps(src.elempack)); + v1_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx1), _mm256_set1_ps(src.elempack)); + v2_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx2), _mm256_set1_ps(src.elempack)); + v3_offset_f[i] = _mm256_mul_ps(_mm256_add_ps(gy_offset, gx3), _mm256_set1_ps(src.elempack)); + + v0_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v0_offset_f[i], _mm256_and_ps(x0_in_range, y_in_range)); + v1_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v1_offset_f[i], _mm256_and_ps(x1_in_range, y_in_range)); + v2_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v2_offset_f[i], _mm256_and_ps(x2_in_range, y_in_range)); + v3_offset_f[i] = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), v3_offset_f[i], _mm256_and_ps(x3_in_range, y_in_range)); + + v0_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v0_offset_f[i])); + v1_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v1_offset_f[i])); + v2_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v2_offset_f[i])); + v3_offset_f[i] = _mm256_castsi256_ps(_mm256_cvtps_epi32(v3_offset_f[i])); + } + + transpose8x18_ps(tx, ty, v0_offset_f[0], v1_offset_f[0], v2_offset_f[0], v3_offset_f[0], v0_offset_f[1], v1_offset_f[1], v2_offset_f[1], v3_offset_f[1], v0_offset_f[2], v1_offset_f[2], v2_offset_f[2], v3_offset_f[2], v0_offset_f[3], v1_offset_f[3], v2_offset_f[3], v3_offset_f[3]); + + _mm256_storeu_ps(offset_value_ptr, tx); + _mm256_storeu_ps(offset_value_ptr 
+ 8, ty); + offset_value_ptr += 16; + for (int i = 0; i < 4; i++) + { + _mm256_storeu_ps(offset_value_ptr, v0_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 8, v1_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 16, v2_offset_f[i]); + _mm256_storeu_ps(offset_value_ptr + 24, v3_offset_f[i]); + offset_value_ptr += 32; + } + + gridptr_x += 8; + gridptr_y += 8; + } + +#endif // __AVX__ + + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + sample_x = unormalize(src.w, sample_x); + sample_y = unormalize(src.h, sample_y); + + int x1 = floorf(sample_x); + int y1 = floorf(sample_y); + int x0 = x1 - 1; + int x2 = x1 + 1; + int x3 = x1 + 2; + + offset_value_ptr[0] = sample_x - static_cast<float>(x1); + offset_value_ptr[1] = sample_y - static_cast<float>(y1); + + x1 = get_coord(src.w, x1); + x0 = get_coord(src.w, x0); + x2 = get_coord(src.w, x2); + x3 = get_coord(src.w, x3); + + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool x2_in_range = (x2 > -1) & (x2 < src.w); + bool x3_in_range = (x3 > -1) & (x3 < src.w); + + int* offset_ptr = (int*)offset_value_ptr + 2; + + for (int i = 0; i < 4; i++) + { + int gy = y1 + i - 1; + gy = get_coord(src.h, gy); + int offset_y = gy * src.w; + + bool y_in_range = (gy > -1) & (gy < src.h); + + bool v0_in_bound = (x0_in_range & y_in_range); + bool v1_in_bound = (x1_in_range & y_in_range); + bool v2_in_bound = (x2_in_range & y_in_range); + bool v3_in_bound = (x3_in_range & y_in_range); + + offset_ptr[0] = v0_in_bound ? (offset_y + x0) * src.elempack : -1.0; + offset_ptr[1] = v1_in_bound ? (offset_y + x1) * src.elempack : -1.0; + offset_ptr[2] = v2_in_bound ? (offset_y + x2) * src.elempack : -1.0; + offset_ptr[3] = v3_in_bound ? (offset_y + x3) * src.elempack : -1.0; + + offset_ptr += 4; + } + + gridptr_x++; + gridptr_y++; + + offset_value_ptr += 18; + } + } +} diff --git a/src/layer/x86/gridsample_bilinear_apply_interpolation.h b/src/layer/x86/gridsample_bilinear_apply_interpolation.h new file mode 100644 index 000000000000..0af661b6ca0e --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_apply_interpolation.h @@ -0,0 +1,373 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
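The apply-interpolation paths in this new header blend every pair of taps branchlessly as fmadd(b, t, fnmadd(a, t, a)), which expands to a - a*t + b*t, i.e. an ordinary linear interpolation. A scalar sketch of the same arithmetic, matching the p16/p8/p4/p1 paths below (helper names are illustrative and not part of the patch):

static inline float lerp(float a, float b, float t)
{
    // fmadd(b, t, fnmadd(a, t, a)) == a - a*t + b*t == a + t * (b - a)
    return a + t * (b - a);
}

static inline float bilerp(float v00, float v01, float v10, float v11, float tx, float ty)
{
    // blend along x twice, then once along y
    float v0 = lerp(v00, v01, tx);
    float v1 = lerp(v10, v11, tx);
    return lerp(v0, v1, ty);
}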
+ +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void gridsample_2d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + __m512 v00_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v01_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v10_val = offset_ptr[2] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v11_val = offset_ptr[3] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + + __m512 value1 = _mm512_set1_ps(value_ptr[0]); + __m512 v0 = _mm512_fmadd_ps(v01_val, value1, _mm512_fnmadd_ps(v00_val, value1, v00_val)); + __m512 v1 = _mm512_fmadd_ps(v11_val, value1, _mm512_fnmadd_ps(v10_val, value1, v10_val)); + + __m512 value2 = _mm512_set1_ps(value_ptr[1]); + __m512 _v = _mm512_fmadd_ps(v1, value2, _mm512_fnmadd_ps(v0, value2, v0)); + _mm512_storeu_ps(dstptr, _v); + + dstptr += 16; + offset_value_ptr += 6; + } + } +} + +static void gridsample_3d_bilinear_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + __m512 v000_val = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + __m512 v001_val = offset_ptr[1] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[1]) : _mm512_set1_ps(0); + __m512 v010_val = offset_ptr[2] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[2]) : _mm512_set1_ps(0); + __m512 v011_val = offset_ptr[3] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[3]) : _mm512_set1_ps(0); + + __m512 v100_val = offset_ptr[4] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[4]) : _mm512_set1_ps(0); + __m512 v101_val = offset_ptr[5] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[5]) : _mm512_set1_ps(0); + __m512 v110_val = offset_ptr[6] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[6]) : _mm512_set1_ps(0); + __m512 v111_val = offset_ptr[7] >= 0 ? 
_mm512_loadu_ps(srcptr + offset_ptr[7]) : _mm512_set1_ps(0); + + __m512 value = _mm512_set1_ps(value_ptr[0]); + __m512 v00 = _mm512_fmadd_ps(v001_val, value, _mm512_fnmadd_ps(v000_val, value, v000_val)); + __m512 v01 = _mm512_fmadd_ps(v011_val, value, _mm512_fnmadd_ps(v010_val, value, v010_val)); + __m512 v10 = _mm512_fmadd_ps(v101_val, value, _mm512_fnmadd_ps(v100_val, value, v100_val)); + __m512 v11 = _mm512_fmadd_ps(v111_val, value, _mm512_fnmadd_ps(v110_val, value, v110_val)); + + value = _mm512_set1_ps(value_ptr[1]); + __m512 v0 = _mm512_fmadd_ps(v01, value, _mm512_fnmadd_ps(v00, value, v00)); + __m512 v1 = _mm512_fmadd_ps(v11, value, _mm512_fnmadd_ps(v10, value, v10)); + + value = _mm512_set1_ps(value_ptr[2]); + __m512 _v = _mm512_fmadd_ps(v1, value, _mm512_fnmadd_ps(v0, value, v0)); + _mm512_storeu_ps(dstptr, _v); + + dstptr += 16; + offset_value_ptr += 11; + } + } +} + +#endif // __AVX512F__ + +static void gridsample_2d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + __m256 v00_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v01_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v10_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v11_val = offset_ptr[3] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + + __m256 value1 = _mm256_set1_ps(value_ptr[0]); + __m256 v0 = _mm256_comp_fmadd_ps(v01_val, value1, _mm256_comp_fnmadd_ps(v00_val, value1, v00_val)); + __m256 v1 = _mm256_comp_fmadd_ps(v11_val, value1, _mm256_comp_fnmadd_ps(v10_val, value1, v10_val)); + + __m256 value2 = _mm256_set1_ps(value_ptr[1]); + __m256 _v = _mm256_comp_fmadd_ps(v1, value2, _mm256_comp_fnmadd_ps(v0, value2, v0)); + _mm256_storeu_ps(dstptr, _v); + + dstptr += 8; + offset_value_ptr += 6; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + __m256 v000_val = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + __m256 v001_val = offset_ptr[1] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[1]) : _mm256_set1_ps(0); + __m256 v010_val = offset_ptr[2] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[2]) : _mm256_set1_ps(0); + __m256 v011_val = offset_ptr[3] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[3]) : _mm256_set1_ps(0); + + __m256 v100_val = offset_ptr[4] >= 0 ? 
_mm256_loadu_ps(srcptr + offset_ptr[4]) : _mm256_set1_ps(0); + __m256 v101_val = offset_ptr[5] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[5]) : _mm256_set1_ps(0); + __m256 v110_val = offset_ptr[6] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[6]) : _mm256_set1_ps(0); + __m256 v111_val = offset_ptr[7] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[7]) : _mm256_set1_ps(0); + + __m256 value = _mm256_set1_ps(value_ptr[0]); + __m256 v00 = _mm256_comp_fmadd_ps(v001_val, value, _mm256_comp_fnmadd_ps(v000_val, value, v000_val)); + __m256 v01 = _mm256_comp_fmadd_ps(v011_val, value, _mm256_comp_fnmadd_ps(v010_val, value, v010_val)); + __m256 v10 = _mm256_comp_fmadd_ps(v101_val, value, _mm256_comp_fnmadd_ps(v100_val, value, v100_val)); + __m256 v11 = _mm256_comp_fmadd_ps(v111_val, value, _mm256_comp_fnmadd_ps(v110_val, value, v110_val)); + + value = _mm256_set1_ps(value_ptr[1]); + __m256 v0 = _mm256_comp_fmadd_ps(v01, value, _mm256_comp_fnmadd_ps(v00, value, v00)); + __m256 v1 = _mm256_comp_fmadd_ps(v11, value, _mm256_comp_fnmadd_ps(v10, value, v10)); + + value = _mm256_set1_ps(value_ptr[2]); + __m256 _v = _mm256_comp_fmadd_ps(v1, value, _mm256_comp_fnmadd_ps(v0, value, v0)); + _mm256_storeu_ps(dstptr, _v); + + dstptr += 8; + offset_value_ptr += 11; + } + } +} +#endif // __AVX__ +static void gridsample_2d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + __m128 v00_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v01_val = offset_ptr[1] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v10_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v11_val = offset_ptr[3] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + + __m128 value1 = _mm_set1_ps(value_ptr[0]); + __m128 v0 = _mm_comp_fmadd_ps(v01_val, value1, _mm_comp_fnmadd_ps(v00_val, value1, v00_val)); + __m128 v1 = _mm_comp_fmadd_ps(v11_val, value1, _mm_comp_fnmadd_ps(v10_val, value1, v10_val)); + + __m128 value2 = _mm_set1_ps(value_ptr[1]); + __m128 _v = _mm_comp_fmadd_ps(v1, value2, _mm_comp_fnmadd_ps(v0, value2, v0)); + _mm_storeu_ps(dstptr, _v); + + dstptr += 4; + offset_value_ptr += 6; + } + } +} +static void gridsample_3d_bilinear_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + __m128 v000_val = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + __m128 v001_val = offset_ptr[1] >= 0 ? 
_mm_loadu_ps(srcptr + offset_ptr[1]) : _mm_set1_ps(0); + __m128 v010_val = offset_ptr[2] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[2]) : _mm_set1_ps(0); + __m128 v011_val = offset_ptr[3] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[3]) : _mm_set1_ps(0); + + __m128 v100_val = offset_ptr[4] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[4]) : _mm_set1_ps(0); + __m128 v101_val = offset_ptr[5] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[5]) : _mm_set1_ps(0); + __m128 v110_val = offset_ptr[6] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[6]) : _mm_set1_ps(0); + __m128 v111_val = offset_ptr[7] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[7]) : _mm_set1_ps(0); + + __m128 value = _mm_set1_ps(value_ptr[0]); + __m128 v00 = _mm_comp_fmadd_ps(v001_val, value, _mm_comp_fnmadd_ps(v000_val, value, v000_val)); + __m128 v01 = _mm_comp_fmadd_ps(v011_val, value, _mm_comp_fnmadd_ps(v010_val, value, v010_val)); + __m128 v10 = _mm_comp_fmadd_ps(v101_val, value, _mm_comp_fnmadd_ps(v100_val, value, v100_val)); + __m128 v11 = _mm_comp_fmadd_ps(v111_val, value, _mm_comp_fnmadd_ps(v110_val, value, v110_val)); + + value = _mm_set1_ps(value_ptr[1]); + __m128 v0 = _mm_comp_fmadd_ps(v01, value, _mm_comp_fnmadd_ps(v00, value, v00)); + __m128 v1 = _mm_comp_fmadd_ps(v11, value, _mm_comp_fnmadd_ps(v10, value, v10)); + + value = _mm_set1_ps(value_ptr[2]); + __m128 _v = _mm_comp_fmadd_ps(v1, value, _mm_comp_fnmadd_ps(v0, value, v0)); + _mm_storeu_ps(dstptr, _v); + + dstptr += 4; + offset_value_ptr += 11; + } + } +} +#pragma fenv_access(off) + +#pragma float_control(precise, off) +#endif // __SSE2__ + +static void gridsample_2d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int grid_size = outw * outh; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int x = 0; x < grid_size; x++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 4; + + float v00 = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float v01 = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float v10 = offset_ptr[2] >= 0 ? *(srcptr + offset_ptr[2]) : 0; + float v11 = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; + + float v0 = v00 * (1 - value_ptr[0]) + v01 * value_ptr[0]; + float v1 = v10 * (1 - value_ptr[0]) + v11 * value_ptr[0]; + + *dstptr = v0 * (1 - value_ptr[1]) + v1 * value_ptr[1]; + + dstptr++; + offset_value_ptr += 6; + } + } +} + +static void gridsample_3d_bilinear_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const float* offset_value_ptr = offset_value.channel(0); + + for (int x = 0; x < grid_size; x++) + { + const int* offset_ptr = (int*)offset_value_ptr; + const float* value_ptr = offset_value_ptr + 8; + + float v000 = offset_ptr[0] >= 0 ? *(srcptr + offset_ptr[0]) : 0; + float v001 = offset_ptr[1] >= 0 ? *(srcptr + offset_ptr[1]) : 0; + float v010 = offset_ptr[2] >= 0 ? 
*(srcptr + offset_ptr[2]) : 0; + float v011 = offset_ptr[3] >= 0 ? *(srcptr + offset_ptr[3]) : 0; + + float v100 = offset_ptr[4] >= 0 ? *(srcptr + offset_ptr[4]) : 0; + float v101 = offset_ptr[5] >= 0 ? *(srcptr + offset_ptr[5]) : 0; + float v110 = offset_ptr[6] >= 0 ? *(srcptr + offset_ptr[6]) : 0; + float v111 = offset_ptr[7] >= 0 ? *(srcptr + offset_ptr[7]) : 0; + + float v00 = v000 * (1 - value_ptr[0]) + v001 * value_ptr[0]; + float v01 = v010 * (1 - value_ptr[0]) + v011 * value_ptr[0]; + float v10 = v100 * (1 - value_ptr[0]) + v101 * value_ptr[0]; + float v11 = v110 * (1 - value_ptr[0]) + v111 * value_ptr[0]; + + float v0 = v00 * (1 - value_ptr[1]) + v01 * value_ptr[1]; + float v1 = v10 * (1 - value_ptr[1]) + v11 * value_ptr[1]; + + *dstptr = v0 * (1 - value_ptr[2]) + v1 * value_ptr[2]; + + dstptr++; + offset_value_ptr += 11; + } + } +} \ No newline at end of file diff --git a/src/layer/x86/gridsample_bilinear_compute_blob.h b/src/layer/x86/gridsample_bilinear_compute_blob.h new file mode 100644 index 000000000000..f78e017c1c75 --- /dev/null +++ b/src/layer/x86/gridsample_bilinear_compute_blob.h @@ -0,0 +1,623 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
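The compute-blob pass added below packs, for every output sample, the source tap offsets and the interpolation fractions into one flat float stream that the apply pass above walks with a fixed stride (6 floats per 2D bilinear sample, 11 per 3D sample); the offsets are ints stored in the same stream, pre-scaled by src.elempack, with -1 marking an out-of-bounds tap that is later read as zero. A hypothetical plain-struct view of that record, for illustration only:

struct bilinear_sample_2d // stride 6: offset_value_ptr += 6
{
    int   offset[4];  // v00, v01, v10, v11; -1 if out of bounds
    float alpha;      // x fraction, sample_x - x0
    float beta;       // y fraction, sample_y - y0
};

struct bilinear_sample_3d // stride 11: offset_value_ptr += 11
{
    int   offset[8];  // v000 ... v111; -1 if out of bounds
    float alpha;      // x fraction
    float beta;       // y fraction
    float gamma;      // z fraction
};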
+ +template<GridSample::PaddingMode pd, bool align_corner> +void gridsample_2d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) +{ + const int grid_size = grid.w * grid.h; + + float* offset_value_ptr = offset_value.channel(0); + + grid_sample_unormalize<align_corner> unormalize; + compute_coord<pd, align_corner> get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int x = 0; +#if __AVX__ + for (; x + 15 < grid_size; x += 16) + { + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + transpose2x8_ps(gx, gy); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(src.elempack)); + __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(src.elempack)); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack), nw_offset); + __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(src.elempack)); + + nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); + ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); + sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); + se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); + + nw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(nw_offset)); + ne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(ne_offset)); + sw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(sw_offset)); + se_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(se_offset)); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); + + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); + + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + _mm256_storeu_ps(offset_value_ptr + 40, beta); + + gridptr += 16; + offset_value_ptr += 48; + } +#endif // __AVX__ + + for (; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = 
get_coord(src.h, sample_y); + + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x0_in_bound = (x0 > -1) & (x0 < src.w); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + bool in_bound_00 = x0_in_bound & y0_in_bound; + bool in_bound_01 = x1_in_bound & y0_in_bound; + bool in_bound_10 = x0_in_bound & y1_in_bound; + bool in_bound_11 = x1_in_bound & y1_in_bound; + + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 4; + + offset_ptr[0] = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[1] = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[2] = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0; + offset_ptr[3] = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0; + + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + + gridptr += 2; + offset_value_ptr += 6; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + __m256 nw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w), _mm256_set1_ps(src.elempack)); + __m256 ne_offset = _mm256_add_ps(nw_offset, _mm256_set1_ps(src.elempack)); + __m256 sw_offset = _mm256_comp_fmadd_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack), nw_offset); + __m256 se_offset = _mm256_add_ps(sw_offset, _mm256_set1_ps(src.elempack)); + + nw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), nw_offset, v00_in_range); + ne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), ne_offset, v01_in_range); + sw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), sw_offset, v10_in_range); + se_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), se_offset, v11_in_range); + + nw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(nw_offset)); + ne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(ne_offset)); + sw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(sw_offset)); + se_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(se_offset)); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + + 
transpose8x6_ps(nw_offset, ne_offset, sw_offset, se_offset, alpha, beta); + + _mm256_storeu_ps(offset_value_ptr, nw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, ne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, sw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, se_offset); + + _mm256_storeu_ps(offset_value_ptr + 32, alpha); + _mm256_storeu_ps(offset_value_ptr + 40, beta); + + gridptr_x += 8; + gridptr_y += 8; + offset_value_ptr += 48; + } + +#endif // __AVX__ + + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int x1 = x0 + 1; + int y1 = y0 + 1; + + bool x0_in_bound = (x0 > -1) & (x0 < src.w); + bool x1_in_bound = (x1 > -1) & (x1 < src.w); + bool y0_in_bound = (y0 > -1) & (y0 < src.h); + bool y1_in_bound = (y1 > -1) & (y1 < src.h); + + bool in_bound_00 = x0_in_bound & y0_in_bound; + bool in_bound_01 = x1_in_bound & y0_in_bound; + bool in_bound_10 = x0_in_bound & y1_in_bound; + bool in_bound_11 = x1_in_bound & y1_in_bound; + + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 4; + + offset_ptr[0] = in_bound_00 ? (x0 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[1] = in_bound_01 ? (x1 + y0 * src.w) * src.elempack : -1.0; + offset_ptr[2] = in_bound_10 ? (x0 + y1 * src.w) * src.elempack : -1.0; + offset_ptr[3] = in_bound_11 ? (x1 + y1 * src.w) * src.elempack : -1.0; + + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + + gridptr_x++; + gridptr_y++; + offset_value_ptr += 6; + } + } +} + +template<GridSample::PaddingMode pd, bool align_corner> +void gridsample_3d_bilinear_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) +{ + const int grid_size = grid.w * grid.h * grid.d; + + float* offset_value_ptr = offset_value.channel(0); + + grid_sample_unormalize<align_corner> unormalize; + compute_coord<pd, align_corner> get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int x = 0; +#if __AVX__ + for (; x + 23 < grid_size; x += 24) + { + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + transpose3x8_ps(gx, gy, gz); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + __m256 z0_in_range = 
_mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_set1_ps(src.elempack)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(src.elempack)); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(src.elempack)); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(src.elempack), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, _mm256_set1_ps(src.elempack)); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(src.elempack)); + + tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); + tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); + tsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tsw_offset, v010_in_range); + tse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tse_offset, v011_in_range); + + bnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bnw_offset, v100_in_range); + bne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bne_offset, v101_in_range); + bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); + bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); + + tnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tnw_offset)); + tne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tne_offset)); + tsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tsw_offset)); + tse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tse_offset)); + + bnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bnw_offset)); + bne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bne_offset)); + bsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bsw_offset)); + bse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bse_offset)); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); + + 
_mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); + + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); + + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); + + gridptr += 24; + + offset_value_ptr += 88; + } +#endif // __AVX__ + + for (; x < grid_size; x += 3) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int z0 = (int)floorf(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool in_bound_000 = v00_in_range & z0_in_range; + bool in_bound_001 = v01_in_range & z0_in_range; + bool in_bound_010 = v10_in_range & z0_in_range; + bool in_bound_011 = v11_in_range & z0_in_range; + + bool in_bound_100 = v00_in_range & z1_in_range; + bool in_bound_101 = v01_in_range & z1_in_range; + bool in_bound_110 = v10_in_range & z1_in_range; + bool in_bound_111 = v11_in_range & z1_in_range; + + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 8; + + offset_ptr[0] = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[1] = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[2] = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[3] = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + + offset_ptr[4] = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[5] = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[6] = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[7] = in_bound_111 ? 
(x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + value_ptr[2] = sample_z - z0; + + gridptr += 3; + offset_value_ptr += 11; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); + + __m256 x_w = _mm256_floor_ps(gx); + __m256 y_n = _mm256_floor_ps(gy); + __m256 z_t = _mm256_floor_ps(gz); + + __m256 x1 = _mm256_add_ps(x_w, _mm256_set1_ps(1)); + __m256 y1 = _mm256_add_ps(y_n, _mm256_set1_ps(1)); + __m256 z1 = _mm256_add_ps(z_t, _mm256_set1_ps(1)); + + __m256 x0_in_range = _mm256_and_ps(_mm256_cmp_ps(x_w, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x_w, _CMP_GT_OS)); + __m256 x1_in_range = _mm256_and_ps(_mm256_cmp_ps(x1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), x1, _CMP_GT_OS)); + __m256 y0_in_range = _mm256_and_ps(_mm256_cmp_ps(y_n, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y_n, _CMP_GT_OS)); + __m256 y1_in_range = _mm256_and_ps(_mm256_cmp_ps(y1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), y1, _CMP_GT_OS)); + __m256 z0_in_range = _mm256_and_ps(_mm256_cmp_ps(z_t, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z_t, _CMP_GT_OS)); + __m256 z1_in_range = _mm256_and_ps(_mm256_cmp_ps(z1, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), z1, _CMP_GT_OS)); + + __m256 v000_in_range, v010_in_range, v100_in_range, v110_in_range, v001_in_range, v011_in_range, v101_in_range, v111_in_range; + { + __m256 v00_in_range = _mm256_and_ps(x0_in_range, y0_in_range); + __m256 v01_in_range = _mm256_and_ps(x1_in_range, y0_in_range); + __m256 v10_in_range = _mm256_and_ps(x0_in_range, y1_in_range); + __m256 v11_in_range = _mm256_and_ps(x1_in_range, y1_in_range); + + v000_in_range = _mm256_and_ps(v00_in_range, z0_in_range); + v001_in_range = _mm256_and_ps(v01_in_range, z0_in_range); + v010_in_range = _mm256_and_ps(v10_in_range, z0_in_range); + v011_in_range = _mm256_and_ps(v11_in_range, z0_in_range); + + v100_in_range = _mm256_and_ps(v00_in_range, z1_in_range); + v101_in_range = _mm256_and_ps(v01_in_range, z1_in_range); + v110_in_range = _mm256_and_ps(v10_in_range, z1_in_range); + v111_in_range = _mm256_and_ps(v11_in_range, z1_in_range); + } + + __m256 tnw_offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), z_t, + _mm256_comp_fmadd_ps(y_n, _mm256_set1_ps(src.w), x_w)), + _mm256_set1_ps(src.elempack)); + __m256 tne_offset = _mm256_add_ps(tnw_offset, _mm256_set1_ps(src.elempack)); + __m256 tsw_offset = _mm256_add_ps(tnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 tse_offset = _mm256_add_ps(tsw_offset, _mm256_set1_ps(src.elempack)); + + __m256 bnw_offset = _mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), _mm256_set1_ps(src.elempack), tnw_offset); + __m256 bne_offset = _mm256_add_ps(bnw_offset, 
_mm256_set1_ps(src.elempack)); + __m256 bsw_offset = _mm256_add_ps(bnw_offset, _mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.elempack))); + __m256 bse_offset = _mm256_add_ps(bsw_offset, _mm256_set1_ps(src.elempack)); + + tnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tnw_offset, v000_in_range); + tne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tne_offset, v001_in_range); + tsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tsw_offset, v010_in_range); + tse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), tse_offset, v011_in_range); + + bnw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bnw_offset, v100_in_range); + bne_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bne_offset, v101_in_range); + bsw_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bsw_offset, v110_in_range); + bse_offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), bse_offset, v111_in_range); + + tnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tnw_offset)); + tne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tne_offset)); + tsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tsw_offset)); + tse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(tse_offset)); + + bnw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bnw_offset)); + bne_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bne_offset)); + bsw_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bsw_offset)); + bse_offset = _mm256_castsi256_ps(_mm256_cvtps_epi32(bse_offset)); + + __m256 alpha = _mm256_sub_ps(gx, x_w); + __m256 beta = _mm256_sub_ps(gy, y_n); + __m256 gamma = _mm256_sub_ps(gz, z_t); + + transpose8x11_ps(tnw_offset, tne_offset, tsw_offset, tse_offset, bnw_offset, bne_offset, bsw_offset, bse_offset, alpha, beta, gamma); + + _mm256_storeu_ps(offset_value_ptr, tnw_offset); + _mm256_storeu_ps(offset_value_ptr + 8, tne_offset); + _mm256_storeu_ps(offset_value_ptr + 16, tsw_offset); + _mm256_storeu_ps(offset_value_ptr + 24, tse_offset); + + _mm256_storeu_ps(offset_value_ptr + 32, bnw_offset); + _mm256_storeu_ps(offset_value_ptr + 40, bne_offset); + _mm256_storeu_ps(offset_value_ptr + 48, bsw_offset); + _mm256_storeu_ps(offset_value_ptr + 56, bse_offset); + + _mm256_storeu_ps(offset_value_ptr + 64, alpha); + _mm256_storeu_ps(offset_value_ptr + 72, beta); + _mm256_storeu_ps(offset_value_ptr + 80, gamma); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; + + offset_value_ptr += 88; + } +#endif // __AVX__ + + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = (int)floorf(sample_x); + int y0 = (int)floorf(sample_y); + int z0 = (int)floorf(sample_z); + int x1 = x0 + 1; + int y1 = y0 + 1; + int z1 = z0 + 1; + + bool x0_in_range = (x0 > -1) & (x0 < src.w); + bool y0_in_range = (y0 > -1) & (y0 < src.h); + bool z0_in_range = (z0 > -1) & (z0 < src.d); + bool x1_in_range = (x1 > -1) & (x1 < src.w); + bool y1_in_range = (y1 > -1) & (y1 < src.h); + bool z1_in_range = (z1 > -1) & (z1 < src.d); + + bool v00_in_range = x0_in_range & y0_in_range; + bool v01_in_range = x1_in_range & y0_in_range; + bool v10_in_range = x0_in_range & y1_in_range; + bool v11_in_range = x1_in_range & y1_in_range; + + bool in_bound_000 = v00_in_range & z0_in_range; + bool in_bound_001 = v01_in_range & z0_in_range; + bool in_bound_010 = 
v10_in_range & z0_in_range; + bool in_bound_011 = v11_in_range & z0_in_range; + + bool in_bound_100 = v00_in_range & z1_in_range; + bool in_bound_101 = v01_in_range & z1_in_range; + bool in_bound_110 = v10_in_range & z1_in_range; + bool in_bound_111 = v11_in_range & z1_in_range; + + int* offset_ptr = (int*)offset_value_ptr; + float* value_ptr = offset_value_ptr + 8; + + offset_ptr[0] = in_bound_000 ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[1] = in_bound_001 ? (x1 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[2] = in_bound_010 ? (x0 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[3] = in_bound_011 ? (x1 + y1 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + + offset_ptr[4] = in_bound_100 ? (x0 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[5] = in_bound_101 ? (x1 + y0 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[6] = in_bound_110 ? (x0 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + offset_ptr[7] = in_bound_111 ? (x1 + y1 * src.w + z1 * src.w * src.h) * src.elempack : -1.0; + + value_ptr[0] = sample_x - x0; + value_ptr[1] = sample_y - y0; + value_ptr[2] = sample_z - z0; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + offset_value_ptr += 11; + } + } +} diff --git a/src/layer/x86/gridsample_compute_blob.h b/src/layer/x86/gridsample_compute_blob.h new file mode 100644 index 000000000000..4fb41f2cc247 --- /dev/null +++ b/src/layer/x86/gridsample_compute_blob.h @@ -0,0 +1,145 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
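+// Helper functors shared by the bilinear/bicubic/nearest blob builders below.
+// grid_sample_unormalize maps a normalized grid coordinate in [-1, 1] to an
+// input pixel coordinate:
+//   align_corner == 1:  x = (gx + 1) / 2 * (w - 1)
+//   align_corner == 0:  x = ((gx + 1) * w - 1) / 2
+// compute_coord then applies the padding rule: BORDER clamps to [0, w - 1],
+// REFLECTION mirrors the coordinate back into range, and ZEROS leaves it
+// untouched so out-of-bound taps can later be flagged with a -1 offset.
+// Each functor provides a scalar overload and, under __AVX__, an __m256 one.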
+ +#include "x86_usability.h" + +template +struct grid_sample_unormalize; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_mul_ps(_mm256_div_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), _mm256_set1_ps(2)), _mm256_sub_ps(length, _mm256_set1_ps(1))); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return (coord + 1) / 2.f * (length - 1); + } +}; + +template<> +struct grid_sample_unormalize +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return _mm256_div_ps(_mm256_comp_fmsub_ps(_mm256_add_ps(coord, _mm256_set1_ps(1)), length, _mm256_set1_ps(1)), _mm256_set1_ps(2)); + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return ((coord + 1) * length - 1) / 2.f; + } +}; + +template +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return coord; + } +}; + +template +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + coord = abs256_ps(coord); + + __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, border_x)); + coord = _mm256_sub_ps(border_x, reflectx_v); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = fabs(coord); + coord = (length - 1) - fabs(coord - (length - 1)); + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +template<> +struct compute_coord +{ +#if __AVX__ + __m256 operator()(__m256 length, __m256 coord) + { + const __m256 border_x = _mm256_sub_ps(length, _mm256_set1_ps(1)); + + __m256 v0p5fp8 = _mm256_set1_ps(0.5f); + coord = _mm256_add_ps(coord, v0p5fp8); + + coord = abs256_ps(coord); + + __m256 reflectx_v = abs256_ps(_mm256_sub_ps(coord, length)); + coord = _mm256_sub_ps(length, reflectx_v); + + coord = _mm256_sub_ps(coord, v0p5fp8); + + _mm256_sub_ps(coord, v0p5fp8); + + coord = _mm256_min_ps(border_x, _mm256_max_ps(coord, _mm256_setzero_ps())); + + return coord; + } +#endif // __AVX__ + float operator()(int length, float coord) + { + coord = fabs(coord + 0.5f); + coord = length - fabs(coord - length) - 0.5; + + return std::min(length - 1.0f, std::max(coord, 0.0f)); + } +}; + +#include "gridsample_bilinear_compute_blob.h" +#include "gridsample_bicubic_compute_blob.h" +#include "gridsample_nearest_compute_blob.h" \ No newline at end of file diff --git a/src/layer/x86/gridsample_nearest_apply_interpolation.h b/src/layer/x86/gridsample_nearest_apply_interpolation.h new file mode 100644 index 000000000000..e84cdc7de25e --- /dev/null +++ b/src/layer/x86/gridsample_nearest_apply_interpolation.h @@ -0,0 +1,126 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. 
You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ +static void gridsample_nearest_apply_interpolation_p16(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m512 _v = offset_ptr[0] >= 0 ? _mm512_loadu_ps(srcptr + offset_ptr[0]) : _mm512_set1_ps(0); + offset_ptr++; + + _mm512_storeu_ps(dstptr, _v); + dstptr += 16; + } + } +} +#endif // __AVX512F__ + +static void gridsample_nearest_apply_interpolation_p8(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m256 _v = offset_ptr[0] >= 0 ? _mm256_loadu_ps(srcptr + offset_ptr[0]) : _mm256_set1_ps(0); + offset_ptr++; + + _mm256_storeu_ps(dstptr, _v); + dstptr += 8; + } + } +} +#endif // __AVX__ +static void gridsample_nearest_apply_interpolation_p4(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset_value.channel(0); + + for (int i = 0; i < grid_size; i++) + { + __m128 _v = offset_ptr[0] >= 0 ? _mm_loadu_ps(srcptr + offset_ptr[0]) : _mm_set1_ps(0); + offset_ptr++; + + _mm_storeu_ps(dstptr, _v); + dstptr += 4; + } + } +} + +#endif // __SSE2__ + +static void gridsample_nearest_apply_interpolation_p1(const Mat& src, Mat& dst, const Mat& offset_value, const Option& opt) +{ + const int channels = dst.c; + const int outw = dst.w; + const int outh = dst.h; + const int outd = dst.d; + const int grid_size = outw * outh * outd; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const float* srcptr = src.channel(q); + float* dstptr = dst.channel(q); + + const int* offset_ptr = offset_value.channel(0); + + for (int x = 0; x < grid_size; x++) + { + *dstptr = offset_ptr[0] >= 0 ? 
*(srcptr + offset_ptr[0]) : 0; + + offset_ptr++; + dstptr++; + } + } +} diff --git a/src/layer/x86/gridsample_nearest_compute_blob.h b/src/layer/x86/gridsample_nearest_compute_blob.h new file mode 100644 index 000000000000..a7a12066d216 --- /dev/null +++ b/src/layer/x86/gridsample_nearest_compute_blob.h @@ -0,0 +1,315 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +template +void gridsample_2d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) +{ + const int grid_size = grid.w * grid.h; + + float* offset_ptr = offset_value.channel(0); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int x = 0; +#if __AVX__ + for (; x + 15 < grid_size; x += 16) + { + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + + transpose2x8_ps(gx, gy); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); + + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); + + _mm256_storeu_ps(offset_ptr, offset); + + gridptr += 16; + offset_ptr += 8; + } + +#endif // __AVX__ + + for (; x < grid_size; x += 2) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); + + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? 
(x0 + y0 * src.w) * src.elempack : -1.0; + + gridptr += 2; + offset_ptr++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx), _mm256_set1_ps(src.elempack)); + + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); + + _mm256_storeu_ps(offset_ptr, offset); + + gridptr_x += 8; + gridptr_y += 8; + offset_ptr += 8; + } + +#endif // __AVX__ + + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h)); + + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? 
(x0 + y0 * src.w) * src.elempack : -1.0; + + gridptr_x++; + gridptr_y++; + + offset_ptr++; + } + } +} + +template +void gridsample_3d_nearest_compute_blob(const Mat& src, const Mat& grid, Mat& offset_value, int permute_fusion) +{ + const int grid_size = grid.w * grid.h * grid.d; + + float* offset_ptr = offset_value.channel(0); + + grid_sample_unormalize unormalize; + compute_coord get_coord; + + if (permute_fusion == 0) + { + for (int y = 0; y < grid.c; y++) + { + const float* gridptr = grid.channel(y); + int x = 0; +#if __AVX__ + for (; x + 23 < grid_size; x += 24) + { + __m256 gx = _mm256_loadu_ps(gridptr); + __m256 gy = _mm256_loadu_ps(gridptr + 8); + __m256 gz = _mm256_loadu_ps(gridptr + 16); + + transpose3x8_ps(gx, gy, gz); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_set1_ps(src.elempack)); + + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); + + _mm256_storeu_ps(offset_ptr, offset); + + gridptr += 24; + offset_ptr += 8; + } + +#endif // __AVX__ + + for (; x < grid_size; x += 3) + { + float sample_x = *gridptr; + float sample_y = *(gridptr + 1); + float sample_z = *(gridptr + 2); + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + int z0 = static_cast(floorf(sample_z + 0.5f)); + + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); + + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? 
(x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + + gridptr += 3; + offset_ptr++; + } + } + } + else + { + const float* gridptr_x = grid.channel(0); + const float* gridptr_y = grid.channel(1); + const float* gridptr_z = grid.channel(2); + + int x = 0; +#if __AVX__ + for (; x + 7 < grid_size; x += 8) + { + __m256 gx = _mm256_loadu_ps(gridptr_x); + __m256 gy = _mm256_loadu_ps(gridptr_y); + __m256 gz = _mm256_loadu_ps(gridptr_z); + + gx = unormalize(_mm256_set1_ps(src.w), gx); + gx = get_coord(_mm256_set1_ps(src.w), gx); + + gy = unormalize(_mm256_set1_ps(src.h), gy); + gy = get_coord(_mm256_set1_ps(src.h), gy); + + gz = unormalize(_mm256_set1_ps(src.d), gz); + gz = get_coord(_mm256_set1_ps(src.d), gz); + + gx = _mm256_floor_ps(_mm256_add_ps(gx, _mm256_set1_ps(0.5f))); + gy = _mm256_floor_ps(_mm256_add_ps(gy, _mm256_set1_ps(0.5f))); + gz = _mm256_floor_ps(_mm256_add_ps(gz, _mm256_set1_ps(0.5f))); + + __m256 v_in_range = _mm256_and_ps(_mm256_and_ps(_mm256_cmp_ps(gx, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.w), gx, _CMP_GT_OS)), + _mm256_and_ps(_mm256_cmp_ps(gy, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.h), gy, _CMP_GT_OS))); + v_in_range = _mm256_and_ps(v_in_range, _mm256_and_ps(_mm256_cmp_ps(gz, _mm256_set1_ps(-1), _CMP_GT_OS), _mm256_cmp_ps(_mm256_set1_ps(src.d), gz, _CMP_GT_OS))); + + __m256 offset = _mm256_mul_ps(_mm256_comp_fmadd_ps(_mm256_mul_ps(_mm256_set1_ps(src.w), _mm256_set1_ps(src.h)), gz, + _mm256_comp_fmadd_ps(gy, _mm256_set1_ps(src.w), gx)), + _mm256_set1_ps(src.elempack)); + + offset = _mm256_blendv_ps(_mm256_set1_ps(-1.0f), _mm256_castsi256_ps(_mm256_cvtps_epi32(offset)), v_in_range); + + _mm256_storeu_ps(offset_ptr, offset); + + gridptr_x += 8; + gridptr_y += 8; + gridptr_z += 8; + + offset_ptr += 8; + } + +#endif // __AVX__ + + for (; x < grid_size; x++) + { + float sample_x = *gridptr_x; + float sample_y = *gridptr_y; + float sample_z = *gridptr_z; + + sample_x = unormalize(src.w, sample_x); + sample_x = get_coord(src.w, sample_x); + + sample_y = unormalize(src.h, sample_y); + sample_y = get_coord(src.h, sample_y); + + sample_z = unormalize(src.d, sample_z); + sample_z = get_coord(src.d, sample_z); + + int x0 = static_cast(floorf(sample_x + 0.5f)); + int y0 = static_cast(floorf(sample_y + 0.5f)); + int z0 = static_cast(floorf(sample_z + 0.5f)); + + bool in_bound = ((x0 > -1) & (x0 < src.w) & (y0 > -1) & (y0 < src.h) & (z0 > -1) & (z0 < src.d)); + + int* iptr = (int*)offset_ptr; + *iptr = in_bound ? (x0 + y0 * src.w + z0 * src.w * src.h) * src.elempack : -1.0; + + gridptr_x++; + gridptr_y++; + gridptr_z++; + + offset_ptr++; + } + } +} diff --git a/src/layer/x86/gridsample_x86.cpp b/src/layer/x86/gridsample_x86.cpp new file mode 100644 index 000000000000..004bc4d0895b --- /dev/null +++ b/src/layer/x86/gridsample_x86.cpp @@ -0,0 +1,455 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations under the License. + +#include "gridsample_x86.h" + +#if __SSE2__ +#include +#include "sse_mathfun.h" +#if __AVX__ +#include +#include "avx_mathfun.h" +#if __AVX512F__ +#include "avx512_mathfun.h" +#endif // __AVX512F__ +#endif // __AVX__ +#endif // __SSE2__ +#include "x86_usability.h" +#include "cpu.h" + +namespace ncnn { + +#include "gridsample_compute_blob.h" +#include "gridsample_bilinear_apply_interpolation.h" +#include "gridsample_bicubic_apply_interpolation.h" +#include "gridsample_nearest_apply_interpolation.h" + +GridSample_x86::GridSample_x86() +{ +#if __SSE2__ + support_packing = true; +#endif // __SSE2__ +} + +int GridSample_x86::forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const +{ + const Mat& bottom_blob = bottom_blobs[0]; + const Mat& grid = bottom_blobs[1]; + Mat& top_blob = top_blobs[0]; + int elempack = bottom_blob.elempack; + + int channels = bottom_blob.c; + int dims = bottom_blob.dims; + size_t elemsize = bottom_blob.elemsize; + + int outw, outh, outd; + Mat offset_value_blob; + + Mat grid_p1; + if (grid.elempack != 1) + { + convert_packing(grid, grid_p1, 1, opt); + } + else + { + grid_p1 = grid; + } + + if (dims == 3) + { + outw = permute_fusion == 0 ? grid_p1.h : grid_p1.w; + outh = permute_fusion == 0 ? grid_p1.c : grid_p1.h; + + top_blob.create(outw, outh, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == GridSample::Interpolation_BILINEAR) + { + offset_value_blob.create(outw, outh, elemsize * 6, 6, opt.workspace_allocator); + if (offset_value_blob.empty()) + return -100; + + if (padding_mode == GridSample::Padding_ZEROS) + { + if (align_corner == 0) + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_BORDER) + { + if (align_corner == 0) + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_REFLECTION) + { + if (align_corner == 0) + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + + if (sample_type == GridSample::Interpolation_NEAREST) + { + offset_value_blob.create(outw, outh, 1, elemsize, 1, opt.workspace_allocator); + if (offset_value_blob.empty()) + return -100; + + if (padding_mode == GridSample::Padding_ZEROS) + { + if (align_corner == 0) + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_BORDER) + { + if (align_corner == 0) + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_REFLECTION) + { + 
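+                // as in the branches above, align_corner picks between the two
+                // compile-time instantiations of the compute_blob helper, so the
+                // coordinate handling is resolved at compile time, not per pixel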
if (align_corner == 0) + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + + if (sample_type == GridSample::Interpolation_BICUBIC) + { + offset_value_blob.create(outw, outh, elemsize * 18, 18, opt.workspace_allocator); + if (offset_value_blob.empty()) + return -100; + + if (padding_mode == GridSample::Padding_ZEROS) + { + if (align_corner == 0) + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_BORDER) + { + if (align_corner == 0) + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_REFLECTION) + { + if (align_corner == 0) + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_2d_bicubic_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + } + + if (dims == 4) + { + outw = permute_fusion == 0 ? grid_p1.h : grid_p1.w; + outh = permute_fusion == 0 ? grid_p1.d : grid_p1.h; + outd = permute_fusion == 0 ? grid_p1.c : grid_p1.d; + + top_blob.create(outw, outh, outd, channels, elemsize, elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (sample_type == GridSample::Interpolation_BILINEAR) + { + offset_value_blob.create(outw, outh, outd, elemsize * 11, 11, opt.workspace_allocator); + if (offset_value_blob.empty()) + return -100; + + if (padding_mode == GridSample::Padding_ZEROS) + { + if (align_corner == 0) + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_BORDER) + { + if (align_corner == 0) + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_REFLECTION) + { + if (align_corner == 0) + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_bilinear_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + + if (sample_type == GridSample::Interpolation_NEAREST) + { + offset_value_blob.create(outw, outh, outd, 1, elemsize, 1, opt.workspace_allocator); + if (offset_value_blob.empty()) + return -100; + + if (padding_mode == GridSample::Padding_ZEROS) + { + if (align_corner == 0) + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == 
GridSample::Padding_BORDER) + { + if (align_corner == 0) + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else if (padding_mode == GridSample::Padding_REFLECTION) + { + if (align_corner == 0) + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + else + { + gridsample_3d_nearest_compute_blob(bottom_blob, grid_p1, offset_value_blob, permute_fusion); + } + } + else + { + NCNN_LOGE("gridsample padding_mode error\n"); + return -100; + } + } + + if (sample_type == 3) + { + NCNN_LOGE("unsupported bicubic when dims == 4"); + return -100; + } + } + +#if __SSE2__ +#if __AVX__ +#if __AVX512F__ + if (elempack == 16) + { + if (dims == 3) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_2d_bilinear_apply_interpolation_p16(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p16(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_BICUBIC) + { + gridsample_2d_bicubic_apply_interpolation_p16(bottom_blob, top_blob, offset_value_blob, opt); + } + } + else if (dims == 4) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_3d_bilinear_apply_interpolation_p16(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p16(bottom_blob, top_blob, offset_value_blob, opt); + } + } + } +#endif // __AVX512F__ + if (elempack == 8) + { + if (dims == 3) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_2d_bilinear_apply_interpolation_p8(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p8(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_BICUBIC) + { + gridsample_2d_bicubic_apply_interpolation_p8(bottom_blob, top_blob, offset_value_blob, opt); + } + } + else if (dims == 4) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_3d_bilinear_apply_interpolation_p8(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p8(bottom_blob, top_blob, offset_value_blob, opt); + } + } + } + +#endif // __AVX__ + if (elempack == 4) + { + if (dims == 3) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_2d_bilinear_apply_interpolation_p4(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p4(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_BICUBIC) + { + gridsample_2d_bicubic_apply_interpolation_p4(bottom_blob, top_blob, offset_value_blob, opt); + } + } + else if (dims == 4) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_3d_bilinear_apply_interpolation_p4(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p4(bottom_blob, top_blob, offset_value_blob, opt); + } + } + } + +#endif // 
__SSE2__ + + if (elempack == 1) + { + if (dims == 3) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_2d_bilinear_apply_interpolation_p1(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p1(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_BICUBIC) + { + gridsample_2d_bicubic_apply_interpolation_p1(bottom_blob, top_blob, offset_value_blob, opt); + } + } + else if (dims == 4) + { + if (sample_type == GridSample::Interpolation_BILINEAR) + { + gridsample_3d_bilinear_apply_interpolation_p1(bottom_blob, top_blob, offset_value_blob, opt); + } + else if (sample_type == GridSample::Interpolation_NEAREST) + { + gridsample_nearest_apply_interpolation_p1(bottom_blob, top_blob, offset_value_blob, opt); + } + } + } + + return 0; +} + +} // namespace ncnn diff --git a/src/layer/x86/gridsample_x86.h b/src/layer/x86/gridsample_x86.h new file mode 100644 index 000000000000..826414eefc9b --- /dev/null +++ b/src/layer/x86/gridsample_x86.h @@ -0,0 +1,32 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
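+// The x86 GridSample runs in two passes: the templated compute_blob helpers
+// turn the grid into per-output-pixel source offsets (-1 for out-of-bound taps)
+// plus interpolation weights, and the apply_interpolation_p16/p8/p4/p1 kernels
+// then gather and blend one packed channel at a time.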
+ +#ifndef LAYER_GRIDSAMPLE_X86_H +#define LAYER_GRIDSAMPLE_X86_H + +#include "gridsample.h" + +namespace ncnn { + +class GridSample_x86 : virtual public GridSample +{ +public: + GridSample_x86(); + + virtual int forward(const std::vector& bottom_blobs, std::vector& top_blobs, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_GRIDSAMPLE_X86_H diff --git a/src/layer/x86/shufflechannel_x86.cpp b/src/layer/x86/shufflechannel_x86.cpp index f4326289b8b0..8afb22b2e2ed 100644 --- a/src/layer/x86/shufflechannel_x86.cpp +++ b/src/layer/x86/shufflechannel_x86.cpp @@ -343,9 +343,9 @@ int ShuffleChannel_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Opt for (int i = 0; i < size; i++) { __m256 _p0 = _mm256_loadu_ps(ptr0); - // macro `_mm256_loadu2_m128` is declared in Intel® Intrinsics Guide but somehow missed in - // __m256 _p1 = _mm256_loadu2_m128(ptr2, ptr1); - __m256 _p1 = _mm256_set_m128(_mm_loadu_ps(ptr2), _mm_loadu_ps(ptr1)); + + __m256 _p1 = _mm256_castps128_ps256(_mm_loadu_ps(ptr1)); + _p1 = _mm256_insertf128_ps(_p1, _mm_loadu_ps(ptr2), 1); __m256 _lo = _mm256_unpacklo_ps(_p0, _p1); __m256 _hi = _mm256_unpackhi_ps(_p0, _p1); diff --git a/src/layer/x86/x86_usability.h b/src/layer/x86/x86_usability.h index 401573584c6e..9cb826fa2b19 100644 --- a/src/layer/x86/x86_usability.h +++ b/src/layer/x86/x86_usability.h @@ -41,6 +41,83 @@ static NCNN_FORCEINLINE signed char float2int8(float v) } #if __SSE2__ +static NCNN_FORCEINLINE void transpose4x8_epi32(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) +{ + __m128i _tmp0 = _mm_unpacklo_epi32(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi32(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi32(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi32(_r2, _r3); + __m128i _tmp4 = _mm_unpacklo_epi32(_r4, _r5); + __m128i _tmp5 = _mm_unpackhi_epi32(_r4, _r5); + __m128i _tmp6 = _mm_unpacklo_epi32(_r6, _r7); + __m128i _tmp7 = _mm_unpackhi_epi32(_r6, _r7); + + _r0 = _mm_unpacklo_epi64(_tmp0, _tmp2); + _r1 = _mm_unpacklo_epi64(_tmp4, _tmp6); + _r2 = _mm_unpackhi_epi64(_tmp0, _tmp2); + _r3 = _mm_unpackhi_epi64(_tmp4, _tmp6); + _r4 = _mm_unpacklo_epi64(_tmp1, _tmp3); + _r5 = _mm_unpacklo_epi64(_tmp5, _tmp7); + _r6 = _mm_unpackhi_epi64(_tmp1, _tmp3); + _r7 = _mm_unpackhi_epi64(_tmp5, _tmp7); +} + +static NCNN_FORCEINLINE void transpose4x4_epi32(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3) +{ + __m128i _tmp0 = _mm_unpacklo_epi32(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi32(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi32(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi32(_r2, _r3); + + _r0 = _mm_unpacklo_epi64(_tmp0, _tmp2); + _r1 = _mm_unpackhi_epi64(_tmp0, _tmp2); + _r2 = _mm_unpacklo_epi64(_tmp1, _tmp3); + _r3 = _mm_unpackhi_epi64(_tmp1, _tmp3); +} + +static NCNN_FORCEINLINE void transpose8x8_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) +{ + __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); + __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); + __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); + __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); + __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); + + __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); + __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); + __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); + __m128i _tmpb = 
_mm_unpackhi_epi32(_tmp1, _tmp3); + __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); + __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); + __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); + __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); + + _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); + _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); + _r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); + _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); + _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); + _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); + _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); + _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); +} + +static NCNN_FORCEINLINE void transpose8x4_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3) +{ + __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); + __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); + __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); + __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); + + _r0 = _mm_unpacklo_epi32(_tmp0, _tmp2); + _r1 = _mm_unpackhi_epi32(_tmp0, _tmp2); + _r2 = _mm_unpacklo_epi32(_tmp1, _tmp3); + _r3 = _mm_unpackhi_epi32(_tmp1, _tmp3); +} + static NCNN_FORCEINLINE float _mm_reduce_add_ps(__m128 x128) { const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); @@ -195,6 +272,14 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128 { return _mm_sub_ps(_c, _mm_mul_ps(_a, _b)); } +static NCNN_FORCEINLINE __m128 _mm_comp_fmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_sub_ps(_mm_mul_ps(_a, _b), _c); +} +static NCNN_FORCEINLINE __m128 _mm_comp_fnmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_sub_ps(_c, _mm_mul_ps(_mm_mul_ps(_a, _b), _mm_set1_ps(-1))); +} #else static NCNN_FORCEINLINE __m128 _mm_comp_fmadd_ps(const __m128& _a, const __m128& _b, const __m128& _c) { @@ -205,6 +290,14 @@ static NCNN_FORCEINLINE __m128 _mm_comp_fnmadd_ps(const __m128& _a, const __m128 // return -a * b + c return _mm_fnmadd_ps(_a, _b, _c); } +static NCNN_FORCEINLINE __m128 _mm_comp_fmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_fmsub_ps(_a, _b, _c); +} +static NCNN_FORCEINLINE __m128 _mm_comp_fnmsub_ps(const __m128& _a, const __m128& _b, const __m128& _c) +{ + return _mm_fnmsub_ps(_a, _b, _c); +} #endif // !__FMA__ #if __AVX__ @@ -217,9 +310,18 @@ static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m { return _mm256_sub_ps(_c, _mm256_mul_ps(_a, _b)); } +static NCNN_FORCEINLINE __m256 _mm256_comp_fmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + return _mm256_sub_ps(_mm256_mul_ps(_a, _b), _c); +} +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + return _mm256_sub_ps(_c, _mm256_mul_ps(_mm256_mul_ps(_a, _b), _mm256_set1_ps(-1))); +} #else static NCNN_FORCEINLINE __m256 _mm256_comp_fmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) { + // return a * b + c return _mm256_fmadd_ps(_a, _b, _c); } static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m256& _b, const __m256& _c) @@ -227,6 +329,16 @@ static NCNN_FORCEINLINE __m256 _mm256_comp_fnmadd_ps(const __m256& _a, const __m // return -a * b + c return _mm256_fnmadd_ps(_a, _b, _c); } +static NCNN_FORCEINLINE __m256 _mm256_comp_fmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + // return a * b - c + return _mm256_fmsub_ps(_a, _b, _c); +} +static NCNN_FORCEINLINE __m256 _mm256_comp_fnmsub_ps(const __m256& _a, const __m256& _b, const __m256& _c) +{ + // return -(a * b) - c + return 
_mm256_fnmsub_ps(_a, _b, _c); +} #endif static NCNN_FORCEINLINE __m256 _mm256_fmadd_1_ps(const __m256& a, const __m256& b, float c) @@ -340,34 +452,161 @@ static NCNN_FORCEINLINE void transpose8x2_ps(__m256& _r0, __m256& _r1) _r1 = _mm256_permute2f128_ps(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); } -static NCNN_FORCEINLINE void transpose8x8_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7) +static NCNN_FORCEINLINE void transpose2x8_ps(__m256& _r0, __m256& _r1) { - __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); - __m128i _tmp1 = _mm_unpackhi_epi16(_r0, _r1); - __m128i _tmp2 = _mm_unpacklo_epi16(_r2, _r3); - __m128i _tmp3 = _mm_unpackhi_epi16(_r2, _r3); - __m128i _tmp4 = _mm_unpacklo_epi16(_r4, _r5); - __m128i _tmp5 = _mm_unpackhi_epi16(_r4, _r5); - __m128i _tmp6 = _mm_unpacklo_epi16(_r6, _r7); - __m128i _tmp7 = _mm_unpackhi_epi16(_r6, _r7); + __m256 _tmp0 = _mm256_permute2f128_ps(_r0, _r1, _MM_SHUFFLE(0, 2, 0, 0)); + __m256 _tmp1 = _mm256_permute2f128_ps(_r0, _r1, _MM_SHUFFLE(0, 3, 0, 1)); - __m128i _tmp8 = _mm_unpacklo_epi32(_tmp0, _tmp2); - __m128i _tmp9 = _mm_unpackhi_epi32(_tmp0, _tmp2); - __m128i _tmpa = _mm_unpacklo_epi32(_tmp1, _tmp3); - __m128i _tmpb = _mm_unpackhi_epi32(_tmp1, _tmp3); - __m128i _tmpc = _mm_unpacklo_epi32(_tmp4, _tmp6); - __m128i _tmpd = _mm_unpackhi_epi32(_tmp4, _tmp6); - __m128i _tmpe = _mm_unpacklo_epi32(_tmp5, _tmp7); - __m128i _tmpf = _mm_unpackhi_epi32(_tmp5, _tmp7); + _r0 = _mm256_shuffle_ps(_tmp0, _tmp1, _MM_SHUFFLE(2, 0, 2, 0)); + _r1 = _mm256_shuffle_ps(_tmp0, _tmp1, _MM_SHUFFLE(3, 1, 3, 1)); +} - _r0 = _mm_unpacklo_epi64(_tmp8, _tmpc); - _r1 = _mm_unpackhi_epi64(_tmp8, _tmpc); - _r2 = _mm_unpacklo_epi64(_tmp9, _tmpd); - _r3 = _mm_unpackhi_epi64(_tmp9, _tmpd); - _r4 = _mm_unpacklo_epi64(_tmpa, _tmpe); - _r5 = _mm_unpackhi_epi64(_tmpa, _tmpe); - _r6 = _mm_unpacklo_epi64(_tmpb, _tmpf); - _r7 = _mm_unpackhi_epi64(_tmpb, _tmpf); +static NCNN_FORCEINLINE void transpose3x8_ps(__m256& _r0, __m256& _r1, __m256& _r2) +{ + __m256 _tmp0 = _mm256_permute2f128_ps(_r0, _r1, _MM_SHUFFLE(0, 3, 0, 0)); + __m256 _tmp1 = _mm256_permute2f128_ps(_r0, _r2, _MM_SHUFFLE(0, 2, 0, 1)); + __m256 _tmp2 = _mm256_permute2f128_ps(_r1, _r2, _MM_SHUFFLE(0, 3, 0, 0)); + + __m256 _tmp4 = _mm256_shuffle_ps(_tmp0, _tmp1, _MM_SHUFFLE(1, 0, 2, 1)); + __m256 _tmp5 = _mm256_shuffle_ps(_tmp1, _tmp2, _MM_SHUFFLE(2, 1, 3, 2)); + + _r0 = _mm256_shuffle_ps(_tmp0, _tmp5, _MM_SHUFFLE(2, 0, 3, 0)); + _r1 = _mm256_shuffle_ps(_tmp4, _tmp5, _MM_SHUFFLE(3, 1, 2, 0)); + _r2 = _mm256_shuffle_ps(_tmp4, _tmp2, _MM_SHUFFLE(3, 0, 3, 1)); +} + +static NCNN_FORCEINLINE void transpose8x6_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + + __m256 _tmp6 = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmp7 = _mm256_shuffle_ps(_tmp4, _tmp0, _MM_SHUFFLE(3, 2, 1, 0)); + __m256 _tmp8 = _mm256_shuffle_ps(_tmp2, _tmp4, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmp9 = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpa = _mm256_shuffle_ps(_tmp5, _tmp1, _MM_SHUFFLE(3, 2, 1, 0)); + __m256 _tmpb = _mm256_shuffle_ps(_tmp3, _tmp5, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 
0, 0)); + _r1 = _mm256_permute2f128_ps(_tmp8, _tmp9, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmpa, _tmpb, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); + _r4 = _mm256_permute2f128_ps(_tmp8, _tmp9, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2f128_ps(_tmpa, _tmpb, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose8x11_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, __m256& _r8, __m256& _r9, __m256& _ra) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + __m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _r0); + __m256 _tmpb = _mm256_shuffle_ps(_ra, _tmp1, _MM_SHUFFLE(3, 2, 1, 2)); + __m256 _tmpc = _mm256_unpacklo_ps(_r1, _r2); + __m256 _tmpd = _mm256_unpackhi_ps(_r1, _r2); + __m256 _tmpe = _mm256_unpacklo_ps(_r3, _r4); + __m256 _tmpf = _mm256_unpackhi_ps(_r3, _r4); + __m256 _tmpg = _mm256_unpacklo_ps(_r5, _r6); + __m256 _tmph = _mm256_unpackhi_ps(_r5, _r6); + __m256 _tmpi = _mm256_unpacklo_ps(_r7, _r8); + __m256 _tmpj = _mm256_unpackhi_ps(_r7, _r8); + __m256 _tmpk = _mm256_unpacklo_ps(_r9, _ra); + __m256 _tmpl = _mm256_unpackhi_ps(_r9, _ra); + + __m256 _tmpm = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpo = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(3, 0, 1, 0)); + __m256 _tmpp = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpq = _mm256_shuffle_ps(_tmpg, _tmpi, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpr = _mm256_shuffle_ps(_tmpk, _tmp1, _MM_SHUFFLE(1, 0, 3, 2)); + __m256 _tmps = _mm256_shuffle_ps(_tmp3, _tmp5, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpt = _mm256_shuffle_ps(_tmp7, _tmp9, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpu = _mm256_shuffle_ps(_tmpb, _tmpd, _MM_SHUFFLE(3, 2, 2, 0)); + __m256 _tmpv = _mm256_shuffle_ps(_tmpf, _tmph, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpw = _mm256_shuffle_ps(_tmpj, _tmpl, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpm, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpo, _tmpp, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmpq, _tmpr, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmps, _tmpt, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpu, _tmpv, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmpw, _tmpm, _MM_SHUFFLE(0, 3, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpn, _tmpo, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2f128_ps(_tmpp, _tmpq, _MM_SHUFFLE(0, 3, 0, 1)); + _r8 = _mm256_permute2f128_ps(_tmpr, _tmps, _MM_SHUFFLE(0, 3, 0, 1)); + _r9 = _mm256_permute2f128_ps(_tmpt, _tmpu, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmpv, _tmpw, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static void transpose8x18_ps(__m256& _r0, __m256& _r1, __m256& _r2, __m256& _r3, __m256& _r4, __m256& _r5, __m256& _r6, __m256& _r7, __m256& _r8, __m256& _r9, __m256& _ra, __m256& _rb, __m256& _rc, __m256& _rd, __m256& _re, __m256& _rf, __m256& _rg, __m256& _rh) +{ + __m256 _tmp0 = _mm256_unpacklo_ps(_r0, _r1); + 
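+    // the remaining row pairs are interleaved the same way, then regrouped with
+    // _mm256_shuffle_ps and _mm256_permute2f128_ps (as in transpose8x11_ps above)
+    // so that the 18 bicubic offsets/weights of each sample end up contiguous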
__m256 _tmp1 = _mm256_unpackhi_ps(_r0, _r1); + __m256 _tmp2 = _mm256_unpacklo_ps(_r2, _r3); + __m256 _tmp3 = _mm256_unpackhi_ps(_r2, _r3); + __m256 _tmp4 = _mm256_unpacklo_ps(_r4, _r5); + __m256 _tmp5 = _mm256_unpackhi_ps(_r4, _r5); + __m256 _tmp6 = _mm256_unpacklo_ps(_r6, _r7); + __m256 _tmp7 = _mm256_unpackhi_ps(_r6, _r7); + __m256 _tmp8 = _mm256_unpacklo_ps(_r8, _r9); + __m256 _tmp9 = _mm256_unpackhi_ps(_r8, _r9); + __m256 _tmpa = _mm256_unpacklo_ps(_ra, _rb); + __m256 _tmpb = _mm256_unpackhi_ps(_ra, _rb); + __m256 _tmpc = _mm256_unpacklo_ps(_rc, _rd); + __m256 _tmpd = _mm256_unpackhi_ps(_rc, _rd); + __m256 _tmpe = _mm256_unpacklo_ps(_re, _rf); + __m256 _tmpf = _mm256_unpackhi_ps(_re, _rf); + __m256 _tmpg = _mm256_unpacklo_ps(_rg, _rh); + __m256 _tmph = _mm256_unpackhi_ps(_rg, _rh); + + __m256 _tmpi = _mm256_shuffle_ps(_tmp0, _tmp2, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpj = _mm256_shuffle_ps(_tmp4, _tmp6, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpk = _mm256_shuffle_ps(_tmp8, _tmpa, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpl = _mm256_shuffle_ps(_tmpc, _tmpe, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpm = _mm256_shuffle_ps(_tmpg, _tmp0, _MM_SHUFFLE(3, 2, 1, 0)); + __m256 _tmpn = _mm256_shuffle_ps(_tmp2, _tmp4, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpo = _mm256_shuffle_ps(_tmp6, _tmp8, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpp = _mm256_shuffle_ps(_tmpa, _tmpc, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpq = _mm256_shuffle_ps(_tmpe, _tmpg, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpr = _mm256_shuffle_ps(_tmp1, _tmp3, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmps = _mm256_shuffle_ps(_tmp5, _tmp7, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpt = _mm256_shuffle_ps(_tmp9, _tmpb, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpu = _mm256_shuffle_ps(_tmpd, _tmpf, _MM_SHUFFLE(1, 0, 1, 0)); + __m256 _tmpv = _mm256_shuffle_ps(_tmph, _tmp1, _MM_SHUFFLE(3, 2, 1, 0)); + __m256 _tmpw = _mm256_shuffle_ps(_tmp3, _tmp5, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpx = _mm256_shuffle_ps(_tmp7, _tmp9, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpy = _mm256_shuffle_ps(_tmpb, _tmpd, _MM_SHUFFLE(3, 2, 3, 2)); + __m256 _tmpz = _mm256_shuffle_ps(_tmpf, _tmph, _MM_SHUFFLE(3, 2, 3, 2)); + + _r0 = _mm256_permute2f128_ps(_tmpi, _tmpj, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2f128_ps(_tmpk, _tmpl, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2f128_ps(_tmpm, _tmpn, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2f128_ps(_tmpo, _tmpp, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2f128_ps(_tmpq, _tmpr, _MM_SHUFFLE(0, 2, 0, 0)); + _r5 = _mm256_permute2f128_ps(_tmps, _tmpt, _MM_SHUFFLE(0, 2, 0, 0)); + _r6 = _mm256_permute2f128_ps(_tmpu, _tmpv, _MM_SHUFFLE(0, 2, 0, 0)); + _r7 = _mm256_permute2f128_ps(_tmpw, _tmpx, _MM_SHUFFLE(0, 2, 0, 0)); + _r8 = _mm256_permute2f128_ps(_tmpy, _tmpz, _MM_SHUFFLE(0, 2, 0, 0)); + _r9 = _mm256_permute2f128_ps(_tmpi, _tmpj, _MM_SHUFFLE(0, 3, 0, 1)); + _ra = _mm256_permute2f128_ps(_tmpk, _tmpl, _MM_SHUFFLE(0, 3, 0, 1)); + _rb = _mm256_permute2f128_ps(_tmpm, _tmpn, _MM_SHUFFLE(0, 3, 0, 1)); + _rc = _mm256_permute2f128_ps(_tmpo, _tmpp, _MM_SHUFFLE(0, 3, 0, 1)); + _rd = _mm256_permute2f128_ps(_tmpq, _tmpr, _MM_SHUFFLE(0, 3, 0, 1)); + _re = _mm256_permute2f128_ps(_tmps, _tmpt, _MM_SHUFFLE(0, 3, 0, 1)); + _rf = _mm256_permute2f128_ps(_tmpu, _tmpv, _MM_SHUFFLE(0, 3, 0, 1)); + _rg = _mm256_permute2f128_ps(_tmpw, _tmpx, _MM_SHUFFLE(0, 3, 0, 1)); + _rh = _mm256_permute2f128_ps(_tmpy, _tmpz, _MM_SHUFFLE(0, 3, 0, 1)); } static NCNN_FORCEINLINE __m256 HorizontalSums(__m256& v0, __m256& v1, __m256& v2, __m256& v3, __m256& v4, __m256& v5, 
__m256& v6, __m256& v7) @@ -597,6 +836,55 @@ static NCNN_FORCEINLINE __m256i float2bfloat_avx(const __m256& v0, const __m256& return _v; } +#if __AVX2__ +static NCNN_FORCEINLINE void transpose8x2_epi32(__m256i& _r0, __m256i& _r1) +{ + __m256i _tmp0 = _mm256_unpacklo_epi32(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi32(_r0, _r1); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); +} + +static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7) +{ + __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); + __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); + __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); + __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); + __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); + __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); + __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); + __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); + + __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, _tmp2); + __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); + __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); + __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); + __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); + __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); + __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); + __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); + + _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); + _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); + _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); + _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); + _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); + _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); + _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); + _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); + + _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); + _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); + _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); + _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); + _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); + _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); + _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); + _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); +} + #if __AVX512F__ static NCNN_FORCEINLINE void transpose16x16_ps(__m512& _r0, __m512& _r1, __m512& _r2, __m512& _r3, __m512& _r4, __m512& _r5, __m512& _r6, __m512& _r7, __m512& _r8, __m512& _r9, __m512& _ra, __m512& _rb, __m512& _rc, __m512& _rd, __m512& _re, __m512& _rf) @@ -927,45 +1215,6 @@ static NCNN_FORCEINLINE void transpose16x16_epi16(__m256i& _r0, __m256i& _r1, __ _rf = _mm256_permute2x128_si256(_tmp7, _tmpf, _MM_SHUFFLE(0, 3, 0, 1)); } -static NCNN_FORCEINLINE void transpose16x8_epi16(__m256i& _r0, __m256i& _r1, __m256i& _r2, __m256i& _r3, __m256i& _r4, __m256i& _r5, __m256i& _r6, __m256i& _r7) -{ - __m256i _tmp0 = _mm256_unpacklo_epi16(_r0, _r1); - __m256i _tmp1 = _mm256_unpackhi_epi16(_r0, _r1); - __m256i _tmp2 = _mm256_unpacklo_epi16(_r2, _r3); - __m256i _tmp3 = _mm256_unpackhi_epi16(_r2, _r3); - __m256i _tmp4 = _mm256_unpacklo_epi16(_r4, _r5); - __m256i _tmp5 = _mm256_unpackhi_epi16(_r4, _r5); - __m256i _tmp6 = _mm256_unpacklo_epi16(_r6, _r7); - __m256i _tmp7 = _mm256_unpackhi_epi16(_r6, _r7); - - __m256i _tmpg = _mm256_unpacklo_epi32(_tmp0, 
_tmp2); - __m256i _tmph = _mm256_unpackhi_epi32(_tmp0, _tmp2); - __m256i _tmpi = _mm256_unpacklo_epi32(_tmp1, _tmp3); - __m256i _tmpj = _mm256_unpackhi_epi32(_tmp1, _tmp3); - __m256i _tmpk = _mm256_unpacklo_epi32(_tmp4, _tmp6); - __m256i _tmpl = _mm256_unpackhi_epi32(_tmp4, _tmp6); - __m256i _tmpm = _mm256_unpacklo_epi32(_tmp5, _tmp7); - __m256i _tmpn = _mm256_unpackhi_epi32(_tmp5, _tmp7); - - _tmp0 = _mm256_unpacklo_epi64(_tmpg, _tmpk); - _tmp1 = _mm256_unpackhi_epi64(_tmpg, _tmpk); - _tmp2 = _mm256_unpacklo_epi64(_tmph, _tmpl); - _tmp3 = _mm256_unpackhi_epi64(_tmph, _tmpl); - _tmp4 = _mm256_unpacklo_epi64(_tmpi, _tmpm); - _tmp5 = _mm256_unpackhi_epi64(_tmpi, _tmpm); - _tmp6 = _mm256_unpacklo_epi64(_tmpj, _tmpn); - _tmp7 = _mm256_unpackhi_epi64(_tmpj, _tmpn); - - _r0 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 2, 0, 0)); - _r1 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 2, 0, 0)); - _r2 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 2, 0, 0)); - _r3 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 2, 0, 0)); - _r4 = _mm256_permute2x128_si256(_tmp0, _tmp1, _MM_SHUFFLE(0, 3, 0, 1)); - _r5 = _mm256_permute2x128_si256(_tmp2, _tmp3, _MM_SHUFFLE(0, 3, 0, 1)); - _r6 = _mm256_permute2x128_si256(_tmp4, _tmp5, _MM_SHUFFLE(0, 3, 0, 1)); - _r7 = _mm256_permute2x128_si256(_tmp6, _tmp7, _MM_SHUFFLE(0, 3, 0, 1)); -} - static NCNN_FORCEINLINE void transpose8x16_epi16(__m128i& _r0, __m128i& _r1, __m128i& _r2, __m128i& _r3, __m128i& _r4, __m128i& _r5, __m128i& _r6, __m128i& _r7, __m128i& _r8, __m128i& _r9, __m128i& _ra, __m128i& _rb, __m128i& _rc, __m128i& _rd, __m128i& _re, __m128i& _rf) { __m128i _tmp0 = _mm_unpacklo_epi16(_r0, _r1); @@ -1087,6 +1336,7 @@ static NCNN_FORCEINLINE __m512i float2bfloat_avx512(const __m512& v0, const __m5 } #endif // __AVX512F__ +#endif // __AVX2__ #endif // __AVX__ #endif // __SSE2__ diff --git a/tests/test_convolution_3.cpp b/tests/test_convolution_3.cpp index fa358d0670cc..1d0f8f079b62 100644 --- a/tests/test_convolution_3.cpp +++ b/tests/test_convolution_3.cpp @@ -190,6 +190,30 @@ static int test_convolution_int8(int w, int h, int c, int outch, int kernel, int return ret; } + if (kernel == 3 && dilation == 1 && stride == 1) + { + ncnn::Option opt; + opt.num_threads = 1; + opt.use_packing_layout = true; + opt.use_fp16_packed = false; + opt.use_fp16_storage = false; + opt.use_fp16_arithmetic = false; + opt.use_bf16_storage = false; + opt.use_shader_pack8 = false; + opt.use_image_storage = false; + opt.use_sgemm_convolution = false; + opt.use_winograd_convolution = true; + opt.use_winograd23_convolution = true; + opt.use_winograd43_convolution = false; + + ret = test_layer_opt("Convolution", pd, weights, opt, a, requant ? 
1.0f : 0.001f, 0, flag); + if (ret != 0) + { + fprintf(stderr, "test_convolution_int8 failed w=%d h=%d c=%d outch=%d kernel=%d dilation=%d stride=%d pad=%d bias=%d requant=%d act=%d actparams=[%f,%f]\n", w, h, c, outch, kernel, dilation, stride, pad, bias, requant, activation_type, activation_params[0], activation_params[1]); + return ret; + } + } + { ncnn::Option opt; opt.num_threads = 1; @@ -310,6 +334,7 @@ static int test_convolution_1() || test_convolution_int8(4, 20, 16, 24, 3, 1, 1, 1, 0) || test_convolution_int8(6, 7, 64, 64, 3, 1, 2, 0, 1) || test_convolution_int8(25, 33, 16, 15, 3, 1, 1, 1, 0) + || test_convolution_int8(25, 33, 31, 31, 3, 1, 1, 1, 0) || test_convolution_int8(7, 7, 15, 12, 3, 1, 1, 1, 0) || test_convolution_int8(5, 6, 31, 9, 5, 1, 1, 0, 1) || test_convolution_int8(5, 7, 32, 8, 5, 1, 2, 0, 1) diff --git a/tests/test_gridsample.cpp b/tests/test_gridsample.cpp index 70c96b304805..0e3841153521 100644 --- a/tests/test_gridsample.cpp +++ b/tests/test_gridsample.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -15,12 +15,13 @@ #include "layer/gridsample.h" #include "testutil.h" -static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner) +static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample_type, int padding_mode, int align_corner, int permute_fusion) { ncnn::ParamDict pd; pd.set(0, sample_type); pd.set(1, padding_mode); pd.set(2, align_corner); + pd.set(3, permute_fusion); std::vector weights(0); @@ -31,9 +32,9 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample int ret = test_layer("GridSample", pd, weights, as); if (ret != 0) { - fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d", + fprintf(stderr, "test_gridsample failed a.dims=%d a=(%d %d %d %d) grid.dims=%d grid=(%d %d %d %d) sample_type=%d padding_mode=%d align_corner=%d permute_fusion=%d", a.dims, a.w, a.h, a.d, a.c, grid.dims, grid.w, grid.h, grid.d, grid.c, - sample_type, padding_mode, align_corner); + sample_type, padding_mode, align_corner, permute_fusion); } return ret; @@ -42,81 +43,141 @@ static int test_gridsample(const ncnn::Mat& a, const ncnn::Mat& grid, int sample static int test_gridsample_0() { return 0 - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 0) 
- || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 16, 12), 3, 3, 1); + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 1, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 2, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 1, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 1, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 2, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 2, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 3, 0, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(2, 11, 13), 3, 3, 1, 0) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 1, 3, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 2, 3, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 1, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(3, 7, 1), RandomMat(11, 13, 2), 3, 3, 1, 1); } static int test_gridsample_1() { return 0 - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 
2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 2, 3, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 1, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 2, 1) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 0) - || test_gridsample(RandomMat(16, 12, 3), RandomMat(2, 27, 21), 3, 3, 1); + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 1, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 2, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 1, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 1, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 2, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 2, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 3, 0, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(2, 24, 16), 3, 3, 1, 0) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 1, 3, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 2, 3, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 1, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 1, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), 
RandomMat(24, 16, 2), 3, 2, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 2, 1, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 3, 0, 1) + || test_gridsample(RandomMat(8, 12, 16), RandomMat(24, 16, 2), 3, 3, 1, 1); } static int test_gridsample_2() { return 0 - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 27, 21, 10), 2, 3, 1); + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 1, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 1, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 2, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 2, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 3, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 1, 3, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 1, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 1, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 2, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 2, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 3, 0, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(3, 17, 11, 13), 2, 3, 1, 0) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(5, 7, 11, 13), RandomMat(17, 11, 13, 3), 2, 3, 1, 1); } static int test_gridsample_3() { return 0 - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 1, 1) - || test_gridsample(RandomMat(16, 
12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 1, 3, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 1, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 2, 1) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 0) - || test_gridsample(RandomMat(16, 12, 10, 5), RandomMat(3, 16, 12, 10), 2, 3, 1); + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 1, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 1, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 1, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 2, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 2, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 3, 0, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(3, 11, 12, 16), 2, 3, 1, 0) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 1, 3, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 1, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 1, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 2, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 2, 1, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 3, 0, 1) + || test_gridsample(RandomMat(16, 12, 11, 16), RandomMat(11, 12, 16, 3), 2, 3, 1, 1); } int main() diff --git a/tools/modelwriter.h b/tools/modelwriter.h index 3d09ec1859dc..fd5105e612fe 100644 --- a/tools/modelwriter.h +++ b/tools/modelwriter.h @@ -1734,6 +1734,7 @@ int ModelWriter::save(const char* parampath, const char* binpath) fprintf_param_value(" 0=%d", sample_type) fprintf_param_value(" 1=%d", padding_mode) fprintf_param_value(" 2=%d", align_corner) + fprintf_param_value(" 3=%d", permute_fusion) } else if (layer->type == "GroupNorm") { diff --git a/tools/pnnx/CMakeLists.txt b/tools/pnnx/CMakeLists.txt index 3a08cbc249e8..0c8326fc942f 100644 --- a/tools/pnnx/CMakeLists.txt +++ b/tools/pnnx/CMakeLists.txt @@ -70,6 +70,11 @@ if(Torch_VERSION VERSION_LESS "1.8") 
message(FATAL_ERROR "pnnx only supports PyTorch >= 1.8") endif() +if(Torch_VERSION VERSION_GREATER_EQUAL "2.1") + # c++17 is required for using torch 2.1+ headers + set(CMAKE_CXX_STANDARD 17) +endif() + if(TorchVision_FOUND) message(STATUS "Building with TorchVision") add_definitions(-DPNNX_TORCHVISION) diff --git a/tools/pnnx/src/CMakeLists.txt b/tools/pnnx/src/CMakeLists.txt index c15ba0973d67..58264dfd9754 100644 --- a/tools/pnnx/src/CMakeLists.txt +++ b/tools/pnnx/src/CMakeLists.txt @@ -270,6 +270,8 @@ set(pnnx_pass_level2_SRCS pass_level2/torch_unbind.cpp pass_level2/torch_unsqueeze.cpp pass_level2/torch_var.cpp + pass_level2/torch_view_as_complex.cpp + pass_level2/torch_view_as_real.cpp pass_level2/torch_zeros.cpp pass_level2/torch_zeros_like.cpp pass_level2/torch_stft.cpp diff --git a/tools/pnnx/src/main.cpp b/tools/pnnx/src/main.cpp index dc8ca72dc7e8..f745ef03473e 100644 --- a/tools/pnnx/src/main.cpp +++ b/tools/pnnx/src/main.cpp @@ -300,6 +300,11 @@ int main(int argc, char** argv) fprintf(stderr, "\n"); } +#ifdef PNNX_TORCHVISION + // call some vision api to register vision ops :P + (void)vision::cuda_version(); +#endif + for (auto m : customop_modules) { fprintf(stderr, "load custom module %s\n", m.c_str()); diff --git a/tools/pnnx/src/pass_level2/F_scaled_dot_product_attention.cpp b/tools/pnnx/src/pass_level2/F_scaled_dot_product_attention.cpp index e7ca7bbf8243..8dcfafaf12b4 100644 --- a/tools/pnnx/src/pass_level2/F_scaled_dot_product_attention.cpp +++ b/tools/pnnx/src/pass_level2/F_scaled_dot_product_attention.cpp @@ -42,4 +42,31 @@ pnnx.Output output 1 0 out REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention, 10) +class F_scaled_dot_product_attention_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +9 8 +pnnx.Input input_0 0 1 query +pnnx.Input input_1 0 1 key +pnnx.Input input_2 0 1 value +pnnx.Input input_3 0 1 attn_mask +prim::Constant op_0 0 1 dropout_p value=%dropout_p +prim::Constant op_1 0 1 is_causal value=%is_causal +prim::Constant op_2 0 1 scale value=%scale +aten::scaled_dot_product_attention op_3 7 1 query key value attn_mask dropout_p is_causal scale out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "F.scaled_dot_product_attention"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(F_scaled_dot_product_attention_1, 10) + } // namespace pnnx diff --git a/tools/pnnx/src/pass_level2/torch_view_as_complex.cpp b/tools/pnnx/src/pass_level2/torch_view_as_complex.cpp new file mode 100644 index 000000000000..e00ff1371ca4 --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_view_as_complex.cpp @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
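+// pass_level2 rewriter: matches a bare aten::view_as_complex node in the traced graph
+// and re-emits it as the pnnx operator torch.view_as_complex.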
+ +#include "pass_level2.h" + +namespace pnnx { + +class torch_view_as_complex : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +aten::view_as_complex op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.view_as_complex"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_view_as_complex, 20) + +} // namespace pnnx \ No newline at end of file diff --git a/tools/pnnx/src/pass_level2/torch_view_as_real.cpp b/tools/pnnx/src/pass_level2/torch_view_as_real.cpp new file mode 100644 index 000000000000..83327e01ef96 --- /dev/null +++ b/tools/pnnx/src/pass_level2/torch_view_as_real.cpp @@ -0,0 +1,40 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#include "pass_level2.h" + +namespace pnnx { + +class torch_view_as_real : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +3 2 +pnnx.Input input 0 1 input +aten::view_as_real op_0 1 1 input out +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "torch.view_as_real"; + } +}; + +REGISTER_GLOBAL_PNNX_GRAPH_REWRITER_PASS(torch_view_as_real, 20) + +} // namespace pnnx \ No newline at end of file diff --git a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp index 41dfc65ee39c..b71e64dcbde0 100644 --- a/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp +++ b/tools/pnnx/src/pass_ncnn/F_grid_sample.cpp @@ -1,6 +1,6 @@ // Tencent is pleased to support the open source community by making ncnn available. // -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. // // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except // in compliance with the License. You may obtain a copy of the License at @@ -15,7 +15,6 @@ #include "pass_ncnn.h" namespace pnnx { - namespace ncnn { class F_grid_sample : public GraphRewriterPass @@ -61,11 +60,113 @@ pnnx.Output output 1 0 out op->params["1"] = 3; op->params["2"] = captured_params.at("align_corners").b ? 
1 : 0; + op->params["3"] = 0; } }; REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample, 20) +class F_grid_sample_1 : public GraphRewriterPass +{ +public: + const char* match_pattern_graph() const + { + return R"PNNXIR(7767517 +5 4 +pnnx.Input input_a 0 1 a +pnnx.Input input_b 0 1 b +torch.permute op_0 1 1 b b1 dims=%dims +F.grid_sample op_1 2 1 a b1 out mode=%mode padding_mode=%padding_mode align_corners=%align_corners +pnnx.Output output 1 0 out +)PNNXIR"; + } + + const char* type_str() const + { + return "GridSample"; + } + + const char* name_str() const + { + return "permutegridsample"; + } + + bool match(const std::map& captured_params) const + { + const std::vector& dims = captured_params.at("dims").ai; + + if ((dims == std::vector{1, 2, 0}) || (dims == std::vector{1, 2, 3, 0})) + return true; + if ((dims == std::vector{0, 2, 3, 1}) || (dims == std::vector{0, 2, 3, 4, 1})) + return true; + return false; + } + + void write(Operator* op, const std::map& captured_params) const + { + const std::string& mode = captured_params.at("mode").s; + if (mode == "bilinear") + op->params["0"] = 1; + if (mode == "nearest") + op->params["0"] = 2; + if (mode == "bicubic") + op->params["0"] = 3; + + const std::string& padding_mode = captured_params.at("padding_mode").s; + if (padding_mode == "zeros") + op->params["1"] = 1; + if (padding_mode == "border") + op->params["1"] = 2; + if (padding_mode == "reflection") + op->params["1"] = 3; + + op->params["2"] = captured_params.at("align_corners").b ? 1 : 0; + + const int batch_index = op->inputs[1]->params["__batch_index"].i; + + const std::vector& dims = captured_params.at("dims").ai; + + int input_rank = (int)op->inputs[0]->shape.size(); + + if (input_rank == 0) + { + // assume input is fine + input_rank = (int)dims.size(); + } + + if (batch_index >= 0 && batch_index < input_rank) + input_rank -= 1; + + if (input_rank > 4) + { + fprintf(stderr, "permute %d-rank tensor is not supported yet!\n", input_rank); + return; + } + + // drop permute batch index + std::vector new_dims; + for (int i = 0; i < (int)dims.size(); i++) + { + if (dims[i] == batch_index) + continue; + + int new_dim = dims[i] > batch_index ? 
dims[i] - 1 : dims[i]; + new_dims.push_back(new_dim); + } + + if (input_rank != (int)new_dims.size()) + { + fprintf(stderr, "permute %d-rank tensor with %d-rank dims is not possible\n", input_rank, (int)new_dims.size()); + return; + } + + if ((input_rank == 3 && new_dims == std::vector{1, 2, 0}) || (input_rank == 4 && new_dims == std::vector{1, 2, 3, 0})) + op->params["3"] = 1; + } +}; + +REGISTER_GLOBAL_PNNX_NCNN_GRAPH_REWRITER_PASS(F_grid_sample_1, 19) + } // namespace ncnn } // namespace pnnx diff --git a/tools/pnnx/tests/CMakeLists.txt b/tools/pnnx/tests/CMakeLists.txt index fab12342b58f..346ee0a955fc 100644 --- a/tools/pnnx/tests/CMakeLists.txt +++ b/tools/pnnx/tests/CMakeLists.txt @@ -237,6 +237,8 @@ pnnx_add_test(torch_topk) pnnx_add_test(torch_transpose) pnnx_add_test(torch_unbind) pnnx_add_test(torch_unsqueeze) +pnnx_add_test(torch_view_as_complex) +pnnx_add_test(torch_view_as_real) pnnx_add_test(torch_zeros) pnnx_add_test(torch_zeros_like) diff --git a/tools/pnnx/tests/ncnn/test_F_grid_sample.py b/tools/pnnx/tests/ncnn/test_F_grid_sample.py index c84d38232b1e..95ca812eb512 100644 --- a/tools/pnnx/tests/ncnn/test_F_grid_sample.py +++ b/tools/pnnx/tests/ncnn/test_F_grid_sample.py @@ -22,46 +22,56 @@ class Model(nn.Module): def __init__(self): super(Model, self).__init__() - def forward(self, x, xg1, xg2, y, yg1, yg2): + def forward(self, x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2): # norm to -1 ~ 1 xg1 = xg1 * 2 - 1 xg2 = xg2 * 2 - 1 yg1 = yg1 * 2 - 1 yg2 = yg2 * 2 - 1 - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='border', align_corners=False) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg1, mode='bilinear', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg2, mode='nearest', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg1, mode='nearest', padding_mode='reflection', align_corners=True) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) - x = F.grid_sample(x, xg1, mode='bicubic', padding_mode='border', align_corners=True) - x = F.grid_sample(x, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) - - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=False) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=False) - y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=False) 
- y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=False) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) - y = F.grid_sample(y, yg2, mode='bilinear', padding_mode='border', align_corners=True) - y = F.grid_sample(y, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='zeros', align_corners=True) - y = F.grid_sample(y, yg1, mode='nearest', padding_mode='border', align_corners=True) - y = F.grid_sample(y, yg2, mode='nearest', padding_mode='reflection', align_corners=True) - - return x, y + x0 = F.grid_sample(x, xg1, mode='bilinear', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='zeros', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='border', align_corners=False) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='reflection', align_corners=False) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='bilinear', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bilinear', padding_mode='reflection', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='nearest', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='nearest', padding_mode='reflection', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='zeros', align_corners=True) + x0 = F.grid_sample(x0, xg1, mode='bicubic', padding_mode='border', align_corners=True) + x0 = F.grid_sample(x0, xg2, mode='bicubic', padding_mode='reflection', align_corners=True) + + y0 = F.grid_sample(y, yg1, mode='bilinear', padding_mode='zeros', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='bilinear', padding_mode='border', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='reflection', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='zeros', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=False) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=False) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='zeros', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='bilinear', padding_mode='border', align_corners=True) + y0 = F.grid_sample(y0, yg1, mode='bilinear', padding_mode='reflection', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='zeros', align_corners=True) + y0 = F.grid_sample(y0, yg1, mode='nearest', padding_mode='border', align_corners=True) + y0 = F.grid_sample(y0, yg2, mode='nearest', padding_mode='reflection', align_corners=True) + + xgp1 = xgp1.permute(0, 2, 3, 1) + xgp2 = xgp2.permute(0, 2, 3, 1) + ygp1 = ygp1.permute(0, 2, 3, 4, 1) + ygp2 = ygp2.permute(0, 2, 3, 4, 1) + + x1 = F.grid_sample(x, xgp1, 
mode='bilinear', padding_mode='zeros', align_corners=False) + x1 = F.grid_sample(x1, xgp2, mode='bilinear', padding_mode='border', align_corners=False) + + y1 = F.grid_sample(y, ygp1, mode='bilinear', padding_mode='zeros', align_corners=False) + y1 = F.grid_sample(y1, ygp2, mode='bilinear', padding_mode='border', align_corners=False) + return x0, y0, x1, y1 def test(): net = Model() @@ -71,25 +81,29 @@ def test(): x = torch.rand(1, 3, 12, 16) xg1 = torch.rand(1, 21, 27, 2) xg2 = torch.rand(1, 12, 16, 2) + xgp1 = torch.rand(1, 2, 21, 27) + xgp2 = torch.rand(1, 2, 12, 16) y = torch.rand(1, 5, 10, 12, 16) yg1 = torch.rand(1, 10, 21, 27, 3) yg2 = torch.rand(1, 10, 12, 16, 3) + ygp1 = torch.rand(1, 3, 10, 21, 27) + ygp2 = torch.rand(1, 3, 10, 12, 16) - a0, a1 = net(x, xg1, xg2, y, yg1, yg2) + a0, a1, a2, a3 = net(x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2) # export torchscript - mod = torch.jit.trace(net, (x, xg1, xg2, y, yg1, yg2)) + mod = torch.jit.trace(net, (x, xg1, xg2, xgp1, xgp2, y, yg1, yg2, ygp1, ygp2)) mod.save("test_F_grid_sample.pt") # torchscript to pnnx import os - os.system("../../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3]") + os.system("../../src/pnnx test_F_grid_sample.pt inputshape=[1,3,12,16],[1,21,27,2],[1,12,16,2],[1,2,21,27],[1,2,12,16],[1,5,10,12,16],[1,10,21,27,3],[1,10,12,16,3],[1,3,10,21,27],[1,3,10,12,16]") # ncnn inference import test_F_grid_sample_ncnn - b0, b1 = test_F_grid_sample_ncnn.test_inference() + b0, b1, b2, b3 = test_F_grid_sample_ncnn.test_inference() - return torch.allclose(a0, b0, 1e-4, 1e-4) and torch.allclose(a1, b1, 1e-4, 1e-4) + return torch.allclose(a0, b0, 1e-6, 1e-6) and torch.allclose(a1, b1, 1e-6, 1e-6) and torch.allclose(a2, b2, 1e-6, 1e-6) and torch.allclose(a3, b3, 1e-6, 1e-6) if __name__ == "__main__": if test(): diff --git a/tools/pnnx/tests/test_torch_view_as_complex.py b/tools/pnnx/tests/test_torch_view_as_complex.py new file mode 100644 index 000000000000..c2cedc537d05 --- /dev/null +++ b/tools/pnnx/tests/test_torch_view_as_complex.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. 
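+# Round-trips rank-3/4/5 real tensors whose trailing dimension is 2 through torchscript
+# export and pnnx conversion, then checks the pnnx-generated python against
+# torch.view_as_complex bit-exactly with torch.equal.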
+ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.view_as_complex(x) + y = torch.view_as_complex(y) + z = torch.view_as_complex(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 2) + y = torch.rand(1, 5, 9, 2) + z = torch.rand(14, 8, 5, 9, 2) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_view_as_complex.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_view_as_complex.pt inputshape=[1,3,2],[1,5,9,2],[14,8,5,9,2]") + + # pnnx inference + import test_torch_view_as_complex_pnnx + b = test_torch_view_as_complex_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) \ No newline at end of file diff --git a/tools/pnnx/tests/test_torch_view_as_real.py b/tools/pnnx/tests/test_torch_view_as_real.py new file mode 100644 index 000000000000..06bbe7de9b10 --- /dev/null +++ b/tools/pnnx/tests/test_torch_view_as_real.py @@ -0,0 +1,61 @@ +# Tencent is pleased to support the open source community by making ncnn available. +# +# Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +# +# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +class Model(nn.Module): + def __init__(self): + super(Model, self).__init__() + + def forward(self, x, y, z): + x = torch.view_as_real(x) + y = torch.view_as_real(y) + z = torch.view_as_real(z) + return x, y, z + +def test(): + net = Model() + net.eval() + + torch.manual_seed(0) + x = torch.rand(1, 3, 16,dtype=torch.complex64) + y = torch.rand(1, 5, 9, 11,dtype=torch.complex64) + z = torch.rand(14, 8, 5, 9, 10,dtype=torch.complex64) + + a = net(x, y, z) + + # export torchscript + mod = torch.jit.trace(net, (x, y, z)) + mod.save("test_torch_view_as_real.pt") + + # torchscript to pnnx + import os + os.system("../src/pnnx test_torch_view_as_real.pt inputshape=[1,3,16]c64,[1,5,9,11]c64,[14,8,5,9,10]c64") + + # pnnx inference + import test_torch_view_as_real_pnnx + b = test_torch_view_as_real_pnnx.test_inference() + + for a0, b0 in zip(a, b): + if not torch.equal(a0, b0): + return False + return True + +if __name__ == "__main__": + if test(): + exit(0) + else: + exit(1) \ No newline at end of file
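The permute_fusion path wired up above (GridSample param 3 and the F_grid_sample_1 rewriter) targets models that keep the sampling grid channel-first and permute it just before F.grid_sample. Below is a minimal sketch of that PyTorch pattern, for orientation only; the module and tensor names are illustrative and not part of this patch.

import torch
import torch.nn.functional as F

class PermuteGridSample(torch.nn.Module):
    def forward(self, x, grid_chw):
        # grid arrives as (N, 2, H, W); F.grid_sample wants the coordinates last
        grid = grid_chw.permute(0, 2, 3, 1)  # -> (N, H, W, 2)
        return F.grid_sample(x, grid, mode='bilinear', padding_mode='zeros', align_corners=False)

x = torch.rand(1, 3, 12, 16)
grid_chw = torch.rand(1, 2, 21, 27) * 2 - 1  # coordinates normalized to [-1, 1]
out = PermuteGridSample()(x, grid_chw)
print(out.shape)  # torch.Size([1, 3, 21, 27])

When pnnx sees this graph, the rewriter drops the batch axis from the permute dims and, if the remaining order is {1, 2, 0} (or {1, 2, 3, 0} for volumetric input), sets GridSample param 3=1 so the ncnn layer consumes the grid in its channel-first layout instead of emitting a separate Permute.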