pnnx convert torch round trunc (#4813)

* update riscv qemu * c906 test on qemu * fix qemu aarch64
Tencent · Jun 25, 2023 · 1283a19 · 1283a19
1 parent 3a74ae4
commit 1283a19
Show file tree

Hide file tree

Showing 26 changed files with 720 additions and 48 deletions.
diff --git a/.ci/test-coverage.yml b/.ci/test-coverage.yml
@@ -631,14 +631,14 @@ jobs:
       uses: cache@1.*
       with:
         cachePaths: qemu-install
-        cacheKey: qemu-riscv64-install-20220831
+        cacheKey: qemu-riscv64-install-20230624
 
     - name: checkout-qemu
       if: steps.cache-qemu.outputs.cacheHit != 'true'
       checkout: https://github.com/qemu/qemu.git
       with:
         pullType: COMMIT_ID
-        refName: 621da7789083b80d6f1ff1c0fb499334007b4f51
+        refName: b455ce4c2f300c8ba47cba7232dd03261368a4cb
         localPath: qemu
         enableSubmodule: false
         enableGitLfs: false
@@ -650,6 +650,10 @@ jobs:
         echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list
         apt-get update
         apt-get build-dep -y qemu
+        apt-get install -y python3-pip
+        python3 -m pip install --upgrade pip
+        apt-get remove -y python3-setuptools
+        pip3 install -U setuptools
         cd qemu
         wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
         patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
@@ -712,14 +716,14 @@ jobs:
       uses: cache@1.*
       with:
         cachePaths: qemu-install
-        cacheKey: qemu-riscv64-install-20220831
+        cacheKey: qemu-riscv64-install-20230624
 
     - name: checkout-qemu
       if: steps.cache-qemu.outputs.cacheHit != 'true'
       checkout: https://github.com/qemu/qemu.git
       with:
         pullType: COMMIT_ID
-        refName: 621da7789083b80d6f1ff1c0fb499334007b4f51
+        refName: b455ce4c2f300c8ba47cba7232dd03261368a4cb
         localPath: qemu
         enableSubmodule: false
         enableGitLfs: false
@@ -731,6 +735,10 @@ jobs:
         echo 'deb-src http://mirrors.cloud.tencent.com/debian bullseye-updates main' | tee -a /etc/apt/sources.list
         apt-get update
         apt-get build-dep -y qemu
+        apt-get install -y python3-pip
+        python3 -m pip install --upgrade pip
+        apt-get remove -y python3-setuptools
+        pip3 install -U setuptools
         cd qemu
         wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
         patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch
@@ -789,7 +797,7 @@ jobs:
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
     - name: lcov-collect-vlen128
       run: |
         cd build
@@ -804,7 +812,7 @@ jobs:
       run: |
         export PATH=${{ci.workspace}}/qemu-install/bin:$PATH
         cd build
-        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,x-zvfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;${{ci.workspace}}/rv64gcv-install/sysroot" ctest --output-on-failure -j $(nproc)
     - name: lcov-collect-vlen256
       run: |
         cd build

diff --git a/.github/workflows/linux-aarch64-cpu-gcc.yml b/.github/workflows/linux-aarch64-cpu-gcc.yml
@@ -39,7 +39,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: qemu-install
-        key: qemu-aarch64-install-20220502-2
+        key: qemu-aarch64-install-20220502-ubuntu-2004-2
     - name: install-qemu-build-deps
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |
@@ -97,7 +97,7 @@ jobs:
       uses: actions/cache@v3
       with:
         path: qemu-install
-        key: qemu-aarch64-install-20220502-2
+        key: qemu-aarch64-install-20220502-ubuntu-2004-2
     - name: install-qemu-build-deps
       if: steps.cache-qemu.outputs.cache-hit != 'true'
       run: |

diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml
@@ -89,10 +89,16 @@ jobs:
       run: |
         export RISCV_ROOT_PATH=/data/action/osd/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1
         mkdir build && cd build
-        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON ..
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/c906-v226.toolchain.cmake -DCMAKE_BUILD_TYPE=release -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF -DNCNN_RUNTIME_CPU=OFF -DNCNN_RVV=ON -DNCNN_SIMPLEOCV=ON -DNCNN_BUILD_EXAMPLES=ON -DNCNN_BUILD_TESTS=ON ..
     - name: build
       run: cmake --build build -j 4
 
+    - name: test
+      run: |
+        export PATH=/data/action/osd/xuantie-qemu-x86_64-Ubuntu-18.04-20230413-0706/bin:$PATH
+        cd build
+        TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;c906fdv" ctest --output-on-failure -j 4
+
   linux-gcc-riscv64-rvv:
     runs-on: [self-hosted, linux, centos]
     steps:

diff --git a/src/layer/arm/unaryop_arm.cpp b/src/layer/arm/unaryop_arm.cpp
@@ -14,6 +14,8 @@
 
 #include "unaryop_arm.h"
 
+#include <fenv.h>
+#include <float.h>
 #include <math.h>
 
 #if __ARM_NEON
@@ -128,7 +130,7 @@ struct unary_op_floor
     float32x4_t func_pack4(const float32x4_t& x) const
     {
 #if __aarch64__
-        return vcvtq_f32_s32(vcvtmq_s32_f32(x));
+        return vrndmq_f32(x);
 #else  // __aarch64__
         int32x4_t _xi = vcvtq_s32_f32(x);
         uint32x4_t _mask = vcgtq_f32(vcvtq_f32_s32(_xi), x);
@@ -148,7 +150,7 @@ struct unary_op_ceil
     float32x4_t func_pack4(const float32x4_t& x) const
     {
 #if __aarch64__
-        return vcvtq_f32_s32(vcvtpq_s32_f32(x));
+        return vrndpq_f32(x);
 #else  // __aarch64__
         int32x4_t _xi = vcvtq_s32_f32(x);
         uint32x4_t _mask = vcgtq_f32(x, vcvtq_f32_s32(_xi));
@@ -374,6 +376,94 @@ struct unary_op_log10
 #endif // __ARM_NEON
 };
 
+struct unary_op_round
+{
+    float func(const float& x) const
+    {
+        // round to nearest even
+#if NCNN_GNU_INLINE_ASM && __ARM_NEON
+        // return (x + 12582912.f) - 12582912.f;
+        float y;
+        const float magic = 12582912.f;
+#if __aarch64__
+        asm volatile(
+            "fadd   %s0, %s1, %s2   \n"
+            "fsub   %s0, %s0, %s2   \n"
+            : "=w"(y)
+            : "w"(x), "w"(magic)
+            :);
+#else
+        asm volatile(
+            "vadd.f32   %0, %1, %2  \n"
+            "vsub.f32   %0, %0, %2  \n"
+            : "=t"(y)
+            : "t"(x), "t"(magic)
+            :);
+#endif
+        return y;
+#else
+        int old_rm = fegetround();
+        fesetround(FE_TONEAREST);
+        float y = nearbyintf(x);
+        fesetround(old_rm);
+        return y;
+#endif
+    }
+#if __ARM_NEON
+#if __aarch64__
+    float32x4_t func_pack4(const float32x4_t& x) const
+    {
+        return vrndnq_f32(x);
+    }
+#else
+    float32x4_t func_pack4(const float32x4_t& x) const
+    {
+#if NCNN_GNU_INLINE_ASM
+        float32x4_t y;
+        float32x4_t _magic = vdupq_n_f32(12582912.f); // 1.5 * 2^23
+        asm volatile(
+            "vadd.f32   %q0, %q1, %q2   \n"
+            "vsub.f32   %q0, %q0, %q2   \n"
+            : "=w"(y)
+            : "w"(x), "w"(_magic)
+            :);
+        return y;
+#else
+        float tmp[4];
+        vst1q_f32(tmp, x);
+        int old_rm = fegetround();
+        fesetround(FE_TONEAREST);
+        tmp[0] = nearbyintf(tmp[0]);
+        tmp[1] = nearbyintf(tmp[1]);
+        tmp[2] = nearbyintf(tmp[2]);
+        tmp[3] = nearbyintf(tmp[3]);
+        fesetround(old_rm);
+        float32x4_t y = vld1q_f32(tmp);
+        return y;
+#endif
+    }
+#endif
+#endif // __ARM_NEON
+};
+
+struct unary_op_trunc
+{
+    float func(const float& x) const
+    {
+        return (float)truncf(x);
+    }
+#if __ARM_NEON
+    float32x4_t func_pack4(const float32x4_t& x) const
+    {
+#if __aarch64__
+        return vrndq_f32(x);
+#else
+        return vcvtq_f32_s32(vcvtq_s32_f32(x));
+#endif
+    }
+#endif // __ARM_NEON
+};
+
 } // namespace UnaryOp_arm_functor
 
 int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
@@ -446,6 +536,12 @@ int UnaryOp_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
     if (op_type == Operation_LOG10)
         return unary_op_inplace<unary_op_log10>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ROUND)
+        return unary_op_inplace<unary_op_round>(bottom_top_blob, opt);
+
+    if (op_type == Operation_TRUNC)
+        return unary_op_inplace<unary_op_trunc>(bottom_top_blob, opt);
+
     return 0;
 }
 
@@ -576,6 +672,12 @@ int UnaryOp_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt)
     if (op_type == Operation_LOG10)
         return unary_op_inplace_bf16s<unary_op_log10>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ROUND)
+        return unary_op_inplace_bf16s<unary_op_round>(bottom_top_blob, opt);
+
+    if (op_type == Operation_TRUNC)
+        return unary_op_inplace_bf16s<unary_op_trunc>(bottom_top_blob, opt);
+
     return 0;
 }
 #endif // NCNN_BF16

diff --git a/src/layer/arm/unaryop_arm_asimdhp.cpp b/src/layer/arm/unaryop_arm_asimdhp.cpp
@@ -14,6 +14,8 @@
 
 #include "unaryop_arm.h"
 
+#include <fenv.h>
+#include <float.h>
 #include <math.h>
 
 #if __ARM_NEON
@@ -452,6 +454,56 @@ struct unary_op_log10_fp16s
     }
 };
 
+struct unary_op_round_fp16s
+{
+    __fp16 func(const __fp16& x) const
+    {
+        // round to nearest even
+#if NCNN_GNU_INLINE_ASM
+        // return (x + 1536.f) - 1536.f;
+        __fp16 y;
+        const __fp16 magic = 1536.f;
+        asm volatile(
+            "fadd   %h0, %h1, %h2  \n"
+            "fsub   %h0, %h0, %h2  \n"
+            : "=w"(y)
+            : "w"(x), "w"(magic)
+            :);
+        return y;
+#else
+        int old_rm = fegetround();
+        fesetround(FE_TONEAREST);
+        __fp16 y = (__fp16)nearbyintf(x);
+        fesetround(old_rm);
+        return y;
+#endif
+    }
+    float16x4_t func_pack4(const float16x4_t& x) const
+    {
+        return vrndn_f16(x);
+    }
+    float16x8_t func_pack8(const float16x8_t& x) const
+    {
+        return vrndnq_f16(x);
+    }
+};
+
+struct unary_op_trunc_fp16s
+{
+    __fp16 func(const __fp16& x) const
+    {
+        return (__fp16)truncf(x);
+    }
+    float16x4_t func_pack4(const float16x4_t& x) const
+    {
+        return vrnd_f16(x);
+    }
+    float16x8_t func_pack8(const float16x8_t& x) const
+    {
+        return vrndq_f16(x);
+    }
+};
+
 } // namespace UnaryOp_arm_functor
 
 int UnaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const
@@ -512,6 +564,12 @@ int UnaryOp_arm::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt)
     if (op_type == Operation_LOG10)
         return unary_op_inplace_fp16s<unary_op_log10_fp16s>(bottom_top_blob, opt);
 
+    if (op_type == Operation_ROUND)
+        return unary_op_inplace_fp16s<unary_op_round_fp16s>(bottom_top_blob, opt);
+
+    if (op_type == Operation_TRUNC)
+        return unary_op_inplace_fp16s<unary_op_trunc_fp16s>(bottom_top_blob, opt);
+
     return 0;
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC