Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
nihui committed Nov 16, 2023
1 parent 44844ae commit d9832b4
Show file tree
Hide file tree
Showing 7 changed files with 285 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,7 @@ endif()
if(NCNN_TARGET_ARCH STREQUAL "arm" AND CMAKE_SIZEOF_VOID_P EQUAL 8)
if(CMAKE_CXX_COMPILER_ID MATCHES "MSVC" OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND CMAKE_CXX_SIMULATE_ID MATCHES "MSVC" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT MATCHES "MSVC"))
# always enable neon and vfpv4 for msvc arm64
target_compile_options(ncnn PRIVATE /arch:armv8.0 /D__ARM_NEON /D__ARM_FP=0x0E)
target_compile_options(ncnn PRIVATE /D__ARM_NEON /D__ARM_FP=0x0E)
endif()

if(NOT NCNN_RUNTIME_CPU AND NCNN_ARM86SVE)
Expand Down
50 changes: 50 additions & 0 deletions src/layer/arm/hardsigmoid_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
float32x4_t _beta = vdupq_n_f32(beta);
for (; i + 15 < size; i += 16)
{
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
asm volatile(
"prfm pldl1keep, [%0, #512] \n"
Expand Down Expand Up @@ -133,6 +134,29 @@ int HardSigmoid_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) co
"w"(_beta) // %5
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else // NCNN_GNU_INLINE_ASM
float32x4_t _p0 = vld1q_f32(ptr);
float32x4_t _p1 = vld1q_f32(ptr + 4);
float32x4_t _p2 = vld1q_f32(ptr + 8);
float32x4_t _p3 = vld1q_f32(ptr + 12);
_p0 = vmlaq_f32(_beta, _p0, _alpha);
_p1 = vmlaq_f32(_beta, _p1, _alpha);
_p2 = vmlaq_f32(_beta, _p2, _alpha);
_p3 = vmlaq_f32(_beta, _p3, _alpha);
_p0 = vmaxq_f32(_p0, _zero);
_p1 = vmaxq_f32(_p1, _zero);
_p2 = vmaxq_f32(_p2, _zero);
_p3 = vmaxq_f32(_p3, _zero);
_p0 = vminq_f32(_p0, _one);
_p1 = vminq_f32(_p1, _one);
_p2 = vminq_f32(_p2, _one);
_p3 = vminq_f32(_p3, _one);
vst1q_f32(ptr, _p0);
vst1q_f32(ptr + 4, _p1);
vst1q_f32(ptr + 8, _p2);
vst1q_f32(ptr + 12, _p3);
ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 7 < size; i += 8)
{
Expand Down Expand Up @@ -197,6 +221,7 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o
float32x4_t _beta = vdupq_n_f32(beta);
for (; i + 15 < size; i += 16)
{
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
asm volatile(
"prfm pldl1keep, [%0, #256] \n"
Expand Down Expand Up @@ -270,6 +295,31 @@ int HardSigmoid_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& o
"w"(_beta) // %5
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else // NCNN_GNU_INLINE_ASM
uint16x8_t _p = vld1q_u16(ptr);
uint16x8_t _q = vld1q_u16(ptr + 8);
float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
_p0 = vmlaq_f32(_beta, _p0, _alpha);
_p1 = vmlaq_f32(_beta, _p1, _alpha);
_p2 = vmlaq_f32(_beta, _p2, _alpha);
_p3 = vmlaq_f32(_beta, _p3, _alpha);
_p0 = vmaxq_f32(_p0, _zero);
_p1 = vmaxq_f32(_p1, _zero);
_p2 = vmaxq_f32(_p2, _zero);
_p3 = vmaxq_f32(_p3, _zero);
_p0 = vminq_f32(_p0, _one);
_p1 = vminq_f32(_p1, _one);
_p2 = vminq_f32(_p2, _one);
_p3 = vminq_f32(_p3, _one);
_p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
_q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
vst1q_u16(ptr, _p);
vst1q_u16(ptr + 8, _q);
ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 7 < size; i += 8)
{
Expand Down
24 changes: 24 additions & 0 deletions src/layer/arm/hardsigmoid_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
int i = 0;
for (; i + 31 < size; i += 32)
{
#if NCNN_GNU_INLINE_ASM
asm volatile(
"prfm pldl1keep, [%0, #512] \n"
"ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
Expand Down Expand Up @@ -135,6 +136,29 @@ int HardSigmoid_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option&
"w"(_alpha), // %4
"w"(_beta) // %5
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else // NCNN_GNU_INLINE_ASM
float16x8_t _p0 = vld1q_f16(ptr);
float16x8_t _p1 = vld1q_f16(ptr + 8);
float16x8_t _p2 = vld1q_f16(ptr + 16);
float16x8_t _p3 = vld1q_f16(ptr + 24);
_p0 = vfmaq_f16(_beta, _p0, _alpha);
_p1 = vfmaq_f16(_beta, _p1, _alpha);
_p2 = vfmaq_f16(_beta, _p2, _alpha);
_p3 = vfmaq_f16(_beta, _p3, _alpha);
_p0 = vmaxq_f16(_p0, _zero);
_p1 = vmaxq_f16(_p1, _zero);
_p2 = vmaxq_f16(_p2, _zero);
_p3 = vmaxq_f16(_p3, _zero);
_p0 = vminq_f16(_p0, _one);
_p1 = vminq_f16(_p1, _one);
_p2 = vminq_f16(_p2, _one);
_p3 = vminq_f16(_p3, _one);
vst1q_f16(ptr, _p0);
vst1q_f16(ptr + 8, _p1);
vst1q_f16(ptr + 16, _p2);
vst1q_f16(ptr + 24, _p3);
ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 15 < size; i += 16)
{
Expand Down
58 changes: 58 additions & 0 deletions src/layer/arm/hardswish_arm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
float32x4_t _beta = vdupq_n_f32(beta);
for (; i + 15 < size; i += 16)
{
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
asm volatile(
"prfm pldl1keep, [%0, #512] \n"
Expand Down Expand Up @@ -141,6 +142,33 @@ int HardSwish_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons
"w"(_beta) // %5
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else // NCNN_GNU_INLINE_ASM
float32x4_t _p0 = vld1q_f32(ptr);
float32x4_t _p1 = vld1q_f32(ptr + 4);
float32x4_t _p2 = vld1q_f32(ptr + 8);
float32x4_t _p3 = vld1q_f32(ptr + 12);
float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
float32x4_t _ans2 = vmlaq_f32(_beta, _p2, _alpha);
float32x4_t _ans3 = vmlaq_f32(_beta, _p3, _alpha);
_ans0 = vmaxq_f32(_ans0, _zero);
_ans1 = vmaxq_f32(_ans1, _zero);
_ans2 = vmaxq_f32(_ans2, _zero);
_ans3 = vmaxq_f32(_ans3, _zero);
_ans0 = vminq_f32(_ans0, _one);
_ans1 = vminq_f32(_ans1, _one);
_ans2 = vminq_f32(_ans2, _one);
_ans3 = vminq_f32(_ans3, _one);
_p0 = vmulq_f32(_ans0, _p0);
_p1 = vmulq_f32(_ans1, _p1);
_p2 = vmulq_f32(_ans2, _p2);
_p3 = vmulq_f32(_ans3, _p3);
vst1q_f32(ptr, _p0);
vst1q_f32(ptr + 4, _p1);
vst1q_f32(ptr + 8, _p2);
vst1q_f32(ptr + 12, _p3);
ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 7 < size; i += 8)
{
Expand Down Expand Up @@ -208,6 +236,7 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt
float32x4_t _beta = vdupq_n_f32(beta);
for (; i + 15 < size; i += 16)
{
#if NCNN_GNU_INLINE_ASM
#if __aarch64__
asm volatile(
"prfm pldl1keep, [%0, #256] \n"
Expand Down Expand Up @@ -289,6 +318,35 @@ int HardSwish_arm::forward_inplace_bf16s(Mat& bottom_top_blob, const Option& opt
"w"(_beta) // %5
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif // __aarch64__
#else // NCNN_GNU_INLINE_ASM
uint16x8_t _p = vld1q_u16(ptr);
uint16x8_t _q = vld1q_u16(ptr + 8);
float32x4_t _p0 = bfloat2float(vget_low_u16(_p));
float32x4_t _p1 = bfloat2float(vget_high_u16(_p));
float32x4_t _p2 = bfloat2float(vget_low_u16(_q));
float32x4_t _p3 = bfloat2float(vget_high_u16(_q));
float32x4_t _ans0 = vmlaq_f32(_beta, _p0, _alpha);
float32x4_t _ans1 = vmlaq_f32(_beta, _p1, _alpha);
float32x4_t _ans2 = vmlaq_f32(_beta, _p2, _alpha);
float32x4_t _ans3 = vmlaq_f32(_beta, _p3, _alpha);
_ans0 = vmaxq_f32(_ans0, _zero);
_ans1 = vmaxq_f32(_ans1, _zero);
_ans2 = vmaxq_f32(_ans2, _zero);
_ans3 = vmaxq_f32(_ans3, _zero);
_ans0 = vminq_f32(_ans0, _one);
_ans1 = vminq_f32(_ans1, _one);
_ans2 = vminq_f32(_ans2, _one);
_ans3 = vminq_f32(_ans3, _one);
_p0 = vmulq_f32(_ans0, _p0);
_p1 = vmulq_f32(_ans1, _p1);
_p2 = vmulq_f32(_ans2, _p2);
_p3 = vmulq_f32(_ans3, _p3);
_p = vcombine_u16(float2bfloat(_p0), float2bfloat(_p1));
_q = vcombine_u16(float2bfloat(_p2), float2bfloat(_p3));
vst1q_u16(ptr, _p);
vst1q_u16(ptr + 8, _q);
ptr += 16;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 7 < size; i += 8)
{
Expand Down
28 changes: 28 additions & 0 deletions src/layer/arm/hardswish_arm_asimdhp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
int i = 0;
for (; i + 31 < size; i += 32)
{
#if NCNN_GNU_INLINE_ASM
asm volatile(
"prfm pldl1keep, [%0, #512] \n"
"ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [%0] \n"
Expand Down Expand Up @@ -142,6 +143,33 @@ int HardSwish_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op
"w"(_alpha), // %4
"w"(_beta) // %5
: "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else // NCNN_GNU_INLINE_ASM
float16x8_t _p0 = vld1q_f16(ptr);
float16x8_t _p1 = vld1q_f16(ptr + 8);
float16x8_t _p2 = vld1q_f16(ptr + 16);
float16x8_t _p3 = vld1q_f16(ptr + 24);
float16x8_t _ans0 = vfmaq_f16(_beta, _p0, _alpha);
float16x8_t _ans1 = vfmaq_f16(_beta, _p1, _alpha);
float16x8_t _ans2 = vfmaq_f16(_beta, _p2, _alpha);
float16x8_t _ans3 = vfmaq_f16(_beta, _p3, _alpha);
_ans0 = vmaxq_f16(_ans0, _zero);
_ans1 = vmaxq_f16(_ans1, _zero);
_ans2 = vmaxq_f16(_ans2, _zero);
_ans3 = vmaxq_f16(_ans3, _zero);
_ans0 = vminq_f16(_ans0, _one);
_ans1 = vminq_f16(_ans1, _one);
_ans2 = vminq_f16(_ans2, _one);
_ans3 = vminq_f16(_ans3, _one);
_p0 = vmulq_f16(_ans0, _p0);
_p1 = vmulq_f16(_ans1, _p1);
_p2 = vmulq_f16(_ans2, _p2);
_p3 = vmulq_f16(_ans3, _p3);
vst1q_f16(ptr, _p0);
vst1q_f16(ptr + 8, _p1);
vst1q_f16(ptr + 16, _p2);
vst1q_f16(ptr + 24, _p3);
ptr += 32;
#endif // NCNN_GNU_INLINE_ASM
}
for (; i + 15 < size; i += 16)
{
Expand Down
Loading

0 comments on commit d9832b4

Please sign in to comment.