diff --git a/src/layer/arm/prelu_arm_asimdhp.cpp b/src/layer/arm/prelu_arm_asimdhp.cpp index 776cd56394b0..e0efd6f07d90 100644 --- a/src/layer/arm/prelu_arm_asimdhp.cpp +++ b/src/layer/arm/prelu_arm_asimdhp.cpp @@ -342,10 +342,14 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c if (elempack == 4) { - float16x4_t _zero = vdup_n_f16(0.f); - if (dims == 1) { +#if _MSC_VER + float16x8_t _zero = vdupq_n_f16(0.f); +#else + float16x4_t _zero = vdup_n_f16(0.f); +#endif + int w = bottom_top_blob.w; if (num_slope > 1) @@ -359,7 +363,11 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c float16x4_t _p = vld1_f16(ptr); float16x4_t _slope = vcvt_f16_f32(vld1q_f32(slope + i * 4)); +#if _MSC_VER + uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero)); +#else uint16x4_t _lemask = vcle_f16(_p, _zero); +#endif float16x4_t _ps = vmul_f16(_p, _slope); _p = vbsl_f16(_lemask, _ps, _p); vst1_f16(ptr, _p); @@ -368,7 +376,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c else { #if _MSC_VER - float16x4_t _slope = vcvt_f16_f32(vdupq_n_f32(slope_data[0])); + float16x8_t _slope = vdupq_n_f16((__fp16)slope_data[0]); #else float16x4_t _slope = vdup_n_f16((__fp16)slope_data[0]); #endif @@ -379,8 +387,13 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c __fp16* ptr = (__fp16*)bottom_top_blob + i * 4; float16x4_t _p = vld1_f16(ptr); +#if _MSC_VER + uint16x4_t _lemask = vcle_f16(_p, vget_low_f16(_zero)); + float16x4_t _ps = vmul_f16(_p, vget_low_f16(_slope)); +#else uint16x4_t _lemask = vcle_f16(_p, _zero); float16x4_t _ps = vmul_f16(_p, _slope); +#endif _p = vbsl_f16(_lemask, _ps, _p); vst1_f16(ptr, _p); } @@ -396,6 +409,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c for (int i = 0; i < h; i++) { __fp16* ptr = bottom_top_blob.row<__fp16>(i); + float16x4_t _zero = vdup_n_f16(0.f); float16x4_t _slope = num_slope > 1 ? vcvt_f16_f32(vld1q_f32((const float*)slope_data + i * 4)) : vdup_n_f16((__fp16)slope_data[0]); for (int j = 0; j < w; j++) @@ -422,6 +436,7 @@ int PReLU_arm::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) c for (int q = 0; q < channels; q++) { __fp16* ptr = bottom_top_blob.channel(q); + float16x4_t _zero = vdup_n_f16(0.f); float16x4_t _slope = num_slope > 1 ? vcvt_f16_f32(vld1q_f32((const float*)slope_data + q * 4)) : vdup_n_f16((__fp16)slope_data[0]); for (int i = 0; i < size; i++)