diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h
index da68d4843535..1d0f7476321b 100644
--- a/src/layer/riscv/riscv_activation.h
+++ b/src/layer/riscv/riscv_activation.h
@@ -24,47 +24,47 @@
 #include "rvv_mathfun_fp16s.h"
 #endif
 
-#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN, STYPE) \
-    static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \
-    { \
-        if (activation_type == 1) \
-        { \
-            _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)0.f, vl); \
-        } \
-        else if (activation_type == 2) \
-        { \
-            vbool##MLEN##_t _lemask = __riscv_vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)0.f, vl); \
-            _v = __riscv_vfmul_vf_f##SEW##m##LMUL##_mu(_lemask, _v, _v, (STYPE)activation_params[0], vl); \
-        } \
-        else if (activation_type == 3) \
-        { \
-            _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[0], vl); \
-            _v = __riscv_vfmin_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[1], vl); \
-        } \
-        else if (activation_type == 4) \
-        { \
-            _v = sigmoid_ps(_v, vl); \
-        } \
-        else if (activation_type == 5) \
-        { \
-            _v = __riscv_vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(__riscv_vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), (STYPE)1.f, vl), vl), vl), vl); \
-        } \
-        else if (activation_type == 6) \
-        { \
-            const float alpha = activation_params[0]; \
-            const float beta = activation_params[1]; \
-            const float lower = -beta / alpha; \
-            const float upper = (1.f / alpha) + lower; \
-            vbool##MLEN##_t _lower = __riscv_vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)lower, vl); \
-            vbool##MLEN##_t _higher = __riscv_vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)upper, vl); \
-            vbool##MLEN##_t _apply = __riscv_vmnor_mm_b##MLEN(_lower, _higher, vl); \
-            _v = __riscv_vfmerge_vfm_f##SEW##m##LMUL(_v, (STYPE).0f, _lower, vl); \
-            \
+#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN, STYPE) \
+    static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \
+    { \
+        if (activation_type == 1) \
+        { \
+            _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)0.f, vl); \
+        } \
+        else if (activation_type == 2) \
+        { \
+            vbool##MLEN##_t _lemask = __riscv_vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)0.f, vl); \
+            _v = __riscv_vfmul_vf_f##SEW##m##LMUL##_mu(_lemask, _v, _v, (STYPE)activation_params[0], vl); \
+        } \
+        else if (activation_type == 3) \
+        { \
+            _v = __riscv_vfmax_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[0], vl); \
+            _v = __riscv_vfmin_vf_f##SEW##m##LMUL(_v, (STYPE)activation_params[1], vl); \
+        } \
+        else if (activation_type == 4) \
+        { \
+            _v = sigmoid_ps(_v, vl); \
+        } \
+        else if (activation_type == 5) \
+        { \
+            _v = __riscv_vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(__riscv_vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), (STYPE)1.f, vl), vl), vl), vl); \
+        } \
+        else if (activation_type == 6) \
+        { \
+            const float alpha = activation_params[0]; \
+            const float beta = activation_params[1]; \
+            const float lower = -beta / alpha; \
+            const float upper = (1.f / alpha) + lower; \
+            vbool##MLEN##_t _lower = __riscv_vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)lower, vl); \
+            vbool##MLEN##_t _higher = __riscv_vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, (STYPE)upper, vl); \
+            vbool##MLEN##_t _apply = __riscv_vmnor_mm_b##MLEN(_lower, _higher, vl); \
+            _v = __riscv_vfmerge_vfm_f##SEW##m##LMUL(_v, (STYPE).0f, _lower, vl); \
+            \
             vfloat##SEW##m##LMUL##_t _p0 = __riscv_vfadd_vf_f##SEW##m##LMUL##_m(_apply, __riscv_vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, (STYPE)alpha, vl), (STYPE)beta, vl); \
-            _v = __riscv_vfmul_vv_f##SEW##m##LMUL##_mu(_apply, _v, _v, _p0, vl); \
-        } \
-        \
-        return _v; \
+            _v = __riscv_vfmul_vv_f##SEW##m##LMUL##_mu(_apply, _v, _v, _p0, vl); \
+        } \
+        \
+        return _v; \
     }
 
 #if __riscv_zvfh
diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h
index 99a921745efc..08174608c4bc 100644
--- a/src/layer/riscv/rvv_mathfun_fp16s.h
+++ b/src/layer/riscv/rvv_mathfun_fp16s.h
@@ -34,8 +34,8 @@
 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \
     static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \
     { \
-        x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)0.f, vl); /* force flush to zero on denormal values */ \
-        vbool##MLEN##_t invalid_mask = __riscv_vmfle_vf_f16m##LMUL##_b##MLEN(x, (__fp16)0.f, vl); \
+        x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)0.f, vl); /* force flush to zero on denormal values */ \
+        vbool##MLEN##_t invalid_mask = __riscv_vmfle_vf_f16m##LMUL##_b##MLEN(x, (__fp16)0.f, vl); \
 \
         vint16m##LMUL##_t ux = __riscv_vreinterpret_v_f16m##LMUL##_i16m##LMUL(x); \
 \
@@ -49,47 +49,47 @@
         emm0 = __riscv_vsub_vx_i16m##LMUL(emm0, 0xf, vl); \
         vfloat16m##LMUL##_t e = __riscv_vfcvt_f_x_v_f16m##LMUL(emm0, vl); \
 \
-        e = __riscv_vfadd_vf_f16m##LMUL(e, (__fp16)1.f, vl); \
+        e = __riscv_vfadd_vf_f16m##LMUL(e, (__fp16)1.f, vl); \
 \
         /* part2: */ \
         /* if( x < SQRTHF ) { */ \
         /*   e -= 1; */ \
         /*   x = x + x - 1.0; */ \
         /* } else { x = x - 1.0; } */ \
-        vbool##MLEN##_t mask = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (__fp16)c_cephes_SQRTHF, vl); \
+        vbool##MLEN##_t mask = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (__fp16)c_cephes_SQRTHF, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL##_mu(mask, x, x, x, vl); \
-        x = __riscv_vfsub_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
-        e = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, e, e, (__fp16)1.f, vl); \
+        x = __riscv_vfsub_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
+        e = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, e, e, (__fp16)1.f, vl); \
 \
         vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \
 \
-        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_log_p0, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p1, vl); \
+        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_log_p0, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p1, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p2, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p2, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p3, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p3, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p4, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p4, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p5, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p5, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p6, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p6, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p7, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p7, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p8, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_log_p8, vl); \
         y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
 \
         y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
 \
-        vfloat16m##LMUL##_t tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q1, vl); \
+        vfloat16m##LMUL##_t tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q1, vl); \
         y = __riscv_vfadd_vv_f16m##LMUL(y, tmp, vl); \
 \
-        tmp = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)0.5f, vl); \
+        tmp = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)0.5f, vl); \
         y = __riscv_vfsub_vv_f16m##LMUL(y, tmp, vl); \
 \
-        tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q2, vl); \
+        tmp = __riscv_vfmul_vf_f16m##LMUL(e, (__fp16)c_cephes_log_q2, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL(x, y, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL(x, tmp, vl); \
         /* negative arg will be NAN */ \
@@ -117,54 +117,54 @@ _RVV_FLOAT16_LOG_OP(8, 2)
 #define c_cephes_exp_p4 1.6666665459E-1
 #define c_cephes_exp_p5 5.0000001201E-1
 
-#define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \
-    static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \
-    { \
-        vfloat16m##LMUL##_t tmp, fx; \
- \
-        x = __riscv_vfmin_vf_f16m##LMUL(x, (__fp16)c_exp_hi_f16, vl); \
-        x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)c_exp_lo_f16, vl); \
- \
-        /* express exp(x) as exp(g + n*log(2)) */ \
+#define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \
+    static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \
+    { \
+        vfloat16m##LMUL##_t tmp, fx; \
+ \
+        x = __riscv_vfmin_vf_f16m##LMUL(x, (__fp16)c_exp_hi_f16, vl); \
+        x = __riscv_vfmax_vf_f16m##LMUL(x, (__fp16)c_exp_lo_f16, vl); \
+ \
+        /* express exp(x) as exp(g + n*log(2)) */ \
         fx = __riscv_vfmacc_vf_f16m##LMUL(__riscv_vfmv_v_f_f16m##LMUL((__fp16)0.5f, vl), (__fp16)c_cephes_LOG2EF, x, vl); \
- \
-        /* perform a floorf */ \
-        tmp = __riscv_vfcvt_f_x_v_f16m##LMUL(__riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl), vl); \
- \
-        /* if greater, substract 1 */ \
-        vbool##MLEN##_t mask = __riscv_vmfgt_vv_f16m##LMUL##_b##MLEN(tmp, fx, vl); \
-        fx = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, tmp, tmp, (__fp16)1.f, vl); \
- \
-        tmp = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C1, vl); \
-        vfloat16m##LMUL##_t z = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C2, vl); \
-        x = __riscv_vfsub_vv_f16m##LMUL(x, tmp, vl); \
-        x = __riscv_vfsub_vv_f16m##LMUL(x, z, vl); \
- \
-        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_exp_p0, vl); \
-        z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \
- \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p1, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p2, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p3, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p4, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p5, vl); \
- \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vv_f16m##LMUL(y, x, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)1.f, vl); \
- \
-        /* build 2^n */ \
-        vint16m##LMUL##_t mm = __riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl); \
-        mm = __riscv_vadd_vx_i16m##LMUL(mm, 0xf, vl); \
-        mm = __riscv_vsll_vx_i16m##LMUL(mm, 10, vl); \
-        vfloat16m##LMUL##_t pow2n = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(mm); \
- \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, pow2n, vl); \
-        return y; \
+ \
+        /* perform a floorf */ \
+        tmp = __riscv_vfcvt_f_x_v_f16m##LMUL(__riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl), vl); \
+ \
+        /* if greater, substract 1 */ \
+        vbool##MLEN##_t mask = __riscv_vmfgt_vv_f16m##LMUL##_b##MLEN(tmp, fx, vl); \
+        fx = __riscv_vfsub_vf_f16m##LMUL##_mu(mask, tmp, tmp, (__fp16)1.f, vl); \
+ \
+        tmp = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C1, vl); \
+        vfloat16m##LMUL##_t z = __riscv_vfmul_vf_f16m##LMUL(fx, (__fp16)c_cephes_exp_C2, vl); \
+        x = __riscv_vfsub_vv_f16m##LMUL(x, tmp, vl); \
+        x = __riscv_vfsub_vv_f16m##LMUL(x, z, vl); \
+ \
+        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_exp_p0, vl); \
+        z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \
+ \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p1, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p2, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p3, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p4, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, x, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_cephes_exp_p5, vl); \
+ \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vv_f16m##LMUL(y, x, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)1.f, vl); \
+ \
+        /* build 2^n */ \
+        vint16m##LMUL##_t mm = __riscv_vfcvt_x_f_v_i16m##LMUL(fx, vl); \
+        mm = __riscv_vadd_vx_i16m##LMUL(mm, 0xf, vl); \
+        mm = __riscv_vsll_vx_i16m##LMUL(mm, 10, vl); \
+        vfloat16m##LMUL##_t pow2n = __riscv_vreinterpret_v_i16m##LMUL##_f16m##LMUL(mm); \
+ \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, pow2n, vl); \
+        return y; \
     }
 
 _RVV_FLOAT16_EXP_OP(1, 16)
@@ -192,11 +192,11 @@ _RVV_FLOAT16_EXP_OP(8, 2)
         vuint16m##LMUL##_t emm2; \
 \
         vbool##MLEN##_t sign_mask_sin, sign_mask_cos; \
-        sign_mask_sin = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (__fp16)0.f, vl); \
-        x = __riscv_vfsgnj_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
+        sign_mask_sin = __riscv_vmflt_vf_f16m##LMUL##_b##MLEN(x, (__fp16)0.f, vl); \
+        x = __riscv_vfsgnj_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
 \
         /* scale by 4/Pi */ \
-        y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_FOPI, vl); \
+        y = __riscv_vfmul_vf_f16m##LMUL(x, (__fp16)c_cephes_FOPI, vl); \
 \
         /* store the integer part of y in mm0 */ \
         emm2 = __riscv_vfcvt_xu_f_v_u16m##LMUL(y, vl); \
@@ -214,9 +214,9 @@ _RVV_FLOAT16_EXP_OP(8, 2)
 \
         /* The magic pass: "Extended precision modular arithmetic" */ \
         /* x = ((x - y * DP1) - y * DP2) - y * DP3; */ \
-        xmm1 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP1, vl); \
-        xmm2 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP2, vl); \
-        xmm3 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP3, vl); \
+        xmm1 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP1, vl); \
+        xmm2 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP2, vl); \
+        xmm3 = __riscv_vfmul_vf_f16m##LMUL(y, (__fp16)c_minus_cephes_DP3, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL(x, xmm1, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL(x, xmm2, vl); \
         x = __riscv_vfadd_vv_f16m##LMUL(x, xmm3, vl); \
@@ -229,21 +229,21 @@ _RVV_FLOAT16_EXP_OP(8, 2)
         vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x, x, vl); \
         vfloat16m##LMUL##_t y1, y2; \
 \
-        y1 = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_coscof_p0, vl); \
-        y2 = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_sincof_p0, vl); \
-        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)c_coscof_p1, vl); \
-        y2 = __riscv_vfadd_vf_f16m##LMUL(y2, (__fp16)c_sincof_p1, vl); \
+        y1 = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_coscof_p0, vl); \
+        y2 = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_sincof_p0, vl); \
+        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)c_coscof_p1, vl); \
+        y2 = __riscv_vfadd_vf_f16m##LMUL(y2, (__fp16)c_sincof_p1, vl); \
         y1 = __riscv_vfmul_vv_f16m##LMUL(y1, z, vl); \
         y2 = __riscv_vfmul_vv_f16m##LMUL(y2, z, vl); \
-        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)c_coscof_p2, vl); \
-        y2 = __riscv_vfadd_vf_f16m##LMUL(y2, (__fp16)c_sincof_p2, vl); \
+        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)c_coscof_p2, vl); \
+        y2 = __riscv_vfadd_vf_f16m##LMUL(y2, (__fp16)c_sincof_p2, vl); \
         y1 = __riscv_vfmul_vv_f16m##LMUL(y1, z, vl); \
         y2 = __riscv_vfmul_vv_f16m##LMUL(y2, z, vl); \
         y1 = __riscv_vfmul_vv_f16m##LMUL(y1, z, vl); \
         y2 = __riscv_vfmul_vv_f16m##LMUL(y2, x, vl); \
-        y1 = __riscv_vfsub_vv_f16m##LMUL(y1, __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)0.5f, vl), vl); \
+        y1 = __riscv_vfsub_vv_f16m##LMUL(y1, __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)0.5f, vl), vl); \
         y2 = __riscv_vfadd_vv_f16m##LMUL(y2, x, vl); \
-        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)1.f, vl); \
+        y1 = __riscv_vfadd_vf_f16m##LMUL(y1, (__fp16)1.f, vl); \
 \
         /* select the correct result from the two polynoms */ \
         vfloat16m##LMUL##_t ys = __riscv_vmerge_vvm_f16m##LMUL(y2, y1, poly_mask, vl); \
@@ -299,53 +299,53 @@ _RVV_FLOAT16_COS_OP(8, 2)
 #define c_tanh_beta_4 1.18534705686654e-4f
 #define c_tanh_beta_6 1.19825839466702e-6f
 
-#define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \
-    static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \
-    { \
-        vfloat16m##LMUL##_t x2 = __riscv_vfsgnj_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
- \
-        vbool##MLEN##_t tiny_mask = __riscv_vmfge_vf_f16m##LMUL##_b##MLEN(x2, (__fp16)c_tanh_tiny, vl); \
- \
-        /* clamp the inputs to the range [-9, 9] since anything outside */ \
-        /* this range is -/+1.0f in single-precision. */ \
-        x2 = __riscv_vfmin_vf_f16m##LMUL(x2, (__fp16)c_tanh_hi, vl); \
- \
-        /* since the polynomials are odd/even, we need x**2. */ \
-        vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x2, x2, vl); \
- \
-        /* evaluate the numerator polynomial y. */ \
-        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_tanh_alpha_13, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_11, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_9, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_7, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_5, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_3, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
-        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_1, vl); \
-        y = __riscv_vfmul_vv_f16m##LMUL(y, x2, vl); \
- \
-        /* evaluate the denominator polynomial w. */ \
-        vfloat16m##LMUL##_t w = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_tanh_beta_6, vl); \
-        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_4, vl); \
-        w = __riscv_vfmul_vv_f16m##LMUL(w, z, vl); \
-        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_2, vl); \
-        w = __riscv_vfmul_vv_f16m##LMUL(w, z, vl); \
-        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_0, vl); \
- \
-        /* divide the numerator by the denominator. */ \
-        y = __riscv_vfdiv_vv_f16m##LMUL(y, w, vl); \
- \
-        /* reinstate the sign. */ \
-        y = __riscv_vfsgnj_vv_f16m##LMUL(y, x, vl); \
- \
-        /* when the argument is very small in magnitude it's more accurate to just return it. */ \
-        y = __riscv_vmerge_vvm_f16m##LMUL(x, y, tiny_mask, vl); \
- \
-        return y; \
+#define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \
+    static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \
+    { \
+        vfloat16m##LMUL##_t x2 = __riscv_vfsgnj_vf_f16m##LMUL(x, (__fp16)1.f, vl); \
+ \
+        vbool##MLEN##_t tiny_mask = __riscv_vmfge_vf_f16m##LMUL##_b##MLEN(x2, (__fp16)c_tanh_tiny, vl); \
+ \
+        /* clamp the inputs to the range [-9, 9] since anything outside */ \
+        /* this range is -/+1.0f in single-precision. */ \
+        x2 = __riscv_vfmin_vf_f16m##LMUL(x2, (__fp16)c_tanh_hi, vl); \
+ \
+        /* since the polynomials are odd/even, we need x**2. */ \
+        vfloat16m##LMUL##_t z = __riscv_vfmul_vv_f16m##LMUL(x2, x2, vl); \
+ \
+        /* evaluate the numerator polynomial y. */ \
+        vfloat16m##LMUL##_t y = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_tanh_alpha_13, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_11, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_9, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_7, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_5, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_3, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, z, vl); \
+        y = __riscv_vfadd_vf_f16m##LMUL(y, (__fp16)c_tanh_alpha_1, vl); \
+        y = __riscv_vfmul_vv_f16m##LMUL(y, x2, vl); \
+ \
+        /* evaluate the denominator polynomial w. */ \
+        vfloat16m##LMUL##_t w = __riscv_vfmul_vf_f16m##LMUL(z, (__fp16)c_tanh_beta_6, vl); \
+        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_4, vl); \
+        w = __riscv_vfmul_vv_f16m##LMUL(w, z, vl); \
+        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_2, vl); \
+        w = __riscv_vfmul_vv_f16m##LMUL(w, z, vl); \
+        w = __riscv_vfadd_vf_f16m##LMUL(w, (__fp16)c_tanh_beta_0, vl); \
+ \
+        /* divide the numerator by the denominator. */ \
+        y = __riscv_vfdiv_vv_f16m##LMUL(y, w, vl); \
+ \
+        /* reinstate the sign. */ \
+        y = __riscv_vfsgnj_vv_f16m##LMUL(y, x, vl); \
+ \
+        /* when the argument is very small in magnitude it's more accurate to just return it. */ \
+        y = __riscv_vmerge_vvm_f16m##LMUL(x, y, tiny_mask, vl); \
+ \
+        return y; \
     }
 
 _RVV_FLOAT16_TANH_OP(1, 16)
@@ -366,28 +366,28 @@ _RVV_FLOAT16_POW_OP(4, 4)
 _RVV_FLOAT16_POW_OP(8, 2)
 
 #if __riscv_xtheadvector
-#define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \
-    static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \
-    { \
-        _v = __riscv_vfneg_v_f16m##LMUL(_v, vl); \
-        _v = exp_ps(_v, vl); \
+#define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \
+    static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \
+    { \
+        _v = __riscv_vfneg_v_f16m##LMUL(_v, vl); \
+        _v = exp_ps(_v, vl); \
         _v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
         vfloat16m##LMUL##_t _reciprocal = __riscv_vfrdiv_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
         _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \
         /* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \
-        return _reciprocal; \
+        return _reciprocal; \
     }
 #else // __riscv_xtheadvector
-#define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \
-    static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \
-    { \
-        _v = __riscv_vfneg_v_f16m##LMUL(_v, vl); \
-        _v = exp_ps(_v, vl); \
+#define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \
+    static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \
+    { \
+        _v = __riscv_vfneg_v_f16m##LMUL(_v, vl); \
+        _v = exp_ps(_v, vl); \
         _v = __riscv_vfadd_vf_f16m##LMUL(_v, (__fp16)1.f, vl); \
-        vfloat16m##LMUL##_t _reciprocal = __riscv_vfrec7_v_f16m##LMUL(_v, vl); \
+        vfloat16m##LMUL##_t _reciprocal = __riscv_vfrec7_v_f16m##LMUL(_v, vl); \
         _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); \
         /* _reciprocal = __riscv_vfmul_vv_f16m##LMUL(__riscv_vfrsub_vf_f16m##LMUL(__riscv_vfmul_vv_f16m##LMUL(_v, _reciprocal, vl), (__fp16)2.f, vl), _reciprocal, vl); */ \
-        return _reciprocal; \
+        return _reciprocal; \
     }
 #endif // __riscv_xtheadvector
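For reviewers unfamiliar with the activation_type numbering in `activation_ps`: 1 is ReLU, 2 is leaky ReLU, 3 is clip, 4 is sigmoid, 5 is mish (x * tanh(log(1 + exp(x)))), and 6 is hardswish. The type-6 branch is the least obvious one since it is built from three masks; here is a minimal scalar sketch of what the masked vector sequence computes (the helper name is ours, for illustration only, not part of the patch):

```cpp
// Scalar reference for the activation_type == 6 branch above.
static inline float hardswish_ref(float v, float alpha, float beta)
{
    const float lower = -beta / alpha;
    const float upper = (1.f / alpha) + lower;
    if (v < lower) return 0.f;     // _lower mask: vfmerge writes 0
    if (v > upper) return v;       // _higher mask: lane left untouched
    return v * (v * alpha + beta); // _apply mask: _p0, then vfmul_vv_mu
}
```

With alpha = 1/6 and beta = 0.5 this reduces to the usual hardswish x * relu6(x + 3) / 6.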
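The only fp16-specific constants in `exp_ps` are in the "build 2^n" step: `0xf` is the IEEE binary16 exponent bias (15, versus `0x7f` in the fp32 version of this file) and `10` is the mantissa width (versus 23). A scalar sketch of the trick, assuming an IEEE binary16 layout:

```cpp
#include <stdint.h>

// Build the binary16 bit pattern of 2^n by placing the biased exponent in
// bits [14:10]; this mirrors vadd_vx(mm, 0xf) followed by vsll_vx(mm, 10).
// Valid for normal results, i.e. -14 <= n <= 15.
static inline uint16_t fp16_pow2n_bits(int n)
{
    return (uint16_t)((n + 0xf) << 10);
}
```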
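In `sigmoid_ps`, the default path keeps `vfrec7`, whose reciprocal estimate is accurate to only about 7 bits, and refines it with one Newton-Raphson step; the xtheadvector path cannot use `vfrec7` and computes the reciprocal with a full `vfrdiv` instead, after which the same refinement step is harmless. A scalar sketch of the iteration:

```cpp
// One Newton-Raphson step for r ~= 1/d: r' = r * (2 - d * r).
// This is what vfmul_vv(vfrsub_vf(vfmul_vv(_v, _reciprocal), 2), _reciprocal)
// computes lane by lane; each step roughly doubles the number of correct bits.
static inline float refine_reciprocal(float d, float r)
{
    return r * (2.f - d * r);
}
```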