diff --git a/sycl/include/sycl/builtins.hpp b/sycl/include/sycl/builtins.hpp index f6eb3612ce5e5..aa1e1c2e85409 100644 --- a/sycl/include/sycl/builtins.hpp +++ b/sycl/include/sycl/builtins.hpp @@ -18,7 +18,15 @@ #define __NOEXC /*noexcept*/ namespace sycl { + __SYCL_INLINE_VER_NAMESPACE(_V1) { + +namespace detail { +template vec to_vec2(marray x, size_t start) { + return {x[start], x[start + 1]}; +} +} // namespace detail + #ifdef __SYCL_DEVICE_ONLY__ #define __sycl_std #else @@ -27,236 +35,392 @@ namespace __sycl_std = __host_std; #ifdef __FAST_MATH__ #define __FAST_MATH_GENFLOAT(T) \ - (detail::is_genfloatd::value || detail::is_genfloath::value) + (detail::is_svgenfloatd::value || detail::is_svgenfloath::value) +#define __FAST_MATH_SGENFLOAT(T) \ + (std::is_same_v || std::is_same_v) #else -#define __FAST_MATH_GENFLOAT(T) (detail::is_genfloat::value) +#define __FAST_MATH_GENFLOAT(T) (detail::is_svgenfloat::value) +#define __FAST_MATH_SGENFLOAT(T) (detail::is_sgenfloat::value) #endif /* ----------------- 4.13.3 Math functions. ---------------------------------*/ -// genfloat acos (genfloat x) -template -detail::enable_if_t::value, T> acos(T x) __NOEXC { + +// These macros for marray math function implementations use vectorizations of +// size two as a simple general optimization. A more complex implementation +// using larger vectorizations for large marray sizes is possible; however more +// testing is required in order to ascertain the performance implications for +// all backends. +#define __SYCL_MATH_FUNCTION_OVERLOAD_IMPL(NAME) \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + vec partial_res = \ + __sycl_std::__invoke_##NAME>(detail::to_vec2(x, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = __sycl_std::__invoke_##NAME(x[N - 1]); \ + } \ + return res; + +#define __SYCL_MATH_FUNCTION_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, marray> \ + NAME(marray x) __NOEXC { \ + __SYCL_MATH_FUNCTION_OVERLOAD_IMPL(NAME) \ + } + +__SYCL_MATH_FUNCTION_OVERLOAD(cospi) +__SYCL_MATH_FUNCTION_OVERLOAD(sinpi) +__SYCL_MATH_FUNCTION_OVERLOAD(tanpi) +__SYCL_MATH_FUNCTION_OVERLOAD(sinh) +__SYCL_MATH_FUNCTION_OVERLOAD(cosh) +__SYCL_MATH_FUNCTION_OVERLOAD(tanh) +__SYCL_MATH_FUNCTION_OVERLOAD(asin) +__SYCL_MATH_FUNCTION_OVERLOAD(acos) +__SYCL_MATH_FUNCTION_OVERLOAD(atan) +__SYCL_MATH_FUNCTION_OVERLOAD(asinpi) +__SYCL_MATH_FUNCTION_OVERLOAD(acospi) +__SYCL_MATH_FUNCTION_OVERLOAD(atanpi) +__SYCL_MATH_FUNCTION_OVERLOAD(asinh) +__SYCL_MATH_FUNCTION_OVERLOAD(acosh) +__SYCL_MATH_FUNCTION_OVERLOAD(atanh) +__SYCL_MATH_FUNCTION_OVERLOAD(cbrt) +__SYCL_MATH_FUNCTION_OVERLOAD(ceil) +__SYCL_MATH_FUNCTION_OVERLOAD(floor) +__SYCL_MATH_FUNCTION_OVERLOAD(erfc) +__SYCL_MATH_FUNCTION_OVERLOAD(erf) +__SYCL_MATH_FUNCTION_OVERLOAD(expm1) +__SYCL_MATH_FUNCTION_OVERLOAD(tgamma) +__SYCL_MATH_FUNCTION_OVERLOAD(lgamma) +__SYCL_MATH_FUNCTION_OVERLOAD(log1p) +__SYCL_MATH_FUNCTION_OVERLOAD(logb) +__SYCL_MATH_FUNCTION_OVERLOAD(rint) +__SYCL_MATH_FUNCTION_OVERLOAD(round) +__SYCL_MATH_FUNCTION_OVERLOAD(trunc) + +#undef __SYCL_MATH_FUNCTION_OVERLOAD + +// __SYCL_MATH_FUNCTION_OVERLOAD_FM cases are replaced by corresponding native +// implementations when the -ffast-math flag is used with float. +#define __SYCL_MATH_FUNCTION_OVERLOAD_FM(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t<__FAST_MATH_SGENFLOAT(T), marray> \ + NAME(marray x) __NOEXC { \ + __SYCL_MATH_FUNCTION_OVERLOAD_IMPL(NAME) \ + } + +__SYCL_MATH_FUNCTION_OVERLOAD_FM(sin) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(cos) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(tan) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp2) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp10) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log2) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log10) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(sqrt) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(rsqrt) + +#undef __SYCL_MATH_FUNCTION_OVERLOAD_FM +#undef __SYCL_MATH_FUNCTION_OVERLOAD_IMPL + +#define __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL(NAME) \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_##NAME>( \ + detail::to_vec2(x, i * 2), detail::to_vec2(y, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = __sycl_std::__invoke_##NAME(x[N - 1], y[N - 1]); \ + } \ + return res; + +#define __SYCL_MATH_FUNCTION_2_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, marray> \ + NAME(marray x, marray y) __NOEXC { \ + __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL(NAME) \ + } + +__SYCL_MATH_FUNCTION_2_OVERLOAD(atan2) +__SYCL_MATH_FUNCTION_2_OVERLOAD(atan2pi) +__SYCL_MATH_FUNCTION_2_OVERLOAD(copysign) +__SYCL_MATH_FUNCTION_2_OVERLOAD(fdim) +__SYCL_MATH_FUNCTION_2_OVERLOAD(fmin) +__SYCL_MATH_FUNCTION_2_OVERLOAD(fmax) +__SYCL_MATH_FUNCTION_2_OVERLOAD(fmod) +__SYCL_MATH_FUNCTION_2_OVERLOAD(hypot) +__SYCL_MATH_FUNCTION_2_OVERLOAD(maxmag) +__SYCL_MATH_FUNCTION_2_OVERLOAD(minmag) +__SYCL_MATH_FUNCTION_2_OVERLOAD(nextafter) +__SYCL_MATH_FUNCTION_2_OVERLOAD(pow) +__SYCL_MATH_FUNCTION_2_OVERLOAD(remainder) + +#undef __SYCL_MATH_FUNCTION_2_OVERLOAD + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t<__FAST_MATH_SGENFLOAT(T), marray> + powr(marray x, + marray y) __NOEXC{__SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL(powr)} + +#undef __SYCL_MATH_FUNCTION_2_OVERLOAD_IMPL + +#define __SYCL_MATH_FUNCTION_3_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t::value, marray> \ + NAME(marray x, marray y, marray z) __NOEXC { \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_##NAME>( \ + detail::to_vec2(x, i * 2), detail::to_vec2(y, i * 2), \ + detail::to_vec2(z, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = \ + __sycl_std::__invoke_##NAME(x[N - 1], y[N - 1], z[N - 1]); \ + } \ + return res; \ + } + +__SYCL_MATH_FUNCTION_3_OVERLOAD(mad) __SYCL_MATH_FUNCTION_3_OVERLOAD(mix) + __SYCL_MATH_FUNCTION_3_OVERLOAD(fma) + +#undef __SYCL_MATH_FUNCTION_3_OVERLOAD + + // svgenfloat acos (svgenfloat x) + template + detail::enable_if_t::value, T> acos(T x) __NOEXC { return __sycl_std::__invoke_acos(x); } -// genfloat acosh (genfloat x) +// svgenfloat acosh (svgenfloat x) template -detail::enable_if_t::value, T> acosh(T x) __NOEXC { +detail::enable_if_t::value, T> acosh(T x) __NOEXC { return __sycl_std::__invoke_acosh(x); } -// genfloat acospi (genfloat x) +// svgenfloat acospi (svgenfloat x) template -detail::enable_if_t::value, T> acospi(T x) __NOEXC { +detail::enable_if_t::value, T> acospi(T x) __NOEXC { return __sycl_std::__invoke_acospi(x); } -// genfloat asin (genfloat x) +// svgenfloat asin (svgenfloat x) template -detail::enable_if_t::value, T> asin(T x) __NOEXC { +detail::enable_if_t::value, T> asin(T x) __NOEXC { return __sycl_std::__invoke_asin(x); } -// genfloat asinh (genfloat x) +// svgenfloat asinh (svgenfloat x) template -detail::enable_if_t::value, T> asinh(T x) __NOEXC { +detail::enable_if_t::value, T> asinh(T x) __NOEXC { return __sycl_std::__invoke_asinh(x); } -// genfloat asinpi (genfloat x) +// svgenfloat asinpi (svgenfloat x) template -detail::enable_if_t::value, T> asinpi(T x) __NOEXC { +detail::enable_if_t::value, T> asinpi(T x) __NOEXC { return __sycl_std::__invoke_asinpi(x); } -// genfloat atan (genfloat y_over_x) +// svgenfloat atan (svgenfloat y_over_x) template -detail::enable_if_t::value, T> atan(T y_over_x) __NOEXC { +detail::enable_if_t::value, T> +atan(T y_over_x) __NOEXC { return __sycl_std::__invoke_atan(y_over_x); } -// genfloat atan2 (genfloat y, genfloat x) +// svgenfloat atan2 (svgenfloat y, svgenfloat x) template -detail::enable_if_t::value, T> atan2(T y, T x) __NOEXC { +detail::enable_if_t::value, T> atan2(T y, + T x) __NOEXC { return __sycl_std::__invoke_atan2(y, x); } -// genfloat atanh (genfloat x) +// svgenfloat atanh (svgenfloat x) template -detail::enable_if_t::value, T> atanh(T x) __NOEXC { +detail::enable_if_t::value, T> atanh(T x) __NOEXC { return __sycl_std::__invoke_atanh(x); } -// genfloat atanpi (genfloat x) +// svgenfloat atanpi (svgenfloat x) template -detail::enable_if_t::value, T> atanpi(T x) __NOEXC { +detail::enable_if_t::value, T> atanpi(T x) __NOEXC { return __sycl_std::__invoke_atanpi(x); } -// genfloat atan2pi (genfloat y, genfloat x) +// svgenfloat atan2pi (svgenfloat y, svgenfloat x) template -detail::enable_if_t::value, T> atan2pi(T y, - T x) __NOEXC { +detail::enable_if_t::value, T> atan2pi(T y, + T x) __NOEXC { return __sycl_std::__invoke_atan2pi(y, x); } -// genfloat cbrt (genfloat x) +// svgenfloat cbrt (svgenfloat x) template -detail::enable_if_t::value, T> cbrt(T x) __NOEXC { +detail::enable_if_t::value, T> cbrt(T x) __NOEXC { return __sycl_std::__invoke_cbrt(x); } -// genfloat ceil (genfloat x) +// svgenfloat ceil (svgenfloat x) template -detail::enable_if_t::value, T> ceil(T x) __NOEXC { +detail::enable_if_t::value, T> ceil(T x) __NOEXC { return __sycl_std::__invoke_ceil(x); } -// genfloat copysign (genfloat x, genfloat y) +// svgenfloat copysign (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> copysign(T x, - T y) __NOEXC { +detail::enable_if_t::value, T> copysign(T x, + T y) __NOEXC { return __sycl_std::__invoke_copysign(x, y); } -// genfloat cos (genfloat x) +// svgenfloat cos (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> cos(T x) __NOEXC { return __sycl_std::__invoke_cos(x); } -// genfloat cosh (genfloat x) +// svgenfloat cosh (svgenfloat x) template -detail::enable_if_t::value, T> cosh(T x) __NOEXC { +detail::enable_if_t::value, T> cosh(T x) __NOEXC { return __sycl_std::__invoke_cosh(x); } -// genfloat cospi (genfloat x) +// svgenfloat cospi (svgenfloat x) template -detail::enable_if_t::value, T> cospi(T x) __NOEXC { +detail::enable_if_t::value, T> cospi(T x) __NOEXC { return __sycl_std::__invoke_cospi(x); } -// genfloat erfc (genfloat x) +// svgenfloat erfc (svgenfloat x) template -detail::enable_if_t::value, T> erfc(T x) __NOEXC { +detail::enable_if_t::value, T> erfc(T x) __NOEXC { return __sycl_std::__invoke_erfc(x); } -// genfloat erf (genfloat x) +// svgenfloat erf (svgenfloat x) template -detail::enable_if_t::value, T> erf(T x) __NOEXC { +detail::enable_if_t::value, T> erf(T x) __NOEXC { return __sycl_std::__invoke_erf(x); } -// genfloat exp (genfloat x ) +// svgenfloat exp (svgenfloat x ) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp(T x) __NOEXC { return __sycl_std::__invoke_exp(x); } -// genfloat exp2 (genfloat x) +// svgenfloat exp2 (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp2(T x) __NOEXC { return __sycl_std::__invoke_exp2(x); } -// genfloat exp10 (genfloat x) +// svgenfloat exp10 (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> exp10(T x) __NOEXC { return __sycl_std::__invoke_exp10(x); } -// genfloat expm1 (genfloat x) +// svgenfloat expm1 (svgenfloat x) template -detail::enable_if_t::value, T> expm1(T x) __NOEXC { +detail::enable_if_t::value, T> expm1(T x) __NOEXC { return __sycl_std::__invoke_expm1(x); } -// genfloat fabs (genfloat x) +// svgenfloat fabs (svgenfloat x) template -detail::enable_if_t::value, T> fabs(T x) __NOEXC { +detail::enable_if_t::value, T> fabs(T x) __NOEXC { return __sycl_std::__invoke_fabs(x); } -// genfloat fdim (genfloat x, genfloat y) +// svgenfloat fdim (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> fdim(T x, T y) __NOEXC { +detail::enable_if_t::value, T> fdim(T x, T y) __NOEXC { return __sycl_std::__invoke_fdim(x, y); } -// genfloat floor (genfloat x) +// svgenfloat floor (svgenfloat x) template -detail::enable_if_t::value, T> floor(T x) __NOEXC { +detail::enable_if_t::value, T> floor(T x) __NOEXC { return __sycl_std::__invoke_floor(x); } -// genfloat fma (genfloat a, genfloat b, genfloat c) +// svgenfloat fma (svgenfloat a, svgenfloat b, svgenfloat c) template -detail::enable_if_t::value, T> fma(T a, T b, - T c) __NOEXC { +detail::enable_if_t::value, T> fma(T a, T b, + T c) __NOEXC { return __sycl_std::__invoke_fma(a, b, c); } -// genfloat fmax (genfloat x, genfloat y) +// svgenfloat fmax (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> fmax(T x, T y) __NOEXC { +detail::enable_if_t::value, T> fmax(T x, T y) __NOEXC { return __sycl_std::__invoke_fmax(x, y); } -// genfloat fmax (genfloat x, sgenfloat y) +// svgenfloat fmax (svgenfloat x, sgenfloat y) template detail::enable_if_t::value, T> fmax(T x, typename T::element_type y) __NOEXC { return __sycl_std::__invoke_fmax(x, T(y)); } -// genfloat fmin (genfloat x, genfloat y) +// svgenfloat fmin (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> fmin(T x, T y) __NOEXC { +detail::enable_if_t::value, T> fmin(T x, T y) __NOEXC { return __sycl_std::__invoke_fmin(x, y); } -// genfloat fmin (genfloat x, sgenfloat y) +// svgenfloat fmin (svgenfloat x, sgenfloat y) template detail::enable_if_t::value, T> fmin(T x, typename T::element_type y) __NOEXC { return __sycl_std::__invoke_fmin(x, T(y)); } -// genfloat fmod (genfloat x, genfloat y) +// svgenfloat fmod (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> fmod(T x, T y) __NOEXC { +detail::enable_if_t::value, T> fmod(T x, T y) __NOEXC { return __sycl_std::__invoke_fmod(x, y); } -// genfloat fract (genfloat x, genfloatptr iptr) +// svgenfloat fract (svgenfloat x, genfloatptr iptr) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genfloatptr::value, T> + detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> fract(T x, T2 iptr) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_fract(x, iptr); } -// genfloat frexp (genfloat x, genintptr exp) +// svgenfloat frexp (svgenfloat x, genintptr exp) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genintptr::value, T> + detail::is_svgenfloat::value && detail::is_genintptr::value, T> frexp(T x, T2 exp) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_frexp(x, exp); } -// genfloat hypot (genfloat x, genfloat y) +// svgenfloat hypot (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> hypot(T x, T y) __NOEXC { +detail::enable_if_t::value, T> hypot(T x, + T y) __NOEXC { return __sycl_std::__invoke_hypot(x, y); } -// genint ilogb (genfloat x) +// genint ilogb (svgenfloat x) template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::change_base_type_t ilogb(T x) __NOEXC { return __sycl_std::__invoke_ilogb>(x); } @@ -286,74 +450,76 @@ ldexp(T x, T2 k) __NOEXC { return __sycl_std::__invoke_ldexp(x, k); } -// genfloat lgamma (genfloat x) +// svgenfloat lgamma (svgenfloat x) template -detail::enable_if_t::value, T> lgamma(T x) __NOEXC { +detail::enable_if_t::value, T> lgamma(T x) __NOEXC { return __sycl_std::__invoke_lgamma(x); } -// genfloat lgamma_r (genfloat x, genintptr signp) +// svgenfloat lgamma_r (svgenfloat x, genintptr signp) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genintptr::value, T> + detail::is_svgenfloat::value && detail::is_genintptr::value, T> lgamma_r(T x, T2 signp) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_lgamma_r(x, signp); } -// genfloat log (genfloat x) +// svgenfloat log (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log(T x) __NOEXC { return __sycl_std::__invoke_log(x); } -// genfloat log2 (genfloat x) +// svgenfloat log2 (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log2(T x) __NOEXC { return __sycl_std::__invoke_log2(x); } -// genfloat log10 (genfloat x) +// svgenfloat log10 (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> log10(T x) __NOEXC { return __sycl_std::__invoke_log10(x); } -// genfloat log1p (genfloat x) +// svgenfloat log1p (svgenfloat x) template -detail::enable_if_t::value, T> log1p(T x) __NOEXC { +detail::enable_if_t::value, T> log1p(T x) __NOEXC { return __sycl_std::__invoke_log1p(x); } -// genfloat logb (genfloat x) +// svgenfloat logb (svgenfloat x) template -detail::enable_if_t::value, T> logb(T x) __NOEXC { +detail::enable_if_t::value, T> logb(T x) __NOEXC { return __sycl_std::__invoke_logb(x); } -// genfloat mad (genfloat a, genfloat b, genfloat c) +// svgenfloat mad (svgenfloat a, svgenfloat b, svgenfloat c) template -detail::enable_if_t::value, T> mad(T a, T b, - T c) __NOEXC { +detail::enable_if_t::value, T> mad(T a, T b, + T c) __NOEXC { return __sycl_std::__invoke_mad(a, b, c); } -// genfloat maxmag (genfloat x, genfloat y) +// svgenfloat maxmag (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> maxmag(T x, T y) __NOEXC { +detail::enable_if_t::value, T> maxmag(T x, + T y) __NOEXC { return __sycl_std::__invoke_maxmag(x, y); } -// genfloat minmag (genfloat x, genfloat y) +// svgenfloat minmag (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> minmag(T x, T y) __NOEXC { +detail::enable_if_t::value, T> minmag(T x, + T y) __NOEXC { return __sycl_std::__invoke_minmag(x, y); } -// genfloat modf (genfloat x, genfloatptr iptr) +// svgenfloat modf (svgenfloat x, genfloatptr iptr) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genfloatptr::value, T> + detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> modf(T x, T2 iptr) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_modf(x, iptr); @@ -366,151 +532,151 @@ detail::nan_return_t nan(T nancode) __NOEXC { detail::convert_data_type>()(nancode)); } -// genfloat nextafter (genfloat x, genfloat y) +// svgenfloat nextafter (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> nextafter(T x, - T y) __NOEXC { +detail::enable_if_t::value, T> nextafter(T x, + T y) __NOEXC { return __sycl_std::__invoke_nextafter(x, y); } -// genfloat pow (genfloat x, genfloat y) +// svgenfloat pow (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> pow(T x, T y) __NOEXC { +detail::enable_if_t::value, T> pow(T x, T y) __NOEXC { return __sycl_std::__invoke_pow(x, y); } -// genfloat pown (genfloat x, genint y) +// svgenfloat pown (svgenfloat x, genint y) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genint::value, T> + detail::is_svgenfloat::value && detail::is_genint::value, T> pown(T x, T2 y) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_pown(x, y); } -// genfloat powr (genfloat x, genfloat y) +// svgenfloat powr (svgenfloat x, svgenfloat y) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> powr(T x, T y) __NOEXC { return __sycl_std::__invoke_powr(x, y); } -// genfloat remainder (genfloat x, genfloat y) +// svgenfloat remainder (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T> remainder(T x, - T y) __NOEXC { +detail::enable_if_t::value, T> remainder(T x, + T y) __NOEXC { return __sycl_std::__invoke_remainder(x, y); } -// genfloat remquo (genfloat x, genfloat y, genintptr quo) +// svgenfloat remquo (svgenfloat x, svgenfloat y, genintptr quo) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genintptr::value, T> + detail::is_svgenfloat::value && detail::is_genintptr::value, T> remquo(T x, T y, T2 quo) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_remquo(x, y, quo); } -// genfloat rint (genfloat x) +// svgenfloat rint (svgenfloat x) template -detail::enable_if_t::value, T> rint(T x) __NOEXC { +detail::enable_if_t::value, T> rint(T x) __NOEXC { return __sycl_std::__invoke_rint(x); } -// genfloat rootn (genfloat x, genint y) +// svgenfloat rootn (svgenfloat x, genint y) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genint::value, T> + detail::is_svgenfloat::value && detail::is_genint::value, T> rootn(T x, T2 y) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_rootn(x, y); } -// genfloat round (genfloat x) +// svgenfloat round (svgenfloat x) template -detail::enable_if_t::value, T> round(T x) __NOEXC { +detail::enable_if_t::value, T> round(T x) __NOEXC { return __sycl_std::__invoke_round(x); } -// genfloat rsqrt (genfloat x) +// svgenfloat rsqrt (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> rsqrt(T x) __NOEXC { return __sycl_std::__invoke_rsqrt(x); } -// genfloat sin (genfloat x) +// svgenfloat sin (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sin(T x) __NOEXC { return __sycl_std::__invoke_sin(x); } -// genfloat sincos (genfloat x, genfloatptr cosval) +// svgenfloat sincos (svgenfloat x, genfloatptr cosval) template detail::enable_if_t< - detail::is_genfloat::value && detail::is_genfloatptr::value, T> + detail::is_svgenfloat::value && detail::is_genfloatptr::value, T> sincos(T x, T2 cosval) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_sincos(x, cosval); } -// genfloat sinh (genfloat x) +// svgenfloat sinh (svgenfloat x) template -detail::enable_if_t::value, T> sinh(T x) __NOEXC { +detail::enable_if_t::value, T> sinh(T x) __NOEXC { return __sycl_std::__invoke_sinh(x); } -// genfloat sinpi (genfloat x) +// svgenfloat sinpi (svgenfloat x) template -detail::enable_if_t::value, T> sinpi(T x) __NOEXC { +detail::enable_if_t::value, T> sinpi(T x) __NOEXC { return __sycl_std::__invoke_sinpi(x); } -// genfloat sqrt (genfloat x) +// svgenfloat sqrt (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> sqrt(T x) __NOEXC { return __sycl_std::__invoke_sqrt(x); } -// genfloat tan (genfloat x) +// svgenfloat tan (svgenfloat x) template detail::enable_if_t<__FAST_MATH_GENFLOAT(T), T> tan(T x) __NOEXC { return __sycl_std::__invoke_tan(x); } -// genfloat tanh (genfloat x) +// svgenfloat tanh (svgenfloat x) template -detail::enable_if_t::value, T> tanh(T x) __NOEXC { +detail::enable_if_t::value, T> tanh(T x) __NOEXC { return __sycl_std::__invoke_tanh(x); } -// genfloat tanpi (genfloat x) +// svgenfloat tanpi (svgenfloat x) template -detail::enable_if_t::value, T> tanpi(T x) __NOEXC { +detail::enable_if_t::value, T> tanpi(T x) __NOEXC { return __sycl_std::__invoke_tanpi(x); } -// genfloat tgamma (genfloat x) +// svgenfloat tgamma (svgenfloat x) template -detail::enable_if_t::value, T> tgamma(T x) __NOEXC { +detail::enable_if_t::value, T> tgamma(T x) __NOEXC { return __sycl_std::__invoke_tgamma(x); } -// genfloat trunc (genfloat x) +// svgenfloat trunc (svgenfloat x) template -detail::enable_if_t::value, T> trunc(T x) __NOEXC { +detail::enable_if_t::value, T> trunc(T x) __NOEXC { return __sycl_std::__invoke_trunc(x); } /* --------------- 4.13.5 Common functions. ---------------------------------*/ -// genfloat clamp (genfloat x, genfloat minval, genfloat maxval) +// svgenfloat clamp (svgenfloat x, svgenfloat minval, svgenfloat maxval) template -detail::enable_if_t::value, T> clamp(T x, T minval, - T maxval) __NOEXC { +detail::enable_if_t::value, T> +clamp(T x, T minval, T maxval) __NOEXC { return __sycl_std::__invoke_fclamp(x, minval, maxval); } -// genfloath clamp (genfloath x, half minval, half maxval) -// genfloatf clamp (genfloatf x, float minval, float maxval) -// genfloatd clamp (genfloatd x, double minval, double maxval) +// vgenfloath clamp (vgenfloath x, half minval, half maxval) +// vgenfloatf clamp (vgenfloatf x, float minval, float maxval) +// vgenfloatd clamp (vgenfloatd x, double minval, double maxval) template detail::enable_if_t::value, T> clamp(T x, typename T::element_type minval, @@ -518,98 +684,98 @@ clamp(T x, typename T::element_type minval, return __sycl_std::__invoke_fclamp(x, T(minval), T(maxval)); } -// genfloat degrees (genfloat radians) +// svgenfloat degrees (svgenfloat radians) template -detail::enable_if_t::value, T> +detail::enable_if_t::value, T> degrees(T radians) __NOEXC { return __sycl_std::__invoke_degrees(radians); } -// genfloat abs (genfloat x) +// svgenfloat abs (svgenfloat x) template -detail::enable_if_t::value, T> abs(T x) __NOEXC { +detail::enable_if_t::value, T> abs(T x) __NOEXC { return __sycl_std::__invoke_fabs(x); } -// genfloat max (genfloat x, genfloat y) +// svgenfloat max (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T>(max)(T x, T y) __NOEXC { +detail::enable_if_t::value, T>(max)(T x, T y) __NOEXC { return __sycl_std::__invoke_fmax_common(x, y); } -// genfloatf max (genfloatf x, float y) -// genfloatd max (genfloatd x, double y) -// genfloath max (genfloath x, half y) +// vgenfloatf max (vgenfloatf x, float y) +// vgenfloatd max (vgenfloatd x, double y) +// vgenfloath max (vgenfloath x, half y) template detail::enable_if_t::value, T>(max)( T x, typename T::element_type y) __NOEXC { return __sycl_std::__invoke_fmax_common(x, T(y)); } -// genfloat min (genfloat x, genfloat y) +// svgenfloat min (svgenfloat x, svgenfloat y) template -detail::enable_if_t::value, T>(min)(T x, T y) __NOEXC { +detail::enable_if_t::value, T>(min)(T x, T y) __NOEXC { return __sycl_std::__invoke_fmin_common(x, y); } -// genfloatf min (genfloatf x, float y) -// genfloatd min (genfloatd x, double y) -// genfloath min (genfloath x, half y) +// vgenfloatf min (vgenfloatf x, float y) +// vgenfloatd min (vgenfloatd x, double y) +// vgenfloath min (vgenfloath x, half y) template detail::enable_if_t::value, T>(min)( T x, typename T::element_type y) __NOEXC { return __sycl_std::__invoke_fmin_common(x, T(y)); } -// genfloat mix (genfloat x, genfloat y, genfloat a) +// svgenfloat mix (svgenfloat x, svgenfloat y, svgenfloat a) template -detail::enable_if_t::value, T> mix(T x, T y, - T a) __NOEXC { +detail::enable_if_t::value, T> mix(T x, T y, + T a) __NOEXC { return __sycl_std::__invoke_mix(x, y, a); } -// genfloatf mix (genfloatf x, genfloatf y, float a) -// genfloatd mix (genfloatd x, genfloatd y, double a) -// genfloatd mix (genfloath x, genfloath y, half a) +// vgenfloatf mix (vgenfloatf x, vgenfloatf y, float a) +// vgenfloatd mix (vgenfloatd x, vgenfloatd y, double a) +// vgenfloatd mix (vgenfloath x, vgenfloath y, half a) template detail::enable_if_t::value, T> mix(T x, T y, typename T::element_type a) __NOEXC { return __sycl_std::__invoke_mix(x, y, T(a)); } -// genfloat radians (genfloat degrees) +// svgenfloat radians (svgenfloat degrees) template -detail::enable_if_t::value, T> +detail::enable_if_t::value, T> radians(T degrees) __NOEXC { return __sycl_std::__invoke_radians(degrees); } -// genfloat step (genfloat edge, genfloat x) +// svgenfloat step (svgenfloat edge, svgenfloat x) template -detail::enable_if_t::value, T> step(T edge, - T x) __NOEXC { +detail::enable_if_t::value, T> step(T edge, + T x) __NOEXC { return __sycl_std::__invoke_step(edge, x); } -// genfloatf step (float edge, genfloatf x) -// genfloatd step (double edge, genfloatd x) -// genfloatd step (half edge, genfloath x) +// vgenfloatf step (float edge, vgenfloatf x) +// vgenfloatd step (double edge, vgenfloatd x) +// vgenfloatd step (half edge, vgenfloath x) template detail::enable_if_t::value, T> step(typename T::element_type edge, T x) __NOEXC { return __sycl_std::__invoke_step(T(edge), x); } -// genfloat smoothstep (genfloat edge0, genfloat edge1, genfloat x) +// svgenfloat smoothstep (svgenfloat edge0, svgenfloat edge1, svgenfloat x) template -detail::enable_if_t::value, T> +detail::enable_if_t::value, T> smoothstep(T edge0, T edge1, T x) __NOEXC { return __sycl_std::__invoke_smoothstep(edge0, edge1, x); } -// genfloatf smoothstep (float edge0, float edge1, genfloatf x) -// genfloatd smoothstep (double edge0, double edge1, genfloatd x) -// genfloath smoothstep (half edge0, half edge1, genfloath x) +// vgenfloatf smoothstep (float edge0, float edge1, vgenfloatf x) +// vgenfloatd smoothstep (double edge0, double edge1, vgenfloatd x) +// vgenfloath smoothstep (half edge0, half edge1, vgenfloath x) template detail::enable_if_t::value, T> smoothstep(typename T::element_type edge0, typename T::element_type edge1, @@ -617,9 +783,9 @@ smoothstep(typename T::element_type edge0, typename T::element_type edge1, return __sycl_std::__invoke_smoothstep(T(edge0), T(edge1), x); } -// genfloat sign (genfloat x) +// svgenfloat sign (svgenfloat x) template -detail::enable_if_t::value, T> sign(T x) __NOEXC { +detail::enable_if_t::value, T> sign(T x) __NOEXC { return __sycl_std::__invoke_sign(x); } @@ -1113,21 +1279,21 @@ fast_normalize(T p) __NOEXC { /* SYCL 2020 ---- 4.17.9 Relational functions. -----------------------------*/ template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isequal(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdEqual>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isnotequal(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FUnordNotEqual>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isgreater(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdGreaterThan>(x, @@ -1135,7 +1301,7 @@ detail::common_rel_ret_t isgreater(T x, T y) __NOEXC { } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isgreaterequal(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdGreaterThanEqual>( @@ -1143,14 +1309,14 @@ detail::common_rel_ret_t isgreaterequal(T x, T y) __NOEXC { } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isless(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdLessThan>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t islessequal(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdLessThanEqual>(x, @@ -1158,56 +1324,56 @@ detail::common_rel_ret_t islessequal(T x, T y) __NOEXC { } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t islessgreater(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_FOrdNotEqual>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isfinite(T x) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_IsFinite>(x)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isinf(T x) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_IsInf>(x)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isnan(T x) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_IsNan>(x)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isnormal(T x) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_IsNormal>(x)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isordered(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_Ordered>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t isunordered(T x, T y) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_Unordered>(x, y)); } template ::value, T>> + typename = detail::enable_if_t::value, T>> detail::common_rel_ret_t signbit(T x) __NOEXC { return detail::RelConverter::apply( __sycl_std::__invoke_SignBitSet>(x)); @@ -1285,55 +1451,59 @@ select(T a, T b, T2 c) __NOEXC { return __sycl_std::__invoke_select(a, b, c); } -// genfloatf select (genfloatf a, genfloatf b, genint c) +// svgenfloatf select (svgenfloatf a, svgenfloatf b, genint c) template detail::enable_if_t< - detail::is_genfloatf::value && detail::is_genint::value, T> + detail::is_svgenfloatf::value && detail::is_genint::value, T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); } -// genfloatf select (genfloatf a, genfloatf b, ugenint c) +// svgenfloatf select (svgenfloatf a, svgenfloatf b, ugenint c) template detail::enable_if_t< - detail::is_genfloatf::value && detail::is_ugenint::value, T> + detail::is_svgenfloatf::value && detail::is_ugenint::value, T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); } -// genfloatd select (genfloatd a, genfloatd b, igeninteger64 c) +// svgenfloatd select (svgenfloatd a, svgenfloatd b, igeninteger64 c) template -detail::enable_if_t< - detail::is_genfloatd::value && detail::is_igeninteger64bit::value, T> +detail::enable_if_t::value && + detail::is_igeninteger64bit::value, + T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); } -// genfloatd select (genfloatd a, genfloatd b, ugeninteger64 c) +// svgenfloatd select (svgenfloatd a, svgenfloatd b, ugeninteger64 c) template -detail::enable_if_t< - detail::is_genfloatd::value && detail::is_ugeninteger64bit::value, T> +detail::enable_if_t::value && + detail::is_ugeninteger64bit::value, + T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); } -// genfloath select (genfloath a, genfloath b, igeninteger16 c) +// svgenfloath select (svgenfloath a, svgenfloath b, igeninteger16 c) template -detail::enable_if_t< - detail::is_genfloath::value && detail::is_igeninteger16bit::value, T> +detail::enable_if_t::value && + detail::is_igeninteger16bit::value, + T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); } -// genfloath select (genfloath a, genfloath b, ugeninteger16 c) +// svgenfloath select (svgenfloath a, svgenfloath b, ugeninteger16 c) template -detail::enable_if_t< - detail::is_genfloath::value && detail::is_ugeninteger16bit::value, T> +detail::enable_if_t::value && + detail::is_ugeninteger16bit::value, + T> select(T a, T b, T2 c) __NOEXC { detail::check_vector_size(); return __sycl_std::__invoke_select(a, b, c); @@ -1341,176 +1511,284 @@ select(T a, T b, T2 c) __NOEXC { namespace native { /* ----------------- 4.13.3 Math functions. ---------------------------------*/ -// genfloatf cos (genfloatf x) -template -detail::enable_if_t::value, T> cos(T x) __NOEXC { + +#define __SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE marray NAME(marray x) \ + __NOEXC { \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_native_##NAME>( \ + detail::to_vec2(x, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = __sycl_std::__invoke_native_##NAME(x[N - 1]); \ + } \ + return res; \ + } + +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(sin) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(cos) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(tan) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(exp) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(exp2) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(exp10) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(log) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(log2) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(log10) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(sqrt) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(rsqrt) +__SYCL_NATIVE_MATH_FUNCTION_OVERLOAD(recip) + +#undef __SYCL_NATIVE_MATH_FUNCTION_OVERLOAD + +#define __SYCL_NATIVE_MATH_FUNCTION_2_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE marray NAME( \ + marray x, marray y) __NOEXC { \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_native_##NAME>( \ + detail::to_vec2(x, i * 2), detail::to_vec2(y, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = \ + __sycl_std::__invoke_native_##NAME(x[N - 1], y[N - 1]); \ + } \ + return res; \ + } + +__SYCL_NATIVE_MATH_FUNCTION_2_OVERLOAD(divide) +__SYCL_NATIVE_MATH_FUNCTION_2_OVERLOAD(powr) + +#undef __SYCL_NATIVE_MATH_FUNCTION_2_OVERLOAD + +// svgenfloatf cos (svgenfloatf x) +template +detail::enable_if_t::value, T> cos(T x) __NOEXC { return __sycl_std::__invoke_native_cos(x); } -// genfloatf divide (genfloatf x, genfloatf y) +// svgenfloatf divide (svgenfloatf x, svgenfloatf y) template -detail::enable_if_t::value, T> divide(T x, - T y) __NOEXC { +detail::enable_if_t::value, T> divide(T x, + T y) __NOEXC { return __sycl_std::__invoke_native_divide(x, y); } -// genfloatf exp (genfloatf x) +// svgenfloatf exp (svgenfloatf x) template -detail::enable_if_t::value, T> exp(T x) __NOEXC { +detail::enable_if_t::value, T> exp(T x) __NOEXC { return __sycl_std::__invoke_native_exp(x); } -// genfloatf exp2 (genfloatf x) +// svgenfloatf exp2 (svgenfloatf x) template -detail::enable_if_t::value, T> exp2(T x) __NOEXC { +detail::enable_if_t::value, T> exp2(T x) __NOEXC { return __sycl_std::__invoke_native_exp2(x); } -// genfloatf exp10 (genfloatf x) +// svgenfloatf exp10 (svgenfloatf x) template -detail::enable_if_t::value, T> exp10(T x) __NOEXC { +detail::enable_if_t::value, T> exp10(T x) __NOEXC { return __sycl_std::__invoke_native_exp10(x); } -// genfloatf log (genfloatf x) +// svgenfloatf log (svgenfloatf x) template -detail::enable_if_t::value, T> log(T x) __NOEXC { +detail::enable_if_t::value, T> log(T x) __NOEXC { return __sycl_std::__invoke_native_log(x); } -// genfloatf log2 (genfloatf x) +// svgenfloatf log2 (svgenfloatf x) template -detail::enable_if_t::value, T> log2(T x) __NOEXC { +detail::enable_if_t::value, T> log2(T x) __NOEXC { return __sycl_std::__invoke_native_log2(x); } -// genfloatf log10 (genfloatf x) +// svgenfloatf log10 (svgenfloatf x) template -detail::enable_if_t::value, T> log10(T x) __NOEXC { +detail::enable_if_t::value, T> log10(T x) __NOEXC { return __sycl_std::__invoke_native_log10(x); } -// genfloatf powr (genfloatf x, genfloatf y) +// svgenfloatf powr (svgenfloatf x, svgenfloatf y) template -detail::enable_if_t::value, T> powr(T x, T y) __NOEXC { +detail::enable_if_t::value, T> powr(T x, + T y) __NOEXC { return __sycl_std::__invoke_native_powr(x, y); } -// genfloatf recip (genfloatf x) +// svgenfloatf recip (svgenfloatf x) template -detail::enable_if_t::value, T> recip(T x) __NOEXC { +detail::enable_if_t::value, T> recip(T x) __NOEXC { return __sycl_std::__invoke_native_recip(x); } -// genfloatf rsqrt (genfloatf x) +// svgenfloatf rsqrt (svgenfloatf x) template -detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { +detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { return __sycl_std::__invoke_native_rsqrt(x); } -// genfloatf sin (genfloatf x) +// svgenfloatf sin (svgenfloatf x) template -detail::enable_if_t::value, T> sin(T x) __NOEXC { +detail::enable_if_t::value, T> sin(T x) __NOEXC { return __sycl_std::__invoke_native_sin(x); } -// genfloatf sqrt (genfloatf x) +// svgenfloatf sqrt (svgenfloatf x) template -detail::enable_if_t::value, T> sqrt(T x) __NOEXC { +detail::enable_if_t::value, T> sqrt(T x) __NOEXC { return __sycl_std::__invoke_native_sqrt(x); } -// genfloatf tan (genfloatf x) +// svgenfloatf tan (svgenfloatf x) template -detail::enable_if_t::value, T> tan(T x) __NOEXC { +detail::enable_if_t::value, T> tan(T x) __NOEXC { return __sycl_std::__invoke_native_tan(x); } } // namespace native namespace half_precision { /* ----------------- 4.13.3 Math functions. ---------------------------------*/ -// genfloatf cos (genfloatf x) -template -detail::enable_if_t::value, T> cos(T x) __NOEXC { +#define __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE marray NAME(marray x) \ + __NOEXC { \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_half_##NAME>( \ + detail::to_vec2(x, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = __sycl_std::__invoke_half_##NAME(x[N - 1]); \ + } \ + return res; \ + } + +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(sin) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(cos) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(tan) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(exp2) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log2) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(log10) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(sqrt) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(rsqrt) +__SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD(recip) + +#undef __SYCL_HALF_PRECISION_MATH_FUNCTION_OVERLOAD + +#define __SYCL_HALF_PRECISION_MATH_FUNCTION_2_OVERLOAD(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE marray NAME( \ + marray x, marray y) __NOEXC { \ + marray res; \ + for (size_t i = 0; i < N / 2; i++) { \ + auto partial_res = __sycl_std::__invoke_half_##NAME>( \ + detail::to_vec2(x, i * 2), detail::to_vec2(y, i * 2)); \ + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); \ + } \ + if (N % 2) { \ + res[N - 1] = \ + __sycl_std::__invoke_half_##NAME(x[N - 1], y[N - 1]); \ + } \ + return res; \ + } + +__SYCL_HALF_PRECISION_MATH_FUNCTION_2_OVERLOAD(divide) +__SYCL_HALF_PRECISION_MATH_FUNCTION_2_OVERLOAD(powr) + +#undef __SYCL_HALF_PRECISION_MATH_FUNCTION_2_OVERLOAD + +// svgenfloatf cos (svgenfloatf x) +template +detail::enable_if_t::value, T> cos(T x) __NOEXC { return __sycl_std::__invoke_half_cos(x); } -// genfloatf divide (genfloatf x, genfloatf y) +// svgenfloatf divide (svgenfloatf x, svgenfloatf y) template -detail::enable_if_t::value, T> divide(T x, - T y) __NOEXC { +detail::enable_if_t::value, T> divide(T x, + T y) __NOEXC { return __sycl_std::__invoke_half_divide(x, y); } -// genfloatf exp (genfloatf x) +// svgenfloatf exp (svgenfloatf x) template -detail::enable_if_t::value, T> exp(T x) __NOEXC { +detail::enable_if_t::value, T> exp(T x) __NOEXC { return __sycl_std::__invoke_half_exp(x); } -// genfloatf exp2 (genfloatf x) +// svgenfloatf exp2 (svgenfloatf x) template -detail::enable_if_t::value, T> exp2(T x) __NOEXC { +detail::enable_if_t::value, T> exp2(T x) __NOEXC { return __sycl_std::__invoke_half_exp2(x); } -// genfloatf exp10 (genfloatf x) +// svgenfloatf exp10 (svgenfloatf x) template -detail::enable_if_t::value, T> exp10(T x) __NOEXC { +detail::enable_if_t::value, T> exp10(T x) __NOEXC { return __sycl_std::__invoke_half_exp10(x); } -// genfloatf log (genfloatf x) +// svgenfloatf log (svgenfloatf x) template -detail::enable_if_t::value, T> log(T x) __NOEXC { +detail::enable_if_t::value, T> log(T x) __NOEXC { return __sycl_std::__invoke_half_log(x); } -// genfloatf log2 (genfloatf x) +// svgenfloatf log2 (svgenfloatf x) template -detail::enable_if_t::value, T> log2(T x) __NOEXC { +detail::enable_if_t::value, T> log2(T x) __NOEXC { return __sycl_std::__invoke_half_log2(x); } -// genfloatf log10 (genfloatf x) +// svgenfloatf log10 (svgenfloatf x) template -detail::enable_if_t::value, T> log10(T x) __NOEXC { +detail::enable_if_t::value, T> log10(T x) __NOEXC { return __sycl_std::__invoke_half_log10(x); } -// genfloatf powr (genfloatf x, genfloatf y) +// svgenfloatf powr (svgenfloatf x, svgenfloatf y) template -detail::enable_if_t::value, T> powr(T x, T y) __NOEXC { +detail::enable_if_t::value, T> powr(T x, + T y) __NOEXC { return __sycl_std::__invoke_half_powr(x, y); } -// genfloatf recip (genfloatf x) +// svgenfloatf recip (svgenfloatf x) template -detail::enable_if_t::value, T> recip(T x) __NOEXC { +detail::enable_if_t::value, T> recip(T x) __NOEXC { return __sycl_std::__invoke_half_recip(x); } -// genfloatf rsqrt (genfloatf x) +// svgenfloatf rsqrt (svgenfloatf x) template -detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { +detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { return __sycl_std::__invoke_half_rsqrt(x); } -// genfloatf sin (genfloatf x) +// svgenfloatf sin (svgenfloatf x) template -detail::enable_if_t::value, T> sin(T x) __NOEXC { +detail::enable_if_t::value, T> sin(T x) __NOEXC { return __sycl_std::__invoke_half_sin(x); } -// genfloatf sqrt (genfloatf x) +// svgenfloatf sqrt (svgenfloatf x) template -detail::enable_if_t::value, T> sqrt(T x) __NOEXC { +detail::enable_if_t::value, T> sqrt(T x) __NOEXC { return __sycl_std::__invoke_half_sqrt(x); } -// genfloatf tan (genfloatf x) +// svgenfloatf tan (svgenfloatf x) template -detail::enable_if_t::value, T> tan(T x) __NOEXC { +detail::enable_if_t::value, T> tan(T x) __NOEXC { return __sycl_std::__invoke_half_tan(x); } @@ -1518,75 +1796,105 @@ detail::enable_if_t::value, T> tan(T x) __NOEXC { #ifdef __FAST_MATH__ /* ----------------- -ffast-math functions. ---------------------------------*/ -// genfloatf cos (genfloatf x) + +#define __SYCL_MATH_FUNCTION_OVERLOAD_FM(NAME) \ + template \ + inline __SYCL_ALWAYS_INLINE \ + std::enable_if_t, marray> \ + NAME(marray x) __NOEXC { \ + return native::NAME(x); \ + } + +__SYCL_MATH_FUNCTION_OVERLOAD_FM(sin) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(cos) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(tan) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp2) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(exp10) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log2) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(log10) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(sqrt) +__SYCL_MATH_FUNCTION_OVERLOAD_FM(rsqrt) +#undef __SYCL_MATH_FUNCTION_OVERLOAD_FM + +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t, marray> + powr(marray x, marray y) __NOEXC { + return native::powr(x, y); +} + +// svgenfloatf cos (svgenfloatf x) template -detail::enable_if_t::value, T> cos(T x) __NOEXC { +detail::enable_if_t::value, T> cos(T x) __NOEXC { return native::cos(x); } -// genfloatf exp (genfloatf x) +// svgenfloatf exp (svgenfloatf x) template -detail::enable_if_t::value, T> exp(T x) __NOEXC { +detail::enable_if_t::value, T> exp(T x) __NOEXC { return native::exp(x); } -// genfloatf exp2 (genfloatf x) +// svgenfloatf exp2 (svgenfloatf x) template -detail::enable_if_t::value, T> exp2(T x) __NOEXC { +detail::enable_if_t::value, T> exp2(T x) __NOEXC { return native::exp2(x); } -// genfloatf exp10 (genfloatf x) +// svgenfloatf exp10 (svgenfloatf x) template -detail::enable_if_t::value, T> exp10(T x) __NOEXC { +detail::enable_if_t::value, T> exp10(T x) __NOEXC { return native::exp10(x); } -// genfloatf log(genfloatf x) +// svgenfloatf log(svgenfloatf x) template -detail::enable_if_t::value, T> log(T x) __NOEXC { +detail::enable_if_t::value, T> log(T x) __NOEXC { return native::log(x); } -// genfloatf log2 (genfloatf x) +// svgenfloatf log2 (svgenfloatf x) template -detail::enable_if_t::value, T> log2(T x) __NOEXC { +detail::enable_if_t::value, T> log2(T x) __NOEXC { return native::log2(x); } -// genfloatf log10 (genfloatf x) +// svgenfloatf log10 (svgenfloatf x) template -detail::enable_if_t::value, T> log10(T x) __NOEXC { +detail::enable_if_t::value, T> log10(T x) __NOEXC { return native::log10(x); } -// genfloatf powr (genfloatf x) +// svgenfloatf powr (svgenfloatf x) template -detail::enable_if_t::value, T> powr(T x, T y) __NOEXC { +detail::enable_if_t::value, T> powr(T x, + T y) __NOEXC { return native::powr(x, y); } -// genfloatf rsqrt (genfloatf x) +// svgenfloatf rsqrt (svgenfloatf x) template -detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { +detail::enable_if_t::value, T> rsqrt(T x) __NOEXC { return native::rsqrt(x); } -// genfloatf sin (genfloatf x) +// svgenfloatf sin (svgenfloatf x) template -detail::enable_if_t::value, T> sin(T x) __NOEXC { +detail::enable_if_t::value, T> sin(T x) __NOEXC { return native::sin(x); } -// genfloatf sqrt (genfloatf x) +// svgenfloatf sqrt (svgenfloatf x) template -detail::enable_if_t::value, T> sqrt(T x) __NOEXC { +detail::enable_if_t::value, T> sqrt(T x) __NOEXC { return native::sqrt(x); } -// genfloatf tan (genfloatf x) +// svgenfloatf tan (svgenfloatf x) template -detail::enable_if_t::value, T> tan(T x) __NOEXC { +detail::enable_if_t::value, T> tan(T x) __NOEXC { return native::tan(x); } diff --git a/sycl/include/sycl/detail/generic_type_lists.hpp b/sycl/include/sycl/detail/generic_type_lists.hpp index b833192405a31..558dddf4963ca 100644 --- a/sycl/include/sycl/detail/generic_type_lists.hpp +++ b/sycl/include/sycl/detail/generic_type_lists.hpp @@ -45,6 +45,8 @@ using marray_half_list = type_list, marray, marray, marray, marray, marray>; +using scalar_vector_half_list = type_list; + using half_list = type_list; @@ -58,6 +60,9 @@ using marray_float_list = type_list, marray, marray, marray, marray, marray>; +using scalar_vector_float_list = + type_list; + using float_list = type_list; @@ -71,6 +76,9 @@ using marray_double_list = type_list, marray, marray, marray, marray, marray>; +using scalar_vector_double_list = + type_list; + using double_list = type_list; @@ -83,6 +91,9 @@ using vector_floating_list = using marray_floating_list = type_list; +using scalar_vector_floating_list = + type_list; + using floating_list = type_list; diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp index b81d7f49393ad..0bf929978e0b2 100644 --- a/sycl/include/sycl/detail/generic_type_traits.hpp +++ b/sycl/include/sycl/detail/generic_type_traits.hpp @@ -26,15 +26,24 @@ template using is_floatn = is_contained; template using is_genfloatf = is_contained; +template +using is_svgenfloatf = is_contained; + template using is_doublen = is_contained; template using is_genfloatd = is_contained; +template +using is_svgenfloatd = is_contained; + template using is_halfn = is_contained; template using is_genfloath = is_contained; +template +using is_svgenfloath = is_contained; + template using is_genfloat = is_contained; template @@ -43,6 +52,9 @@ using is_sgenfloat = is_contained; template using is_vgenfloat = is_contained; +template +using is_svgenfloat = is_contained; + template using is_gengeofloat = is_contained; diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp index 8bce9d045eb59..f6d0039780153 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp @@ -39,8 +39,9 @@ std::enable_if_t::value, T> fabs(T x) { return oneapi::detail::bitsToBfloat16(__clc_fabs(XBits)); #else std::ignore = x; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -62,8 +63,9 @@ sycl::marray fabs(sycl::marray x) { return res; #else std::ignore = x; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -76,8 +78,9 @@ std::enable_if_t::value, T> fmin(T x, T y) { #else std::ignore = x; std::ignore = y; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -105,8 +108,9 @@ sycl::marray fmin(sycl::marray x, #else std::ignore = x; std::ignore = y; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -119,8 +123,9 @@ std::enable_if_t::value, T> fmax(T x, T y) { #else std::ignore = x; std::ignore = y; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -147,8 +152,9 @@ sycl::marray fmax(sycl::marray x, #else std::ignore = x; std::ignore = y; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -163,8 +169,9 @@ std::enable_if_t::value, T> fma(T x, T y, T z) { std::ignore = x; std::ignore = y; std::ignore = z; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } @@ -196,8 +203,9 @@ sycl::marray fma(sycl::marray x, std::ignore = x; std::ignore = y; std::ignore = z; - throw runtime_error("bfloat16 is not currently supported on the host device.", - PI_ERROR_INVALID_DEVICE); + throw runtime_error( + "bfloat16 math functions are not currently supported on the host device.", + PI_ERROR_INVALID_DEVICE); #endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) } diff --git a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp index 7a7105bf5519a..479ca9032d5f1 100755 --- a/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/builtins.hpp @@ -85,10 +85,12 @@ int printf(const FormatT *__format, Args... args) { namespace native { // genfloatfh tanh (genfloatfh x) +// sycl::native::tanh is only implemented on nvptx backend so far. For other +// backends we revert to the sycl::tanh impl. template inline __SYCL_ALWAYS_INLINE - sycl::detail::enable_if_t::value || - sycl::detail::is_genfloath::value, + sycl::detail::enable_if_t::value || + sycl::detail::is_svgenfloath::value, T> tanh(T x) __NOEXC { #if defined(__NVPTX__) @@ -100,10 +102,45 @@ inline __SYCL_ALWAYS_INLINE #endif } +// The marray math function implementations use vectorizations of +// size two as a simple general optimization. A more complex implementation +// using larger vectorizations for large marray sizes is possible; however more +// testing is required in order to ascertain the performance implications for +// all backends. +// sycl::native::tanh is only implemented on nvptx backend so far. For other +// backends we revert to the sycl::tanh impl. +template +inline __SYCL_ALWAYS_INLINE + std::enable_if_t || std::is_same_v, + sycl::marray> + tanh(sycl::marray x) __NOEXC { + sycl::marray res; +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#define FUNC_VEC native::tanh +#define FUNC FUNC_VEC +#else +#define FUNC_VEC __sycl_std::__invoke_tanh> +#define FUNC __sycl_std::__invoke_tanh +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + + for (size_t i = 0; i < N / 2; i++) { + auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2)); + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); + } + if (N % 2) { + res[N - 1] = FUNC(x[N - 1]); + } +#undef FUNC_VEC +#undef FUNC + return res; +} + // genfloath exp2 (genfloath x) +// sycl::native::exp2 (using half) is only implemented on nvptx backend so far. +// For other backends we revert to the sycl::exp2 impl. template inline __SYCL_ALWAYS_INLINE - sycl::detail::enable_if_t::value, T> + sycl::detail::enable_if_t::value, T> exp2(T x) __NOEXC { #if defined(__NVPTX__) using _ocl_T = sycl::detail::ConvertToOpenCLType_t; @@ -114,6 +151,32 @@ inline __SYCL_ALWAYS_INLINE #endif } +// sycl::native::exp2 (using half) is only implemented on nvptx backend so far. +// For other backends we revert to the sycl::exp2 impl. +template +inline __SYCL_ALWAYS_INLINE sycl::marray +exp2(sycl::marray x) __NOEXC { + sycl::marray res; +#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) +#define FUNC_VEC native::exp2 +#define FUNC FUNC_VEC +#else +#define FUNC_VEC __sycl_std::__invoke_exp2> +#define FUNC __sycl_std::__invoke_exp2 +#endif // defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) + + for (size_t i = 0; i < N / 2; i++) { + auto partial_res = FUNC_VEC(sycl::detail::to_vec2(x, i * 2)); + std::memcpy(&res[i * 2], &partial_res, sizeof(vec)); + } + if (N % 2) { + res[N - 1] = FUNC(x[N - 1]); + } +#undef FUNC_VEC +#undef FUNC + return res; +} + } // namespace native } // namespace experimental diff --git a/sycl/include/sycl/marray.hpp b/sycl/include/sycl/marray.hpp index eca81ff509830..780966cedeca2 100644 --- a/sycl/include/sycl/marray.hpp +++ b/sycl/include/sycl/marray.hpp @@ -17,7 +17,7 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { -/// Provides a cross-patform math array class template that works on +/// Provides a cross-platform math array class template that works on /// SYCL devices as well as in host C++ code. /// /// \ingroup sycl_api diff --git a/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp new file mode 100644 index 0000000000000..d8ebab70315ff --- /dev/null +++ b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp @@ -0,0 +1,69 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda -S -Xclang -emit-llvm %s -o -| FileCheck %s + +#include + +using namespace sycl; + +int main() { + + queue q; + + float input[2]; + float res[13]; + { + buffer input_buff(&input[0], range<1>(2)); + buffer res_buff(&res[0], range<1>(13)); + q.submit([&](handler &cgh) { + accessor res_acc(res_buff, + cgh); + accessor input_acc( + input_buff, cgh); + cgh.single_task([=]() { + // CHECK: tail call float @llvm.nvvm.cos.approx.f + res_acc[0] = sycl::native::cos(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.sin.approx.f + res_acc[1] = sycl::native::sin(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.ex2.approx.f + res_acc[2] = sycl::native::exp2(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + res_acc[3] = sycl::native::log2(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.rsqrt.approx.f + res_acc[4] = sycl::native::rsqrt(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.sqrt.approx.f + res_acc[5] = sycl::native::sqrt(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.rcp.approx.f + res_acc[6] = sycl::native::recip(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.div.approx.f + res_acc[7] = sycl::native::divide(input_acc[0], input_acc[1]); + + // Functions that use the above builtins: + + // CHECK: tail call float @llvm.nvvm.sin.approx.f + // CHECK: tail call float @llvm.nvvm.cos.approx.f + // CHECK: tail call float @llvm.nvvm.div.approx.f + res_acc[8] = sycl::native::tan(input_acc[0]); + // CHECK: fmul float {{.*}}, 0x3FF7154760000000 + // CHECK: tail call float @llvm.nvvm.ex2.approx.f + res_acc[9] = sycl::native::exp(input_acc[0]); + // CHECK: fmul float {{.*}}, 0x400A934F00000000 + // CHECK: tail call float @llvm.nvvm.ex2.approx.f + res_acc[10] = sycl::native::exp10(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, 0x3FE62E4300000000 + res_acc[11] = sycl::native::log(input_acc[0]); + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, 0x3FD3441360000000 + res_acc[12] = sycl::native::log10(input_acc[0]); + + // CHECK: tail call float @llvm.nvvm.lg2.approx.f + // CHECK: fmul float {{.*}}, {{.*}} + // CHECK: tail call float @llvm.nvvm.ex2.approx.f + res_acc[13] = sycl::native::powr(input_acc[0], input_acc[1]); + }); + }); + } + + return 0; +};