diff --git a/include/graphene-config.h.meson b/include/graphene-config.h.meson index ba1f10c..41cea23 100644 --- a/include/graphene-config.h.meson +++ b/include/graphene-config.h.meson @@ -58,6 +58,7 @@ extern "C" { # if defined(GRAPHENE_USE_SSE) # include <xmmintrin.h> # include <emmintrin.h> +# # if defined(_M_IX86_FP) # if _M_IX86_FP >= 2 # define GRAPHENE_USE_SSE4_1 @@ -67,9 +68,18 @@ extern "C" { # elif defined(_MSC_VER) # define GRAPHENE_USE_SSE4_1 # endif +# +# if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define GRAPHENE_USE_AVX +# endif +# # if defined(GRAPHENE_USE_SSE4_1) # include <smmintrin.h> # endif +# +# if defined(GRAPHENE_USE_AVX) +# include <immintrin.h> +# endif typedef __m128 graphene_simd4f_t; # elif defined(GRAPHENE_USE_ARM_NEON) # if defined (_MSC_VER) && (_MSC_VER < 1920) && defined (_M_ARM64) diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h index 758343d..9029d51 100644 --- a/include/graphene-simd4f.h +++ b/include/graphene-simd4f.h @@ -179,6 +179,11 @@ graphene_simd4f_t graphene_simd4f_ceil (const graphene_simd4f_t GRAPHENE_AVAILABLE_IN_1_12 graphene_simd4f_t graphene_simd4f_floor (const graphene_simd4f_t s); +GRAPHENE_AVAILABLE_IN_1_0 +graphene_simd4f_t graphene_simd4f_madd (const graphene_simd4f_t a, + const graphene_simd4f_t b, + const graphene_simd4f_t c); + #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE) /* SSE2 implementation of SIMD 4f */ @@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union { })) # endif +# if defined(GRAPHENE_USE_AVX) +# define graphene_simd4f_madd(a,b,c) \ + (__extension__ ({ \ + (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \ + })) +# else +# define graphene_simd4f_madd(a,b,c) \ + (__extension__ ({ \ + (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \ + })) +# endif + /* On MSVC, we use static inlines */ # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */ @@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s) #endif } +#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c) + +static inline graphene_simd4f_t +_simd4f_madd (const 
graphene_simd4f_t a, + const graphene_simd4f_t b, + const graphene_simd4f_t c) +{ +#if defined(GRAPHENE_USE_AVX) + return _mm_fmadd_ps (a, b, c); +#else + return _mm_add_ps (_mm_mul_ps (a, b), c); +#endif +} + #else /* SSE intrinsics-not GCC or Visual Studio */ # error "Need GCC-compatible or Visual Studio compiler for SSE extensions." @@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16))); (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \ })) +# define graphene_simd4f_madd(a,b,c) \ + (__extension__ ({ \ + (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \ + })) + #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON) /* ARM Neon implementation of SIMD4f */ @@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t; (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \ })) +# define graphene_simd4f_madd(a,b,c) \ + (__extension__ ({ \ + (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \ + })) + #elif defined _MSC_VER /* Visual Studio ARM */ # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w) @@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s) return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); } +# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c) + +static inline graphene_simd4f_t +_simd4f_madd (const graphene_simd4f_t a, + const graphene_simd4f_t b, + const graphene_simd4f_t c) +{ + return graphene_simd4f_add (graphene_simd4f_mul (a, b), c); +} + #else /* ARM NEON intrinsics-not GCC or Visual Studio */ # error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions." 
@@ -1956,6 +2007,8 @@ _simd4f_floor (const graphene_simd4f_t s) (graphene_simd4f_ceil ((s))) #define graphene_simd4f_floor(s) \ (graphene_simd4f_floor ((s))) +#define graphene_simd4f_madd(a,b,c) \ + (graphene_simd4f_madd ((a), (b), (c))) #else # error "Unsupported simd4f implementation." @@ -1963,26 +2016,6 @@ _simd4f_floor (const graphene_simd4f_t s) /* Generic operations, inlined */ -/** - * graphene_simd4f_madd: - * @m1: a #graphene_simd4f_t - * @m2: a #graphene_simd4f_t - * @a: a #graphene_simd4f_t - * - * Adds @a to the product of @m1 and @m2. - * - * Returns: the result vector - * - * Since: 1.0 - */ -static inline graphene_simd4f_t -graphene_simd4f_madd (const graphene_simd4f_t m1, - const graphene_simd4f_t m2, - const graphene_simd4f_t a) -{ - return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a); -} - /** * graphene_simd4f_sum: * @v: a #graphene_simd4f_t diff --git a/src/graphene-simd4f.c b/src/graphene-simd4f.c index 00c545b..d9f7e99 100644 --- a/src/graphene-simd4f.c +++ b/src/graphene-simd4f.c @@ -1073,6 +1073,26 @@ graphene_simd4f_t return graphene_simd4f_floor (s); } +/** + * graphene_simd4f_madd: + * @a: a #graphene_simd4f_t + * @b: a #graphene_simd4f_t + * @c: a #graphene_simd4f_t + * + * Adds @c to the product of @a and @b. + * + * Returns: the result vector + * + * Since: 1.0 + */ +graphene_simd4f_t +(graphene_simd4f_madd) (const graphene_simd4f_t a, + const graphene_simd4f_t b, + const graphene_simd4f_t c) +{ + return graphene_simd4f_madd (a, b, c); +} + #else /* GRAPHENE_USE_SCALAR */ graphene_simd4f_t @@ -1516,4 +1536,12 @@ graphene_simd4f_t return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w)); } +graphene_simd4f_t +(graphene_simd4f_madd) (const graphene_simd4f_t a, + const graphene_simd4f_t b, + const graphene_simd4f_t c) +{ + return graphene_simd4f_add (graphene_simd4f_mul (a, b), c); +} + #endif /* GRAPHENE_USE_SCALAR */