Use madd() intrinsic if available

The FMA instruction set extension (a CPU feature separate from AVX, typically shipped alongside AVX2) provides the _mm_fmadd_ps() intrinsic, so we can use it when AVX (or an equivalent instruction set) is enabled while building Graphene. There is no functional difference in this commit if AVX is not available, except that we moved from a generic static inline implementation to a SIMD-specific one.
ebassi · Aug 12, 2024 · b185f55 · b185f55
1 parent df7fa97
commit b185f55
Show file tree

Hide file tree

Showing 2 changed files with 81 additions and 20 deletions.
diff --git a/include/graphene-simd4f.h b/include/graphene-simd4f.h
@@ -179,6 +179,11 @@ graphene_simd4f_t       graphene_simd4f_ceil            (const graphene_simd4f_t
 GRAPHENE_AVAILABLE_IN_1_12
 graphene_simd4f_t       graphene_simd4f_floor           (const graphene_simd4f_t s);
 
+GRAPHENE_AVAILABLE_IN_1_0
+graphene_simd4f_t       graphene_simd4f_madd            (const graphene_simd4f_t a,
+                                                         const graphene_simd4f_t b,
+                                                         const graphene_simd4f_t c);
+
 #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE)
 
 /* SSE2 implementation of SIMD 4f */
@@ -504,6 +509,18 @@ typedef GRAPHENE_ALIGN16 union {
   }))
 #  endif
 
+/* graphene_simd4f_madd(a,b,c): computes (a * b) + c on all four lanes.
+ *
+ * _mm_fmadd_ps() belongs to the FMA extension, which is a separate CPU
+ * feature from AVX: some AVX-capable processors do not implement it, and
+ * GCC-compatible compilers only accept the intrinsic when FMA code
+ * generation is enabled, in which case they define __FMA__. Require both
+ * the project's AVX switch and __FMA__ before using the fused form, and
+ * fall back to a separate multiply and add otherwise.
+ */
+#  if defined(GRAPHENE_USE_AVX) && defined(__FMA__)
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_fmadd_ps ((a), (b), (c)); \
+  }))
+#  else
+#   define graphene_simd4f_madd(a,b,c) \
+  (__extension__ ({ \
+    (graphene_simd4f_t) _mm_add_ps (_mm_mul_ps ((a), (b)), (c)); \
+  }))
+#  endif
+
 /* On MSVC, we use static inlines */
 # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */
 
@@ -835,6 +852,20 @@ _simd4f_floor (const graphene_simd4f_t s)
 #endif
 }
 
+#define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+/* Computes (a * b) + c on all four lanes.
+ *
+ * _mm_fmadd_ps() is an FMA-extension intrinsic, and FMA is a separate CPU
+ * feature from AVX. MSVC accepts the intrinsic whatever /arch is selected,
+ * so the fused path is only taken when the build also targets FMA-capable
+ * hardware: __FMA__ when the compiler defines it (e.g. clang-cl), or
+ * __AVX2__ on MSVC proper, where /arch:AVX2 also permits FMA code
+ * generation. Otherwise a separate multiply and add is used.
+ */
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+#if defined(GRAPHENE_USE_AVX) && \
+    (defined(__FMA__) || (defined(__AVX2__) && !defined(__clang__)))
+  return _mm_fmadd_ps (a, b, c);
+#else
+  return _mm_add_ps (_mm_mul_ps (a, b), c);
+#endif
+}
+
 #else /* SSE intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for SSE extensions."
@@ -1158,6 +1189,11 @@ typedef int graphene_simd4i_t __attribute__((vector_size (16)));
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) /* (a * b) + c, via graphene_simd4f_mul then graphene_simd4f_add */ \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON)
 
 /* ARM Neon implementation of SIMD4f */
@@ -1498,6 +1534,11 @@ typedef float32x2_t graphene_simd2f_t;
     (graphene_simd4f_t) graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w); \
   }))
 
+# define graphene_simd4f_madd(a,b,c) /* (a * b) + c, via graphene_simd4f_mul then graphene_simd4f_add */ \
+  (__extension__ ({ \
+    (graphene_simd4f_t) graphene_simd4f_add (graphene_simd4f_mul ((a), (b)), (c)); \
+  }))
+
 #elif defined _MSC_VER /* Visual Studio ARM */
 
 # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w)
@@ -1840,6 +1881,16 @@ _simd4f_floor (const graphene_simd4f_t s)
   return graphene_simd4f_init (__floor_x, __floor_y, __floor_z, __floor_w);
 }
 
+# define graphene_simd4f_madd(a,b,c) _simd4f_madd(a,b,c)
+
+/* Helper behind graphene_simd4f_madd(): returns (a * b) + c, computed
+ * as a multiplication followed by an addition.
+ */
+static inline graphene_simd4f_t
+_simd4f_madd (const graphene_simd4f_t a,
+              const graphene_simd4f_t b,
+              const graphene_simd4f_t c)
+{
+  const graphene_simd4f_t product = graphene_simd4f_mul (a, b);
+
+  return graphene_simd4f_add (product, c);
+}
+
 #else /* ARM NEON intrinsics-not GCC or Visual Studio */
 
 #  error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions."
@@ -1956,33 +2007,15 @@ _simd4f_floor (const graphene_simd4f_t s)
   (graphene_simd4f_ceil ((s)))
 #define graphene_simd4f_floor(s) \
   (graphene_simd4f_floor ((s)))
+#define graphene_simd4f_madd(a,b,c) /* calls the function of the same name; the macro is not re-expanded inside its own expansion */ \
+  (graphene_simd4f_madd ((a), (b), (c)))
 
 #else
 # error "Unsupported simd4f implementation."
 #endif
 
 /* Generic operations, inlined */
 
-/**
- * graphene_simd4f_madd:
- * @m1: a #graphene_simd4f_t
- * @m2: a #graphene_simd4f_t
- * @a: a #graphene_simd4f_t
- *
- * Adds @a to the product of @m1 and @m2.
- *
- * Returns: the result vector
- *
- * Since: 1.0
- */
-static inline graphene_simd4f_t
-graphene_simd4f_madd (const graphene_simd4f_t m1,
-                      const graphene_simd4f_t m2,
-                      const graphene_simd4f_t a)
-{
-  return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a);
-}
-
 /**
  * graphene_simd4f_sum:
  * @v: a #graphene_simd4f_t

diff --git a/src/graphene-simd4f.c b/src/graphene-simd4f.c
@@ -1073,6 +1073,26 @@ graphene_simd4f_t
   return graphene_simd4f_floor (s);
 }
 
+/**
+ * graphene_simd4f_madd:
+ * @a: a #graphene_simd4f_t, the first factor
+ * @b: a #graphene_simd4f_t, the second factor
+ * @c: a #graphene_simd4f_t, the addend
+ *
+ * Adds @c to the product of @a and @b.
+ *
+ * Depending on how Graphene was built, the multiplication and the
+ * addition may be performed as a single fused operation.
+ *
+ * Returns: the result vector
+ *
+ * Since: 1.0
+ */
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  return graphene_simd4f_madd (a, b, c); /* uses the header's macro; the parenthesized name above is not macro-expanded */
+}
+
 #else /* GRAPHENE_USE_SCALAR */
 
 graphene_simd4f_t
@@ -1516,4 +1536,12 @@ graphene_simd4f_t
   return graphene_simd4f_init (floorf (s.x), floorf (s.y), floorf (s.z), floorf (s.w));
 }
 
+/* Scalar fallback: returns (a * b) + c as a multiply followed by an add. */
+graphene_simd4f_t
+(graphene_simd4f_madd) (const graphene_simd4f_t a,
+                        const graphene_simd4f_t b,
+                        const graphene_simd4f_t c)
+{
+  const graphene_simd4f_t product = graphene_simd4f_mul (a, b);
+
+  return graphene_simd4f_add (product, c);
+}
+
 #endif /* GRAPHENE_USE_SCALAR */