Fix GCC vector alignment and aliasing issues

Issue #90 with the simple vector division: GCC warnings & test failures. This fix carries a performance penalty (an extra copy of the input vector).
ridiculousfish · Feb 11, 2022 · 99e859a · 99e859a
1 parent 93bc078
commit 99e859a
Show file tree

Hide file tree

Showing 2 changed files with 37 additions and 16 deletions.
diff --git a/libdivide.h b/libdivide.h
@@ -1682,15 +1682,25 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t
 
 // Simplest possible vector type division: treat the vector type as an array
 // of underlying native type.
+//
+// To access the individual vector elements we go through the type_pun_vec
+// union, which handles the alignment and aliasing issues.
+// This means an extra copy of the input vector instance :-(
+// But SIMPLE_VECTOR_DIVISION is for 16-bit only and 16-bit division on
+// any CPU with vector hardware is probably very fast already.
 #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
     const size_t count = sizeof(VecT) / sizeof(IntT); \
-    VecT result; \
-    IntT *pSource = (IntT *)&numers; \
-    IntT *pTarget = (IntT *)&result; \
+    union type_pun_vec { \
+        VecT vec; \
+        IntT arr[sizeof(VecT) / sizeof(IntT)]; \
+    }; \
+    union type_pun_vec result; \
+    union type_pun_vec input; \
+    input.vec = numers; \
     for (size_t loop=0; loop<count; ++loop) { \
-        pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
+        result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
     } \
-    return result; \
+    return result.vec;
 
 #if defined(LIBDIVIDE_NEON)
 

diff --git a/test/DivideTest.h b/test/DivideTest.h
@@ -1,5 +1,6 @@
 #pragma once
 
+#include <string.h> // memcpy
 #include "outputs.h"
 
 #if defined(__AVR__)
@@ -121,18 +122,28 @@ class DivideTest {
     }
 
     template <typename VecType, Branching ALGO>
-    void test_vec(const T *numers, size_t count, T denom, const divider<T, ALGO> &div) {
-        size_t size = sizeof(VecType) / sizeof(T);
-        size_t iters = (sizeof(T)*count)/sizeof(VecType);
-
-        for (size_t j = 0; j < iters; j++, numers += size) {
-            VecType x = *((const VecType *)numers);
-            VecType resultVector = x / div;
-            T *results = (T *)&resultVector;
-
-            for (size_t i = 0; i < size; i++) {
+    void test_vec(const T * numers, size_t count, T denom, const divider<T, ALGO> &div) {
+        // Number of T (e.g. int16_t) that will fit in one VecType (e.g. __m256i)
+        const size_t countTinVec = sizeof(VecType) / sizeof(T);
+
+        // To access the individual vector elements we go through this
+        // union, which handles the alignment and aliasing issues.
+        union type_pun_vec {
+            VecType vec = {};
+            T arr[countTinVec];
+        };
+
+        const size_t countVec = (sizeof(T)*count)/sizeof(VecType);
+        for (size_t j = 0; j < countVec; j++, numers += countTinVec) {
+            type_pun_vec vec_in;
+            memcpy(vec_in.arr, numers, sizeof(VecType));
+
+            type_pun_vec vec_result;
+            vec_result.vec = vec_in.vec / div;
+
+            for (size_t i = 0; i < countTinVec; i++) {
                 T numer = numers[i];
-                T result = results[i];
+                T result = vec_result.arr[i];
                 T expect = numer / denom;
 
                 if (result != expect) {