Skip to content

Commit

Permalink
Fix GCC vector alignment and aliasing issues
Browse files Browse the repository at this point in the history
Addresses issue #90, where the simple vector division produced GCC warnings and test failures.
The fix carries a performance penalty: an extra copy of the input vector.
  • Loading branch information
adbancroft committed Feb 11, 2022
1 parent 93bc078 commit 99e859a
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 16 deletions.
20 changes: 15 additions & 5 deletions libdivide.h
Original file line number Diff line number Diff line change
Expand Up @@ -1682,15 +1682,25 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t

// Simplest possible vector type division: treat the vector type as an array
// of underlying native type.
//
// To access the individual vector elements we go through the type_pun_vec
// union, which handles the alignment and aliasing issues.
// This costs an extra copy of the input vector instance :-(
// However, SIMPLE_VECTOR_DIVISION is used for 16-bit only, and 16-bit division
// on any CPU with vector hardware is probably very fast already.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
const size_t count = sizeof(VecT) / sizeof(IntT); \
VecT result; \
IntT *pSource = (IntT *)&numers; \
IntT *pTarget = (IntT *)&result; \
union type_pun_vec { \
VecT vec; \
IntT arr[sizeof(VecT) / sizeof(IntT)]; \
}; \
union type_pun_vec result; \
union type_pun_vec input; \
input.vec = numers; \
for (size_t loop=0; loop<count; ++loop) { \
pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
} \
return result; \
return result.vec;

#if defined(LIBDIVIDE_NEON)

Expand Down
33 changes: 22 additions & 11 deletions test/DivideTest.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <string.h> // memcpy
#include "outputs.h"

#if defined(__AVR__)
Expand Down Expand Up @@ -121,18 +122,28 @@ class DivideTest {
}

template <typename VecType, Branching ALGO>
void test_vec(const T *numers, size_t count, T denom, const divider<T, ALGO> &div) {
size_t size = sizeof(VecType) / sizeof(T);
size_t iters = (sizeof(T)*count)/sizeof(VecType);

for (size_t j = 0; j < iters; j++, numers += size) {
VecType x = *((const VecType *)numers);
VecType resultVector = x / div;
T *results = (T *)&resultVector;

for (size_t i = 0; i < size; i++) {
void test_vec(const T * numers, size_t count, T denom, const divider<T, ALGO> &div) {
// Number of T (e.g. int16_t) that will fit in one VecType (e.g. __m256i)
const size_t countTinVec = sizeof(VecType) / sizeof(T);

// To access the individual vector elements we go through this union,
// which handles the alignment and aliasing issues.
union type_pun_vec {
VecType vec = {};
T arr[countTinVec];
};

const size_t countVec = (sizeof(T)*count)/sizeof(VecType);
for (size_t j = 0; j < countVec; j++, numers += countTinVec) {
type_pun_vec vec_in;
memcpy(vec_in.arr, numers, sizeof(VecType));

type_pun_vec vec_result;
vec_result.vec = vec_in.vec / div;

for (size_t i = 0; i < countTinVec; i++) {
T numer = numers[i];
T result = results[i];
T result = vec_result.arr[i];
T expect = numer / denom;

if (result != expect) {
Expand Down

0 comments on commit 99e859a

Please sign in to comment.