From c35bc3d576b2c6a2010586d385ef435ecc273471 Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Mon, 9 Sep 2019 17:25:05 -0700
Subject: [PATCH 1/5] Fix Arm 64bit detection

__aarch64__ is the correct way, __arm64__ is for iOS.
But the memory model does not need any of these architecture defines to be fully relevant.
__LP64__ means Long Pointer 64 (ie 64bits), and __ILP32__ is Integer Long Pointer 32 (ie 32bits).
That's enough and avoids errors like __arch64__ (correct way is __aarch64__) but forgets __arm64__ (iOS).
---
 glm/detail/setup.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/glm/detail/setup.hpp b/glm/detail/setup.hpp
index cf020fdfa..3a5ce18b7 100644
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@@ -35,12 +35,12 @@
 ///////////////////////////////////////////////////////////////////////////////////
 // Build model
 
-#if defined(__arch64__) || defined(__LP64__) || defined(_M_X64) || defined(__ppc64__) || defined(__x86_64__)
+#if defined(__LP64__)
 #	define GLM_MODEL GLM_MODEL_64
-#elif defined(__i386__) || defined(__ppc__)
+#elif defined(__ILP32__)
 #	define GLM_MODEL GLM_MODEL_32
 #else
-#	define GLM_MODEL GLM_MODEL_32
+#	error "Architecture must be either 32 or 64-bits"
 #endif//
 
 #if !defined(GLM_MODEL) && GLM_COMPILER != 0

From e508cc604ff2aa29f59a01d64ee300f24945d874 Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Tue, 10 Sep 2019 10:55:59 -0700
Subject: [PATCH 2/5] More simd/Neon functions

---
Review notes (this area is ignored by git am):
- compute_length: the submitted hunk returned compute_dot(v, v), i.e. the
  squared length. It now takes the square root of that dot product.
  NOTE(review): assumes an unqualified sqrt(float) is visible at this point of
  func_geometric_simd.inl, as in GLM's scalar length path -- confirm.
- Template argument lists that were stripped from this copy of the patch
  (template<qualifier Q>, compute_dot<vec<4, float, Q>, float, true>) have been
  restored; verify against the branch before applying.
- The index line below still names the blob of the originally submitted file.

 glm/detail/func_geometric_simd.inl | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/glm/detail/func_geometric_simd.inl b/glm/detail/func_geometric_simd.inl
index e6c8d85f2..00d8665d1 100644
--- a/glm/detail/func_geometric_simd.inl
+++ b/glm/detail/func_geometric_simd.inl
@@ -96,4 +96,48 @@ namespace detail
 }//namespace detail
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm{
+namespace detail
+{
+	template<qualifier Q>
+	struct compute_length<4, float, Q, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& v)
+		{
+			return sqrt(compute_dot<vec<4, float, Q>, float, true>::call(v, v));
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_distance<4, float, Q, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& p0, vec<4, float, Q> const& p1)
+		{
+			return compute_length<4, float, Q, true>::call(p1 - p0);
+		}
+	};
+
+
+	template<qualifier Q>
+	struct compute_dot<vec<4, float, Q>, float, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& x, vec<4, float, Q> const& y)
+		{
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t v = vmulq_f32(x.data, y.data);
+			v = vpaddq_f32(v, v);
+			v = vpaddq_f32(v, v);
+			return vgetq_lane_f32(v, 0);
+#else // Armv7a with Neon
+			float32x4_t p = vmulq_f32(x.data, y.data);
+			float32x2_t v = vpadd_f32(vget_low_f32(p), vget_high_f32(p));
+			v = vpadd_f32(v, v);
+			return vget_lane_f32(v, 0);
+#endif
+		}
+	};
+}//namespace detail
+}//namespace glm
+
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

From 9b0a0535ce010fe9c61ca264ec5e5697356d93cd Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Tue, 10 Sep 2019 11:28:50 -0700
Subject: [PATCH 3/5] Fixing GLM_MODEL for Windows compilers targeting ARM

---
 glm/detail/setup.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/glm/detail/setup.hpp b/glm/detail/setup.hpp
index 3a5ce18b7..d6025aec8 100644
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@@ -35,12 +35,12 @@
 ///////////////////////////////////////////////////////////////////////////////////
 // Build model
 
-#if defined(__LP64__)
+#if defined(_M_ARM64) || defined(__LP64__) || defined(_M_X64) || defined(__ppc64__) || defined(__x86_64__)
 #	define GLM_MODEL GLM_MODEL_64
-#elif defined(__ILP32__)
+#elif defined(__i386__) || defined(__ppc__) || defined(__ILP32__) || defined(_M_ARM)
 #	define GLM_MODEL GLM_MODEL_32
 #else
-#	error "Architecture must be either 32 or 64-bits"
+#	define GLM_MODEL GLM_MODEL_32
 #endif//
 
 #if !defined(GLM_MODEL) && GLM_COMPILER != 0

From 7b06a984af72e03b3e1af576b7e1b2942e98a808 Mon Sep 17 00:00:00 2001
From: Sebastian Buchwald
Date: Wed, 11 Sep 2019 19:12:39 +0200
Subject: [PATCH 4/5] Let test return the number of errors

---
 test/gtx/gtx_fast_square_root.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/gtx/gtx_fast_square_root.cpp b/test/gtx/gtx_fast_square_root.cpp
index e41a0a085..80d7fe4b7 100644
--- a/test/gtx/gtx_fast_square_root.cpp
+++ b/test/gtx/gtx_fast_square_root.cpp
@@ -14,7 +14,7 @@ int test_fastInverseSqrt()
 	Error += glm::all(glm::epsilonEqual(glm::fastInverseSqrt(glm::dvec3(1.0)), glm::dvec3(1.0), 0.01)) ? 0 : 1;
 	Error += glm::all(glm::epsilonEqual(glm::fastInverseSqrt(glm::dvec4(1.0)), glm::dvec4(1.0), 0.01)) ? 0 : 1;
 
-	return 0;
+	return Error;
 }
 
 int test_fastDistance()

From 88a7aee27b6d324c3b6cc3e0fa1b49c15f94550e Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour
Date: Tue, 10 Sep 2019 10:55:59 -0700
Subject: [PATCH 5/5] More simd/Neon functions

---
Review notes (this area is ignored by git am):
- compute_length: same fix as in patch 2/5 -- the square root of the dot
  product is returned instead of the squared length.
  NOTE(review): assumes an unqualified sqrt(float) is visible here -- confirm.
- compute_normalize: vmulq_f32 was handed the vec<4, float, Q> object `v`; it
  now receives `v.data`, matching every other intrinsic call in this hunk.
- compute_normalize scales by vrsqrteq_f32, which is an estimate instruction;
  no refinement step is applied. Left as submitted.
- Stripped template argument lists (template<qualifier Q>,
  compute_dot<vec<4, float, Q>, ...>, std::enable_if<detail::is_aligned<Q>::value, ...>)
  have been restored; verify against the branch before applying.
- This patch and patch 2/5 both start from blob e6c8d85f2 of
  func_geometric_simd.inl, so they cannot both apply to the same tree.
- The index lines below still name the blobs of the originally submitted files.

 glm/detail/func_geometric_simd.inl | 67 ++++++++++++++++++++++++++++++
 glm/detail/func_matrix_simd.inl    | 37 +++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/glm/detail/func_geometric_simd.inl b/glm/detail/func_geometric_simd.inl
index e6c8d85f2..0a170d46c 100644
--- a/glm/detail/func_geometric_simd.inl
+++ b/glm/detail/func_geometric_simd.inl
@@ -96,4 +96,71 @@ namespace detail
 }//namespace detail
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm{
+namespace detail
+{
+	template<qualifier Q>
+	struct compute_length<4, float, Q, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& v)
+		{
+			return sqrt(compute_dot<vec<4, float, Q>, float, true>::call(v, v));
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_distance<4, float, Q, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& p0, vec<4, float, Q> const& p1)
+		{
+			return compute_length<4, float, Q, true>::call(p1 - p0);
+		}
+	};
+
+
+	template<qualifier Q>
+	struct compute_dot<vec<4, float, Q>, float, true>
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& x, vec<4, float, Q> const& y)
+		{
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t v = vmulq_f32(x.data, y.data);
+			v = vpaddq_f32(v, v);
+			v = vpaddq_f32(v, v);
+			return vgetq_lane_f32(v, 0);
+#else // Armv7a with Neon
+			float32x4_t p = vmulq_f32(x.data, y.data);
+			float32x2_t v = vpadd_f32(vget_low_f32(p), vget_high_f32(p));
+			v = vpadd_f32(v, v);
+			return vget_lane_f32(v, 0);
+#endif
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_normalize<4, float, Q, true>
+	{
+		GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<4, float, Q> const& v)
+		{
+			float32x4_t p = vmulq_f32(v.data, v.data);
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			p = vpaddq_f32(p, p);
+			p = vpaddq_f32(p, p);
+#else
+			float32x2_t t = vpadd_f32(vget_low_f32(p), vget_high_f32(p));
+			t = vpadd_f32(t, t);
+			p = vcombine_f32(t, t);
+#endif
+
+			float32x4_t vd = vrsqrteq_f32(p);
+			vec<4, float, Q> Result;
+			Result.data = vmulq_f32(v.data, vd);
+			return Result;
+		}
+	};
+
+}//namespace detail
+}//namespace glm
+
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
diff --git a/glm/detail/func_matrix_simd.inl b/glm/detail/func_matrix_simd.inl
index f7337fe75..d052bf125 100644
--- a/glm/detail/func_matrix_simd.inl
+++ b/glm/detail/func_matrix_simd.inl
@@ -91,4 +91,41 @@ namespace detail
 #	endif
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+
+namespace glm {
+#if GLM_LANG & GLM_LANG_CXX11_FLAG
+	template <qualifier Q>
+	GLM_FUNC_QUALIFIER
+	typename std::enable_if<detail::is_aligned<Q>::value, mat<4, 4, float, Q>>::type
+	operator*(mat<4, 4, float, Q> const & m1, mat<4, 4, float, Q> const & m2)
+	{
+		auto MulRow = [&](int l) {
+			float32x4_t const SrcA = m2[l].data;
+
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t r= vmulq_laneq_f32(m1[0].data, SrcA, 0);
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[1].data, SrcA, 1));
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[2].data, SrcA, 2));
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[3].data, SrcA, 3));
+#else
+			float32x4_t r= vmulq_f32(m1[0].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 0)));
+			r = vaddq_f32(r, vmulq_f32(m1[1].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 1))));
+			r = vaddq_f32(r, vmulq_f32(m1[2].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 2))));
+			r = vaddq_f32(r, vmulq_f32(m1[3].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 3))));
+#endif
+
+			return r;
+		};
+
+		mat<4, 4, float, aligned_highp> Result;
+		Result[0].data = MulRow(0);
+		Result[1].data = MulRow(1);
+		Result[2].data = MulRow(2);
+		Result[3].data = MulRow(3);
+
+		return Result;
+	}
+#endif // CXX11
+}//namespace glm
 #endif