From c35bc3d576b2c6a2010586d385ef435ecc273471 Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour <aleyzour@magicleap.com>
Date: Mon, 9 Sep 2019 17:25:05 -0700
Subject: [PATCH 1/5] Fix Arm 64bit detection

__aarch64__ is the correct macro for 64-bit Arm; __arm64__ is for iOS.
However, the memory model does not need any of these architecture defines to be
determined. __LP64__ means long and pointers are 64 bits, and __ILP32__ means int, long and pointers are 32 bits.
Checking these two is enough and avoids mistakes such as testing __arch64__ (the correct spelling is __aarch64__) while forgetting __arm64__ (iOS).
---
 glm/detail/setup.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/glm/detail/setup.hpp b/glm/detail/setup.hpp
index cf020fdfa..3a5ce18b7 100644
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@@ -35,12 +35,12 @@
 ///////////////////////////////////////////////////////////////////////////////////
 // Build model
 
-#if defined(__arch64__) || defined(__LP64__) || defined(_M_X64) || defined(__ppc64__) || defined(__x86_64__)
+#if defined(__LP64__)
 #	define GLM_MODEL	GLM_MODEL_64
-#elif defined(__i386__) || defined(__ppc__)
+#elif defined(__ILP32__)
 #	define GLM_MODEL	GLM_MODEL_32
 #else
-#	define GLM_MODEL	GLM_MODEL_32
+#	error "Architecture must be either 32 or 64-bits"
 #endif//
 
 #if !defined(GLM_MODEL) && GLM_COMPILER != 0

From e508cc604ff2aa29f59a01d64ee300f24945d874 Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour <aleyzour@magicleap.com>
Date: Tue, 10 Sep 2019 10:55:59 -0700
Subject: [PATCH 2/5] More simd/Neon functions

---
 glm/detail/func_geometric_simd.inl | 44 ++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/glm/detail/func_geometric_simd.inl b/glm/detail/func_geometric_simd.inl
index e6c8d85f2..00d8665d1 100644
--- a/glm/detail/func_geometric_simd.inl
+++ b/glm/detail/func_geometric_simd.inl
@@ -96,4 +96,48 @@ namespace detail
 }//namespace detail
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm{
+namespace detail
+{
+	template<qualifier Q>
+	struct compute_length<4, float, Q, true> // NEON path: Euclidean length of a 4-component float vector
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& v) // returns |v|
+		{
+			return sqrt(compute_dot<vec<4, float, Q>, float, true>::call(v, v)); // dot(v, v) is the squared length, so take the root
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_distance<4, float, Q, true> // NEON path: distance between two 4-component float vectors
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& p0, vec<4, float, Q> const& p1)
+		{
+			return compute_length<4, float, Q, true>::call(p1 - p0); // delegates to compute_length on the difference vector
+		}
+	};
+
+
+	template<qualifier Q>
+	struct compute_dot<vec<4, float, Q>, float, true> // NEON path: dot product of two 4-component float vectors
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& x, vec<4, float, Q> const& y)
+		{
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t v = vmulq_f32(x.data, y.data); // lane-wise products p[i] = x[i] * y[i]
+			v = vpaddq_f32(v, v); // pairwise add: lanes become (p0+p1, p2+p3, p0+p1, p2+p3)
+			v = vpaddq_f32(v, v); // second pairwise add: every lane now holds p0+p1+p2+p3
+			return vgetq_lane_f32(v, 0); // extract the scalar sum from lane 0
+#else  // Armv7a with Neon
+			float32x4_t p = vmulq_f32(x.data, y.data); // lane-wise products p[i] = x[i] * y[i]
+			float32x2_t v = vpadd_f32(vget_low_f32(p), vget_high_f32(p)); // 64-bit pairwise add: (p0+p1, p2+p3)
+			v = vpadd_f32(v, v); // both lanes now hold p0+p1+p2+p3
+			return vget_lane_f32(v, 0); // extract the scalar sum from lane 0
+#endif
+		}
+	};
+}//namespace detail
+}//namespace glm
+
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT

From 9b0a0535ce010fe9c61ca264ec5e5697356d93cd Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour <aleyzour@magicleap.com>
Date: Tue, 10 Sep 2019 11:28:50 -0700
Subject: [PATCH 3/5] Fixing GLM_MODEL for Windows compilers targeting ARM

---
 glm/detail/setup.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/glm/detail/setup.hpp b/glm/detail/setup.hpp
index 3a5ce18b7..d6025aec8 100644
--- a/glm/detail/setup.hpp
+++ b/glm/detail/setup.hpp
@@ -35,12 +35,12 @@
 ///////////////////////////////////////////////////////////////////////////////////
 // Build model
 
-#if defined(__LP64__)
+#if defined(_M_ARM64) || defined(__LP64__) || defined(_M_X64) || defined(__ppc64__) || defined(__x86_64__)
 #	define GLM_MODEL	GLM_MODEL_64
-#elif defined(__ILP32__)
+#elif defined(__i386__) || defined(__ppc__) || defined(__ILP32__) || defined(_M_ARM)
 #	define GLM_MODEL	GLM_MODEL_32
 #else
-#	error "Architecture must be either 32 or 64-bits"
+#	define GLM_MODEL	GLM_MODEL_32
 #endif//
 
 #if !defined(GLM_MODEL) && GLM_COMPILER != 0

From 7b06a984af72e03b3e1af576b7e1b2942e98a808 Mon Sep 17 00:00:00 2001
From: Sebastian Buchwald <UniQP@web.de>
Date: Wed, 11 Sep 2019 19:12:39 +0200
Subject: [PATCH 4/5] Let test return the number of errors

---
 test/gtx/gtx_fast_square_root.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/gtx/gtx_fast_square_root.cpp b/test/gtx/gtx_fast_square_root.cpp
index e41a0a085..80d7fe4b7 100644
--- a/test/gtx/gtx_fast_square_root.cpp
+++ b/test/gtx/gtx_fast_square_root.cpp
@@ -14,7 +14,7 @@ int test_fastInverseSqrt()
 	Error += glm::all(glm::epsilonEqual(glm::fastInverseSqrt(glm::dvec3(1.0)), glm::dvec3(1.0), 0.01)) ? 0 : 1;
 	Error += glm::all(glm::epsilonEqual(glm::fastInverseSqrt(glm::dvec4(1.0)), glm::dvec4(1.0), 0.01)) ? 0 : 1;
 
-	return 0;
+	return Error;
 }
 
 int test_fastDistance()

From 88a7aee27b6d324c3b6cc3e0fa1b49c15f94550e Mon Sep 17 00:00:00 2001
From: Amaury Le Leyzour <aleyzour@magicleap.com>
Date: Tue, 10 Sep 2019 10:55:59 -0700
Subject: [PATCH 5/5] More simd/Neon functions

---
 glm/detail/func_geometric_simd.inl | 67 ++++++++++++++++++++++++++++++
 glm/detail/func_matrix_simd.inl    | 37 +++++++++++++++++
 2 files changed, 104 insertions(+)

diff --git a/glm/detail/func_geometric_simd.inl b/glm/detail/func_geometric_simd.inl
index e6c8d85f2..0a170d46c 100644
--- a/glm/detail/func_geometric_simd.inl
+++ b/glm/detail/func_geometric_simd.inl
@@ -96,4 +96,71 @@ namespace detail
 }//namespace detail
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+namespace glm{
+namespace detail
+{
+	template<qualifier Q>
+	struct compute_length<4, float, Q, true> // NEON path: Euclidean length of a 4-component float vector
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& v) // returns |v|
+		{
+			return sqrt(compute_dot<vec<4, float, Q>, float, true>::call(v, v)); // dot(v, v) is the squared length, so take the root
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_distance<4, float, Q, true> // NEON path: distance between two 4-component float vectors
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& p0, vec<4, float, Q> const& p1)
+		{
+			return compute_length<4, float, Q, true>::call(p1 - p0); // delegates to compute_length on the difference vector
+		}
+	};
+
+
+	template<qualifier Q>
+	struct compute_dot<vec<4, float, Q>, float, true> // NEON path: dot product of two 4-component float vectors
+	{
+		GLM_FUNC_QUALIFIER static float call(vec<4, float, Q> const& x, vec<4, float, Q> const& y)
+		{
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t v = vmulq_f32(x.data, y.data); // lane-wise products p[i] = x[i] * y[i]
+			v = vpaddq_f32(v, v); // pairwise add: lanes become (p0+p1, p2+p3, p0+p1, p2+p3)
+			v = vpaddq_f32(v, v); // second pairwise add: every lane now holds p0+p1+p2+p3
+			return vgetq_lane_f32(v, 0); // extract the scalar sum from lane 0
+#else  // Armv7a with Neon
+			float32x4_t p = vmulq_f32(x.data, y.data); // lane-wise products p[i] = x[i] * y[i]
+			float32x2_t v = vpadd_f32(vget_low_f32(p), vget_high_f32(p)); // 64-bit pairwise add: (p0+p1, p2+p3)
+			v = vpadd_f32(v, v); // both lanes now hold p0+p1+p2+p3
+			return vget_lane_f32(v, 0); // extract the scalar sum from lane 0
+#endif
+		}
+	};
+
+	template<qualifier Q>
+	struct compute_normalize<4, float, Q, true> // NEON path: scales v by an estimate of 1/sqrt(dot(v, v))
+	{
+		GLM_FUNC_QUALIFIER static vec<4, float, Q> call(vec<4, float, Q> const& v)
+		{
+			float32x4_t p = vmulq_f32(v.data, v.data); // lane-wise squares v[i] * v[i]
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			p = vpaddq_f32(p, p); // pairwise add: (p0+p1, p2+p3, p0+p1, p2+p3)
+			p = vpaddq_f32(p, p); // every lane now holds the squared length
+#else
+			float32x2_t t = vpadd_f32(vget_low_f32(p), vget_high_f32(p)); // (p0+p1, p2+p3)
+			t = vpadd_f32(t, t); // both lanes hold the squared length
+			p = vcombine_f32(t, t); // broadcast the squared length to all four lanes
+#endif
+
+			float32x4_t vd = vrsqrteq_f32(p); // reciprocal square-root *estimate*; no refinement step is applied
+			vec<4, float, Q> Result;
+			Result.data = vmulq_f32(v.data, vd); // multiply the NEON register (v.data), not the vec wrapper
+			return Result;
+		}
+	};
+
+}//namespace detail
+}//namespace glm
+
 #endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
diff --git a/glm/detail/func_matrix_simd.inl b/glm/detail/func_matrix_simd.inl
index f7337fe75..d052bf125 100644
--- a/glm/detail/func_matrix_simd.inl
+++ b/glm/detail/func_matrix_simd.inl
@@ -91,4 +91,41 @@ namespace detail
 #	endif
 }//namespace glm
 
+#elif GLM_ARCH & GLM_ARCH_NEON_BIT
+
+namespace glm {
+#if GLM_LANG & GLM_LANG_CXX11_FLAG
+	template <qualifier Q>
+	GLM_FUNC_QUALIFIER
+	typename std::enable_if<detail::is_aligned<Q>::value, mat<4, 4, float, Q>>::type // overload only participates for aligned qualifiers
+	operator*(mat<4, 4, float, Q> const & m1, mat<4, 4, float, Q> const & m2) // NEON 4x4 float matrix product m1 * m2
+	{
+		auto MulRow = [&](int l) { // returns m1[0]*s0 + m1[1]*s1 + m1[2]*s2 + m1[3]*s3 with s = m2[l]
+			float32x4_t const SrcA = m2[l].data;
+
+#if GLM_ARCH & GLM_ARCH_ARMV8_BIT
+			float32x4_t r=   vmulq_laneq_f32(m1[0].data, SrcA, 0); // multiply by a single lane of SrcA
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[1].data, SrcA, 1));
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[2].data, SrcA, 2));
+			r = vaddq_f32(r, vmulq_laneq_f32(m1[3].data, SrcA, 3));
+#else
+			float32x4_t r=   vmulq_f32(m1[0].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 0))); // extract the lane, broadcast it, then multiply
+			r = vaddq_f32(r, vmulq_f32(m1[1].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 1))));
+			r = vaddq_f32(r, vmulq_f32(m1[2].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 2))));
+			r = vaddq_f32(r, vmulq_f32(m1[3].data, vdupq_n_f32(vgetq_lane_f32(SrcA, 3))));
+#endif
+
+			return r;
+		};
+
+		mat<4, 4, float, Q> Result; // same qualifier as the operands and return type; avoids a qualifier conversion on return
+		Result[0].data = MulRow(0);
+		Result[1].data = MulRow(1);
+		Result[2].data = MulRow(2);
+		Result[3].data = MulRow(3);
+
+		return Result;
+	}
+#endif // CXX11
+}//namespace glm
 #endif