Merge pull request #160 from theAsmodai/master

Mathlib fully optimized using SSE
rehlds · Feb 6, 2016 · 5a6244c · 5a6244c
2 parents 8900c5d + afc1eaf
commit 5a6244c
Show file tree

Hide file tree

Showing 10 changed files with 931 additions and 241 deletions.
diff --git a/rehlds/engine/mathlib.cpp b/rehlds/engine/mathlib.cpp
diff --git a/rehlds/engine/sse_mathfun.cpp b/rehlds/engine/sse_mathfun.cpp
diff --git a/rehlds/engine/sse_mathfun.h b/rehlds/engine/sse_mathfun.h
@@ -0,0 +1,120 @@
+/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
+
+Inspired by Intel Approximate Math library, and based on the
+corresponding algorithms of the cephes math library
+
+The default is to use the SSE1 version. If you define USE_SSE2, the
+SSE2 intrinsics will be used in place of the MMX intrinsics. Do
+not expect any significant performance improvement with SSE2.
+*/
+
+/* Copyright (C) 2007  Julien Pommier
+
+This software is provided 'as-is', without any express or implied
+warranty.  In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+3. This notice may not be removed or altered from any source distribution.
+
+(this is the zlib license)
+*/
+#pragma once
+
+#include <xmmintrin.h>
+
+/* yes I know, the top of this file is quite ugly.
+ *
+ * ALIGN16_BEG / ALIGN16_END bracket a declaration to request 16-byte
+ * alignment. Visual C++ spells it __declspec(align(16)) in the leading
+ * position, gcc/icc spell it __attribute__((aligned(16))) in the trailing
+ * position, so exactly one of the two macros is non-empty per compiler. */
+
+#ifdef _MSC_VER /* visual c++ */
+# define ALIGN16_BEG __declspec(align(16))
+# define ALIGN16_END 
+#else /* gcc or icc */
+# define ALIGN16_BEG
+# define ALIGN16_END __attribute__((aligned(16)))
+#endif
+
+/* __m128 is ugly to write, so give the vector types short aliases */
+typedef __m128 v4sf;  // vector of 4 float (sse1)
+
+#include <emmintrin.h> // SSE2 intrinsics header; included unconditionally here (not guarded by USE_SSE2)
+typedef __m128i v4si; // vector of 4 int (sse2)
+
+
+/* Declare some SSE constants. Each macro defines a static const 4-element
+ * array, wrapped in ALIGN16_BEG/ALIGN16_END, with every element set to Val:
+ *   _PS_CONST(Name, Val)            -> float _ps_<Name>[4]
+ *   _PI32_CONST(Name, Val)          -> int   _pi32_<Name>[4]
+ *   _PS_CONST_TYPE(Name, Type, Val) -> Type  _ps_<Name>[4]
+ * The invocation supplies the terminating semicolon.
+ * (original author's note: why can't I figure a better way to do that?) */
+#define _PS_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PI32_CONST(Name, Val)                                            \
+  static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+#define _PS_CONST_TYPE(Name, Type, Val)                                 \
+  static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
+
+_PS_CONST(1, 1.0f);
+_PS_CONST(0p5, 0.5f);
+/* the smallest non-denormalized float number, stored as its raw bit pattern */
+_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
+_PS_CONST_TYPE(mant_mask, int, 0x7f800000); // bits 23..30 set: the IEEE-754 single exponent field, despite the name
+_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000); // complement of the above: bit 31 plus bits 0..22
+
+_PS_CONST_TYPE(sign_mask, int, (int)0x80000000); // bit 31 only
+_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000); // every bit except bit 31
+
+_PI32_CONST(1, 1);
+_PI32_CONST(inv1, ~1);
+_PI32_CONST(2, 2);
+_PI32_CONST(4, 4);
+_PI32_CONST(0x7f, 0x7f);
+
+_PS_CONST(cephes_SQRTHF, 0.707106781186547524f); // sqrt(1/2)
+_PS_CONST(cephes_log_p0, 7.0376836292E-2f); // NOTE(review): p0..p8 are presumably the log polynomial coefficients; usage is in sse_mathfun.cpp, not visible here
+_PS_CONST(cephes_log_p1, -1.1514610310E-1f);
+_PS_CONST(cephes_log_p2, 1.1676998740E-1f);
+_PS_CONST(cephes_log_p3, -1.2420140846E-1f);
+_PS_CONST(cephes_log_p4, +1.4249322787E-1f);
+_PS_CONST(cephes_log_p5, -1.6668057665E-1f);
+_PS_CONST(cephes_log_p6, +2.0000714765E-1f);
+_PS_CONST(cephes_log_p7, -2.4999993993E-1f);
+_PS_CONST(cephes_log_p8, +3.3333331174E-1f);
+_PS_CONST(cephes_log_q1, -2.12194440e-4f); // q1 + q2 == ln(2) (0.693147...), kept as two separate parts
+_PS_CONST(cephes_log_q2, 0.693359375f);
+
+
+
+_PS_CONST(exp_hi, 88.3762626647949f); // NOTE(review): exp_hi/exp_lo look like the input clamp bounds for exp_ps -- confirm in sse_mathfun.cpp
+_PS_CONST(exp_lo, -88.3762626647949f);
+
+_PS_CONST(cephes_LOG2EF, 1.44269504088896341f); // log2(e)
+_PS_CONST(cephes_exp_C1, 0.693359375f); // C1 + C2 == ln(2) (0.693147...), kept as two separate parts
+_PS_CONST(cephes_exp_C2, -2.12194440e-4f);
+
+_PS_CONST(cephes_exp_p0, 1.9875691500E-4f); // NOTE(review): p0..p5 are presumably the exp polynomial coefficients; usage is in sse_mathfun.cpp
+_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
+_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
+_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
+_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
+_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);
+
+_PS_CONST(minus_cephes_DP1, -0.78515625f); // DP1 + DP2 + DP3 == -pi/4 (-0.7853981634...), kept as three separate parts
+_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
+_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
+_PS_CONST(sincof_p0, -1.9515295891E-4f); // NOTE(review): sincof_* / coscof_* are presumably the sine / cosine polynomial coefficients; usage is in sse_mathfun.cpp
+_PS_CONST(sincof_p1, 8.3321608736E-3f);
+_PS_CONST(sincof_p2, -1.6666654611E-1f);
+_PS_CONST(coscof_p0, 2.443315711809948E-005f);
+_PS_CONST(coscof_p1, -1.388731625493765E-003f);
+_PS_CONST(coscof_p2, 4.166664568298827E-002f);
+_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI
+
+extern v4sf log_ps(v4sf x); // log over a vector of 4 floats; defined outside this header (presumably sse_mathfun.cpp)
+extern v4sf exp_ps(v4sf x); // exp over a vector of 4 floats
+extern v4sf sin_ps(v4sf x); // sin over a vector of 4 floats
+extern v4sf cos_ps(v4sf x); // cos over a vector of 4 floats
+extern void sincos_ps(v4sf x, v4sf *s, v4sf *c); // NOTE(review): presumably stores sin(x) in *s and cos(x) in *c -- confirm against the definition
diff --git a/rehlds/msvc/ReHLDS.vcxproj b/rehlds/msvc/ReHLDS.vcxproj
@@ -77,6 +77,7 @@
     <ClCompile Include="..\engine\public_amalgamation.cpp" />
     <ClCompile Include="..\engine\r_studio.cpp" />
     <ClCompile Include="..\engine\snd_null.cpp" />
+    <ClCompile Include="..\engine\sse_mathfun.cpp" />
     <ClCompile Include="..\engine\sv_log.cpp" />
     <ClCompile Include="..\engine\sv_main.cpp" />
     <ClCompile Include="..\engine\sv_move.cpp" />
@@ -440,6 +441,7 @@
     <ClInclude Include="..\engine\server.h" />
     <ClInclude Include="..\engine\server_static.h" />
     <ClInclude Include="..\engine\sound.h" />
+    <ClInclude Include="..\engine\sse_mathfun.h" />
     <ClInclude Include="..\engine\studio_rehlds.h" />
     <ClInclude Include="..\engine\sv_log.h" />
     <ClInclude Include="..\engine\sv_move.h" />

diff --git a/rehlds/msvc/ReHLDS.vcxproj.filters b/rehlds/msvc/ReHLDS.vcxproj.filters
@@ -346,6 +346,9 @@
     <ClCompile Include="..\rehlds\rehlds_security.cpp">
       <Filter>rehlds</Filter>
     </ClCompile>
+    <ClCompile Include="..\engine\sse_mathfun.cpp">
+      <Filter>engine</Filter>
+    </ClCompile>
   </ItemGroup>
   <ItemGroup>
     <ClInclude Include="..\hookers\memory.h">
@@ -1068,6 +1071,9 @@
     <ClInclude Include="..\common\qlimits.h">
       <Filter>common</Filter>
     </ClInclude>
+    <ClInclude Include="..\engine\sse_mathfun.h">
+      <Filter>engine</Filter>
+    </ClInclude>
   </ItemGroup>
   <ItemGroup>
     <None Include="..\linux\appversion.sh">

diff --git a/rehlds/public/asmlib.h b/rehlds/public/asmlib.h
@@ -120,105 +120,4 @@ static inline const char * A_strstr(const char * haystack, const char * needle)
 
 #endif // __cplusplus
 
-
-/***********************************************************************
-Function prototypes, integer division functions
-***********************************************************************/
-
-// Turn off name mangling
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-void setdivisori32(int buffer[2], int d);                      // Set divisor for repeated division
-int dividefixedi32(const int buffer[2], int x);                // Fast division with previously set divisor
-void setdivisoru32(uint32_t buffer[2], uint32_t d);            // Set divisor for repeated division
-uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x); // Fast division with previously set divisor
-
-// Test if emmintrin.h is included and __m128i defined
-#if defined(__GNUC__) && defined(_EMMINTRIN_H_INCLUDED) && !defined(__SSE2__)
-#error Please compile with -sse2 or higher 
-#endif
-
-#if defined(_INCLUDED_EMM) || (defined(_EMMINTRIN_H_INCLUDED) && defined(__SSE2__))
-#define VECTORDIVISIONDEFINED
-
-// Integer vector division functions. These functions divide an integer vector by a scalar:
-
-// Set divisor for repeated integer vector division
-void setdivisorV8i16(__m128i buf[2], int16_t d);               // Set divisor for repeated division
-void setdivisorV8u16(__m128i buf[2], uint16_t d);              // Set divisor for repeated division
-void setdivisorV4i32(__m128i buf[2], int32_t d);               // Set divisor for repeated division
-void setdivisorV4u32(__m128i buf[2], uint32_t d);              // Set divisor for repeated division
-
-// Fast division of vector by previously set divisor
-__m128i dividefixedV8i16(const __m128i buf[2], __m128i x);     // Fast division with previously set divisor
-__m128i dividefixedV8u16(const __m128i buf[2], __m128i x);     // Fast division with previously set divisor
-__m128i dividefixedV4i32(const __m128i buf[2], __m128i x);     // Fast division with previously set divisor
-__m128i dividefixedV4u32(const __m128i buf[2], __m128i x);     // Fast division with previously set divisor
-
-#endif // defined(_INCLUDED_EMM) || (defined(_EMMINTRIN_H_INCLUDED) && defined(__SSE2__))
-
-#ifdef __cplusplus
-}  // end of extern "C"
-#endif // __cplusplus
-
-#ifdef __cplusplus
-
-// Define classes and operator '/' for fast division with fixed divisor
-class div_i32;
-class div_u32;
-static inline int32_t  operator / (int32_t  x, div_i32 const &D);
-static inline uint32_t operator / (uint32_t x, div_u32 const & D);
-
-class div_i32 {                                                // Signed 32 bit integer division
-public:
-    div_i32() {                                                // Default constructor
-        buffer[0] = buffer[1] = 0;
-    }
-    div_i32(int d) {                                           // Constructor with divisor
-        setdivisor(d);
-    }
-    void setdivisor(int d) {                                   // Set divisor
-        setdivisori32(buffer, d);
-    }
-protected:
-    int buffer[2];                                             // Internal memory
-    friend int32_t operator / (int32_t x, div_i32 const & D);
-};
-
-static inline int32_t operator / (int32_t x, div_i32 const &D){// Overloaded operator '/'
-    return dividefixedi32(D.buffer, x);
-}
-
-static inline int32_t operator /= (int32_t &x, div_i32 const &D){// Overloaded operator '/='
-    return x = x / D;
-}
-
-class div_u32 {                                                // Unsigned 32 bit integer division
-public:
-    div_u32() {                                                // Default constructor
-        buffer[0] = buffer[1] = 0;
-    }
-    div_u32(uint32_t d) {                                      // Constructor with divisor
-        setdivisor(d);
-    }
-    void setdivisor(uint32_t d) {                              // Set divisor
-        setdivisoru32(buffer, d);
-    }
-protected:
-    uint32_t buffer[2];                                        // Internal memory
-    friend uint32_t operator / (uint32_t x, div_u32 const & D);
-};
-
-static inline uint32_t operator / (uint32_t x, div_u32 const & D){ // Overloaded operator '/'
-    return dividefixedu32(D.buffer, x);
-}
-
-static inline uint32_t operator /= (uint32_t &x, div_u32 const &D){// Overloaded operator '/='
-    return x = x / D;
-}
-
-#endif // __cplusplus
-
 #endif // ASMLIB_H
diff --git a/rehlds/public/rehlds/sys_shared.cpp b/rehlds/public/rehlds/sys_shared.cpp
@@ -35,6 +35,7 @@
 #define SSSE3_FLAG		(1<<9)
 #define SSE4_1_FLAG		(1<<19)
 #define SSE4_2_FLAG		(1<<20)
+#define POPCNT_FLAG		(1<<23)
 #define AVX_FLAG		(1<<28)
 #define AVX2_FLAG		(1<<5)
 
@@ -56,6 +57,7 @@ void Sys_CheckCpuInstructionsSupport(void)
 	cpuinfo.ssse3 = (cpuid_data[2] & SSSE3_FLAG) ? 1 : 0;
 	cpuinfo.sse4_1 = (cpuid_data[2] & SSE4_1_FLAG) ? 1 : 0;
 	cpuinfo.sse4_2 = (cpuid_data[2] & SSE4_2_FLAG) ? 1 : 0;
+	cpuinfo.popcnt = (cpuid_data[2] & POPCNT_FLAG) ? 1 : 0;
 	cpuinfo.avx = (cpuid_data[2] & AVX_FLAG) ? 1 : 0;
 
 #if defined ASMLIB_H

diff --git a/rehlds/public/rehlds/sys_shared.h b/rehlds/public/rehlds/sys_shared.h
@@ -31,7 +31,7 @@
 
 typedef struct cpuinfo_s
 {
-	uint8 sse3, ssse3, sse4_1, sse4_2, avx, avx2;
+	uint8 sse3, ssse3, sse4_1, sse4_2, avx, avx2, popcnt;
 } cpuinfo_t;
 
 extern cpuinfo_t cpuinfo;

diff --git a/rehlds/rehlds/precompiled.h b/rehlds/rehlds/precompiled.h
@@ -6,6 +6,7 @@
 
 #include "archtypes.h"
 #include "asmlib.h"
+#include "sse_mathfun.h"
 #include "mathlib.h"
 
 #include "sys_shared.h"