Skip to content

Commit

Permalink
Merge pull request #160 from theAsmodai/master
Browse files Browse the repository at this point in the history
Mathlib fully optimized using SSE
  • Loading branch information
theAsmodai committed Feb 6, 2016
2 parents 8900c5d + afc1eaf commit 5a6244c
Show file tree
Hide file tree
Showing 10 changed files with 931 additions and 241 deletions.
369 changes: 305 additions & 64 deletions rehlds/engine/mathlib.cpp

Large diffs are not rendered by default.

447 changes: 447 additions & 0 deletions rehlds/engine/sse_mathfun.cpp

Large diffs are not rendered by default.

120 changes: 120 additions & 0 deletions rehlds/engine/sse_mathfun.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
/* SIMD (SSE1+MMX or SSE2) implementation of sin, cos, exp and log
Inspired by Intel Approximate Math library, and based on the
corresponding algorithms of the cephes math library
The default is to use the SSE1 version. If you define USE_SSE2 the
the SSE2 intrinsics will be used in place of the MMX intrinsics. Do
not expect any significant performance improvement with SSE2.
*/

/* Copyright (C) 2007 Julien Pommier
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source distribution.
(this is the zlib license)
*/
#pragma once

#include <xmmintrin.h>

/* yes I know, the top of this file is quite ugly */

#ifdef _MSC_VER /* visual c++ */
# define ALIGN16_BEG __declspec(align(16))
# define ALIGN16_END
#else /* gcc or icc */
# define ALIGN16_BEG
# define ALIGN16_END __attribute__((aligned(16)))
#endif

/* __m128 is ugly to write */
typedef __m128 v4sf; // vector of 4 float (sse1)

#include <emmintrin.h>
typedef __m128i v4si; // vector of 4 int (sse2)


/* declare some SSE constants -- why can't I figure a better way to do that? */
#define _PS_CONST(Name, Val) \
static const ALIGN16_BEG float _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PI32_CONST(Name, Val) \
static const ALIGN16_BEG int _pi32_##Name[4] ALIGN16_END = { Val, Val, Val, Val }
#define _PS_CONST_TYPE(Name, Type, Val) \
static const ALIGN16_BEG Type _ps_##Name[4] ALIGN16_END = { Val, Val, Val, Val }

_PS_CONST(1, 1.0f);
_PS_CONST(0p5, 0.5f);
/* the smallest non denormalized float number */
_PS_CONST_TYPE(min_norm_pos, int, 0x00800000);
_PS_CONST_TYPE(mant_mask, int, 0x7f800000);
_PS_CONST_TYPE(inv_mant_mask, int, ~0x7f800000);

_PS_CONST_TYPE(sign_mask, int, (int)0x80000000);
_PS_CONST_TYPE(inv_sign_mask, int, ~0x80000000);

_PI32_CONST(1, 1);
_PI32_CONST(inv1, ~1);
_PI32_CONST(2, 2);
_PI32_CONST(4, 4);
_PI32_CONST(0x7f, 0x7f);

_PS_CONST(cephes_SQRTHF, 0.707106781186547524f);
_PS_CONST(cephes_log_p0, 7.0376836292E-2f);
_PS_CONST(cephes_log_p1, -1.1514610310E-1f);
_PS_CONST(cephes_log_p2, 1.1676998740E-1f);
_PS_CONST(cephes_log_p3, -1.2420140846E-1f);
_PS_CONST(cephes_log_p4, +1.4249322787E-1f);
_PS_CONST(cephes_log_p5, -1.6668057665E-1f);
_PS_CONST(cephes_log_p6, +2.0000714765E-1f);
_PS_CONST(cephes_log_p7, -2.4999993993E-1f);
_PS_CONST(cephes_log_p8, +3.3333331174E-1f);
_PS_CONST(cephes_log_q1, -2.12194440e-4f);
_PS_CONST(cephes_log_q2, 0.693359375f);



_PS_CONST(exp_hi, 88.3762626647949f);
_PS_CONST(exp_lo, -88.3762626647949f);

_PS_CONST(cephes_LOG2EF, 1.44269504088896341f);
_PS_CONST(cephes_exp_C1, 0.693359375f);
_PS_CONST(cephes_exp_C2, -2.12194440e-4f);

_PS_CONST(cephes_exp_p0, 1.9875691500E-4f);
_PS_CONST(cephes_exp_p1, 1.3981999507E-3f);
_PS_CONST(cephes_exp_p2, 8.3334519073E-3f);
_PS_CONST(cephes_exp_p3, 4.1665795894E-2f);
_PS_CONST(cephes_exp_p4, 1.6666665459E-1f);
_PS_CONST(cephes_exp_p5, 5.0000001201E-1f);

_PS_CONST(minus_cephes_DP1, -0.78515625f);
_PS_CONST(minus_cephes_DP2, -2.4187564849853515625e-4f);
_PS_CONST(minus_cephes_DP3, -3.77489497744594108e-8f);
_PS_CONST(sincof_p0, -1.9515295891E-4f);
_PS_CONST(sincof_p1, 8.3321608736E-3f);
_PS_CONST(sincof_p2, -1.6666654611E-1f);
_PS_CONST(coscof_p0, 2.443315711809948E-005f);
_PS_CONST(coscof_p1, -1.388731625493765E-003f);
_PS_CONST(coscof_p2, 4.166664568298827E-002f);
_PS_CONST(cephes_FOPI, 1.27323954473516f); // 4 / M_PI

extern v4sf log_ps(v4sf x);
extern v4sf exp_ps(v4sf x);
extern v4sf sin_ps(v4sf x);
extern v4sf cos_ps(v4sf x);
extern void sincos_ps(v4sf x, v4sf *s, v4sf *c);
2 changes: 2 additions & 0 deletions rehlds/msvc/ReHLDS.vcxproj
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
<ClCompile Include="..\engine\public_amalgamation.cpp" />
<ClCompile Include="..\engine\r_studio.cpp" />
<ClCompile Include="..\engine\snd_null.cpp" />
<ClCompile Include="..\engine\sse_mathfun.cpp" />
<ClCompile Include="..\engine\sv_log.cpp" />
<ClCompile Include="..\engine\sv_main.cpp" />
<ClCompile Include="..\engine\sv_move.cpp" />
Expand Down Expand Up @@ -440,6 +441,7 @@
<ClInclude Include="..\engine\server.h" />
<ClInclude Include="..\engine\server_static.h" />
<ClInclude Include="..\engine\sound.h" />
<ClInclude Include="..\engine\sse_mathfun.h" />
<ClInclude Include="..\engine\studio_rehlds.h" />
<ClInclude Include="..\engine\sv_log.h" />
<ClInclude Include="..\engine\sv_move.h" />
Expand Down
6 changes: 6 additions & 0 deletions rehlds/msvc/ReHLDS.vcxproj.filters
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,9 @@
<ClCompile Include="..\rehlds\rehlds_security.cpp">
<Filter>rehlds</Filter>
</ClCompile>
<ClCompile Include="..\engine\sse_mathfun.cpp">
<Filter>engine</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\hookers\memory.h">
Expand Down Expand Up @@ -1068,6 +1071,9 @@
<ClInclude Include="..\common\qlimits.h">
<Filter>common</Filter>
</ClInclude>
<ClInclude Include="..\engine\sse_mathfun.h">
<Filter>engine</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="..\linux\appversion.sh">
Expand Down
101 changes: 0 additions & 101 deletions rehlds/public/asmlib.h
Original file line number Diff line number Diff line change
Expand Up @@ -120,105 +120,4 @@ static inline const char * A_strstr(const char * haystack, const char * needle)

#endif // __cplusplus


/***********************************************************************
Function prototypes, integer division functions
***********************************************************************/

// Turn off name mangling
#ifdef __cplusplus
extern "C" {
#endif

void setdivisori32(int buffer[2], int d); // Set divisor for repeated division
int dividefixedi32(const int buffer[2], int x); // Fast division with previously set divisor
void setdivisoru32(uint32_t buffer[2], uint32_t d); // Set divisor for repeated division
uint32_t dividefixedu32(const uint32_t buffer[2], uint32_t x); // Fast division with previously set divisor

// Test if emmintrin.h is included and __m128i defined
#if defined(__GNUC__) && defined(_EMMINTRIN_H_INCLUDED) && !defined(__SSE2__)
#error Please compile with -sse2 or higher
#endif

#if defined(_INCLUDED_EMM) || (defined(_EMMINTRIN_H_INCLUDED) && defined(__SSE2__))
#define VECTORDIVISIONDEFINED

// Integer vector division functions. These functions divide an integer vector by a scalar:

// Set divisor for repeated integer vector division
void setdivisorV8i16(__m128i buf[2], int16_t d); // Set divisor for repeated division
void setdivisorV8u16(__m128i buf[2], uint16_t d); // Set divisor for repeated division
void setdivisorV4i32(__m128i buf[2], int32_t d); // Set divisor for repeated division
void setdivisorV4u32(__m128i buf[2], uint32_t d); // Set divisor for repeated division

// Fast division of vector by previously set divisor
__m128i dividefixedV8i16(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
__m128i dividefixedV8u16(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
__m128i dividefixedV4i32(const __m128i buf[2], __m128i x); // Fast division with previously set divisor
__m128i dividefixedV4u32(const __m128i buf[2], __m128i x); // Fast division with previously set divisor

#endif // defined(_INCLUDED_EMM) || (defined(_EMMINTRIN_H_INCLUDED) && defined(__SSE2__))

#ifdef __cplusplus
} // end of extern "C"
#endif // __cplusplus

#ifdef __cplusplus

// Define classes and operator '/' for fast division with fixed divisor
class div_i32;
class div_u32;
static inline int32_t operator / (int32_t x, div_i32 const &D);
static inline uint32_t operator / (uint32_t x, div_u32 const & D);

class div_i32 { // Signed 32 bit integer division
public:
div_i32() { // Default constructor
buffer[0] = buffer[1] = 0;
}
div_i32(int d) { // Constructor with divisor
setdivisor(d);
}
void setdivisor(int d) { // Set divisor
setdivisori32(buffer, d);
}
protected:
int buffer[2]; // Internal memory
friend int32_t operator / (int32_t x, div_i32 const & D);
};

static inline int32_t operator / (int32_t x, div_i32 const &D){// Overloaded operator '/'
return dividefixedi32(D.buffer, x);
}

static inline int32_t operator /= (int32_t &x, div_i32 const &D){// Overloaded operator '/='
return x = x / D;
}

class div_u32 { // Unsigned 32 bit integer division
public:
div_u32() { // Default constructor
buffer[0] = buffer[1] = 0;
}
div_u32(uint32_t d) { // Constructor with divisor
setdivisor(d);
}
void setdivisor(uint32_t d) { // Set divisor
setdivisoru32(buffer, d);
}
protected:
uint32_t buffer[2]; // Internal memory
friend uint32_t operator / (uint32_t x, div_u32 const & D);
};

static inline uint32_t operator / (uint32_t x, div_u32 const & D){ // Overloaded operator '/'
return dividefixedu32(D.buffer, x);
}

static inline uint32_t operator /= (uint32_t &x, div_u32 const &D){// Overloaded operator '/='
return x = x / D;
}

#endif // __cplusplus

#endif // ASMLIB_H
2 changes: 2 additions & 0 deletions rehlds/public/rehlds/sys_shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
#define SSSE3_FLAG (1<<9)
#define SSE4_1_FLAG (1<<19)
#define SSE4_2_FLAG (1<<20)
#define POPCNT_FLAG (1<<23)
#define AVX_FLAG (1<<28)
#define AVX2_FLAG (1<<5)

Expand All @@ -56,6 +57,7 @@ void Sys_CheckCpuInstructionsSupport(void)
cpuinfo.ssse3 = (cpuid_data[2] & SSSE3_FLAG) ? 1 : 0;
cpuinfo.sse4_1 = (cpuid_data[2] & SSE4_1_FLAG) ? 1 : 0;
cpuinfo.sse4_2 = (cpuid_data[2] & SSE4_2_FLAG) ? 1 : 0;
cpuinfo.popcnt = (cpuid_data[2] & POPCNT_FLAG) ? 1 : 0;
cpuinfo.avx = (cpuid_data[2] & AVX_FLAG) ? 1 : 0;

#if defined ASMLIB_H
Expand Down
2 changes: 1 addition & 1 deletion rehlds/public/rehlds/sys_shared.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

typedef struct cpuinfo_s
{
uint8 sse3, ssse3, sse4_1, sse4_2, avx, avx2;
uint8 sse3, ssse3, sse4_1, sse4_2, avx, avx2, popcnt;
} cpuinfo_t;

extern cpuinfo_t cpuinfo;
Expand Down
1 change: 1 addition & 0 deletions rehlds/rehlds/precompiled.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include "archtypes.h"
#include "asmlib.h"
#include "sse_mathfun.h"
#include "mathlib.h"

#include "sys_shared.h"
Expand Down
Loading

0 comments on commit 5a6244c

Please sign in to comment.