[spline/bezier]Move SIMD optimization of vector operations to Math3D.h.
Needs rebuild to avoid a dialog confirmation on Visual Studio.
xebra committed Oct 7, 2018
1 parent c8e45ae commit d0682d7
Showing 3 changed files with 139 additions and 92 deletions.
98 changes: 8 additions & 90 deletions GPU/Common/SplineCommon.cpp
@@ -30,50 +30,6 @@
#include "GPU/ge_constants.h"
#include "GPU/GPUState.h" // only needed for UVScale stuff

#if defined(_M_SSE)
#include <emmintrin.h>

inline __m128 SSECrossProduct(__m128 a, __m128 b)
{
const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
return _mm_sub_ps(left, right);
}

inline __m128 SSENormalizeMultiplierSSE2(__m128 v)
{
const __m128 sq = _mm_mul_ps(v, v);
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));

const __m128 rt = _mm_rsqrt_ss(res);
return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
}

#if _M_SSE >= 0x401
#include <smmintrin.h>

inline __m128 SSENormalizeMultiplierSSE4(__m128 v)
{
return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
}

inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
if (useSSE4)
return SSENormalizeMultiplierSSE4(v);
return SSENormalizeMultiplierSSE2(v);
}
#else
inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
return SSENormalizeMultiplierSSE2(v);
}
#endif

#endif

static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, const int idx1, const int idx2, const int idx3) {
if (type == GE_PATCHPRIM_LINES) {
*(indices++) = idx0;
@@ -280,30 +236,6 @@ static void TessellateSplinePatchHardware(u8 *&dest, u16 *indices, int &count, c
BuildIndex(indices, count, spatch.tess_u, spatch.tess_v, spatch.primType);
}

static inline void AccumulateWeighted(Vec3f &out, const Vec3f &in, const Vec4f &w) {
#ifdef _M_SSE
out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
#else
out += in * w.x;
#endif
}

static inline void AccumulateWeighted(Vec4f &out, const Vec4f &in, const Vec4f &w) {
#ifdef _M_SSE
out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
#else
out += in * w;
#endif
}

static inline void AccumulateWeighted(Vec2f &out, const Vec2f &in, const Vec4f &w) {
#ifdef _M_SSE
out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
#else
out += in * w;
#endif
}
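
For reference, a minimal standalone sketch (not PPSSPP code; assumes an x86 compiler with SSE2 headers) of what the removed AccumulateWeighted helpers computed. With the SIMD-specialized operators added to Math3D.h below, the plain form out += in * f now compiles to the same broadcast-multiply-add.

#include <emmintrin.h>
#include <cstdio>

int main() {
    __m128 out = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);       // lanes (x, y, z, w) = (1, 2, 3, 4)
    const __m128 in = _mm_set_ps(8.0f, 6.0f, 4.0f, 2.0f);  // lanes (x, y, z, w) = (2, 4, 6, 8)
    const float f = 0.5f;
    // AccumulateWeighted(out, in, Vec4f::AssignToAll(f)) boiled down to this:
    out = _mm_add_ps(out, _mm_mul_ps(in, _mm_set_ps1(f)));
    float r[4];
    _mm_storeu_ps(r, out);
    printf("%f %f %f %f\n", r[0], r[1], r[2], r[3]);        // expect 2 4 6 8
    return 0;
}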

template <bool origNrm, bool origCol, bool origTc, bool useSSE4>
static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
// Full (mostly) correct tessellation of spline patches.
@@ -368,13 +300,11 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
Vec4f vert_color(0, 0, 0, 0);
Vec3f vert_pos;
vert_pos.SetZero();
Vec3f vert_nrm;
Vec3f du, dv;
du.SetZero();
dv.SetZero();
Vec2f vert_tex;
if (origNrm) {
vert_nrm.SetZero();
du.SetZero();
dv.SetZero();
}
if (origCol) {
vert_color.SetZero();
@@ -417,11 +347,6 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
float f = u_spline * v_spline;

if (f > 0.0f) {
#ifdef _M_SSE
Vec4f fv(_mm_set_ps1(f));
#else
Vec4f fv = Vec4f::AssignToAll(f);
#endif
int idx = spatch.count_u * (iv + jj) + (iu + ii);
/*
if (idx >= max_idx) {
@@ -430,30 +355,23 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
OutputDebugStringA(temp);
Crash();
}*/
AccumulateWeighted(vert_pos, spatch.pos[idx], fv);
vert_pos += spatch.pos[idx] * f;
if (origTc) {
AccumulateWeighted(vert_tex, spatch.tex[idx], fv);
vert_tex += spatch.tex[idx] * f;
}
if (origCol) {
AccumulateWeighted(vert_color, spatch.col[idx], fv);
vert_color += spatch.col[idx] * f;
}
if (origNrm) {
AccumulateWeighted(du, spatch.pos[idx], Vec4f::AssignToAll(u_derivs[ii] * v_weights[jj]));
AccumulateWeighted(dv, spatch.pos[idx], Vec4f::AssignToAll(u_weights[ii] * v_derivs[jj]));
du += spatch.pos[idx] * (u_derivs[ii] * v_weights[jj]);
dv += spatch.pos[idx] * (u_weights[ii] * v_derivs[jj]);
}
}
}
}
vert->pos = vert_pos;
if (origNrm) {
vert_nrm = Cross(du, dv);
#ifdef _M_SSE
const __m128 normalize = SSENormalizeMultiplier(useSSE4, vert_nrm.vec);
vert_nrm.vec = _mm_mul_ps(vert_nrm.vec, normalize);
#else
vert_nrm.Normalize();
#endif
vert->nrm = vert_nrm;
vert->nrm = Cross(du, dv).Normalized(useSSE4);
} else {
vert->nrm.SetZero();
vert->nrm.z = 1.0f;
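The normal above is now computed as Cross(du, dv).Normalized(useSSE4). A minimal scalar sketch (standalone, not PPSSPP code) of the math behind that call: the vertex normal is the normalized cross product of the two tangent vectors du and dv.

#include <cmath>

struct V3 { float x, y, z; };

// Standard component-wise cross product.
static V3 Cross(const V3 &a, const V3 &b) {
    return { a.y * b.z - a.z * b.y,
             a.z * b.x - a.x * b.z,
             a.x * b.y - a.y * b.x };
}

// Exact normalization; the SSE paths trade this for an rsqrt approximation.
static V3 Normalized(const V3 &v) {
    const float len = std::sqrt(v.x * v.x + v.y * v.y + v.z * v.z);
    return { v.x / len, v.y / len, v.z / len };
}

int main() {
    const V3 du{1.0f, 0.0f, 0.0f}, dv{0.0f, 1.0f, 0.0f};
    const V3 n = Normalized(Cross(du, dv));  // tangents along x and y -> normal (0, 0, 1)
    return (n.z > 0.999f) ? 0 : 1;
}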
40 changes: 39 additions & 1 deletion GPU/Math3D.cpp
@@ -102,11 +102,49 @@ float Vec3<float>::Distance2To(Vec3<float> &other)
return Vec3<float>(other-(*this)).Length2();
}

#if defined(_M_SSE)
__m128 SSENormalizeMultiplierSSE2(__m128 v)
{
const __m128 sq = _mm_mul_ps(v, v);
const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));

const __m128 rt = _mm_rsqrt_ss(res);
return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
}

#if _M_SSE >= 0x401
__m128 SSENormalizeMultiplierSSE4(__m128 v)
{
return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
}

__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
if (useSSE4)
return SSENormalizeMultiplierSSE4(v);
return SSENormalizeMultiplierSSE2(v);
}
#else
__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
{
return SSENormalizeMultiplierSSE2(v);
}
#endif
template<>
Vec3<float> Vec3<float>::Normalized() const
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
{
const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
return _mm_mul_ps(normalize, vec);
}
#else
template<>
Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
{
return (*this) / Length();
}
#endif

template<>
float Vec3<float>::Normalize()
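A standalone sketch (not PPSSPP code; assumes an x86 compiler with SSE2) of what the SSE2 path of SSENormalizeMultiplier moved here computes: sum the squared x, y, z components with shuffles and scalar adds, take an approximate reciprocal square root of the sum, and broadcast it so a single multiply normalizes the vector. _mm_rsqrt_ss is an approximation (roughly 12 bits of precision), which is generally acceptable for vertex normals.

#include <emmintrin.h>
#include <cmath>
#include <cstdio>

static __m128 NormalizeMultiplierSSE2(__m128 v) {
    const __m128 sq = _mm_mul_ps(v, v);                                  // (x*x, y*y, z*z, w*w)
    const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));   // y*y in lane 0
    const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));   // z*z in lane 0
    const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));               // x*x + y*y + z*z in lane 0
    const __m128 rt = _mm_rsqrt_ss(res);                                 // ~1/sqrt(length^2)
    return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));              // broadcast to all lanes
}

int main() {
    const __m128 v = _mm_set_ps(0.0f, 2.0f, 2.0f, 1.0f);  // (x, y, z) = (1, 2, 2), length 3
    float m[4];
    _mm_storeu_ps(m, NormalizeMultiplierSSE2(v));
    printf("approx %f  exact %f\n", m[0], 1.0f / std::sqrt(9.0f));  // both about 0.3333
    return 0;
}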
93 changes: 92 additions & 1 deletion GPU/Math3D.h
@@ -25,6 +25,9 @@

#if defined(_M_SSE)
#include <emmintrin.h>
#if _M_SSE >= 0x401
#include <smmintrin.h>
#endif
#endif

namespace Math3D {
@@ -293,7 +296,7 @@ class Vec3
void SetLength(const float l);
Vec3 WithLength(const float l) const;
float Distance2To(Vec3 &other);
Vec3 Normalized() const;
Vec3 Normalized(bool useSSE4 = false) const;
float Normalize(); // returns the previous length, which is often useful

T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
@@ -1081,6 +1084,94 @@ __forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
*(u32 *)rgba = ToRGBA();
}

#if defined(_M_SSE)
// Specialized for SIMD optimization

// Vec2<float> operation
template<>
inline void Vec2<float>::operator += (const Vec2<float> &other)
{
vec = _mm_add_ps(vec, other.vec);
}

template<>
inline Vec2<float> Vec2<float>::operator + (const Vec2 &other) const
{
return Vec2<float>(_mm_add_ps(vec, other.vec));
}

template<>
inline Vec2<float> Vec2<float>::operator * (const Vec2 &other) const
{
return Vec2<float>(_mm_mul_ps(vec, other.vec));
}

template<> template<>
inline Vec2<float> Vec2<float>::operator * (const float &other) const
{
return Vec2<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
}

// Vec3<float> operation
template<>
inline void Vec3<float>::operator += (const Vec3<float> &other)
{
vec = _mm_add_ps(vec, other.vec);
}

template<>
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const
{
return Vec3<float>(_mm_add_ps(vec, other.vec));
}

template<>
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const
{
return Vec3<float>(_mm_mul_ps(vec, other.vec));
}

template<> template<>
inline Vec3<float> Vec3<float>::operator * (const float &other) const
{
return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
}

// Vec4<float> operation
template<>
inline void Vec4<float>::operator += (const Vec4<float> &other)
{
vec = _mm_add_ps(vec, other.vec);
}

template<>
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const
{
return Vec4<float>(_mm_add_ps(vec, other.vec));
}

template<>
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const
{
return Vec4<float>(_mm_mul_ps(vec, other.vec));
}

template<> template<>
inline Vec4<float> Vec4<float>::operator * (const float &other) const
{
return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
}

// Vec3<float> cross product
template<>
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
{
const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2)));
const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1)));
return _mm_sub_ps(left, right);
}
#endif

}; // namespace Math3D

// linear interpolation via float: 0.0=begin, 1.0=end
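
A standalone check (not PPSSPP code; assumes an x86 compiler with SSE2) that the shuffle pattern used in the Cross specialization above matches the usual component-wise formula: a is shuffled to (ay, az, ax) and b to (bz, bx, by) for the left products, the patterns are swapped for the right products, and the difference is a × b.

#include <emmintrin.h>
#include <cstdio>

static __m128 CrossSSE(__m128 a, __m128 b) {
    const __m128 left  = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)),   // (ay, az, ax, aw)
                                    _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));  // (bz, bx, by, bw)
    const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)),   // (az, ax, ay, aw)
                                    _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));  // (by, bz, bx, bw)
    return _mm_sub_ps(left, right);
}

int main() {
    const __m128 x = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f);  // unit x axis
    const __m128 y = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);  // unit y axis
    float c[4];
    _mm_storeu_ps(c, CrossSSE(x, y));
    printf("%f %f %f\n", c[0], c[1], c[2]);  // expect 0 0 1, the unit z axis
    return 0;
}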