diff --git a/GPU/Common/SplineCommon.cpp b/GPU/Common/SplineCommon.cpp
index 2a6e66cd1952..6b84c8cc7658 100644
--- a/GPU/Common/SplineCommon.cpp
+++ b/GPU/Common/SplineCommon.cpp
@@ -30,50 +30,6 @@
 #include "GPU/ge_constants.h"
 #include "GPU/GPUState.h"  // only needed for UVScale stuff
 
-#if defined(_M_SSE)
-#include <emmintrin.h>
-
-inline __m128 SSECrossProduct(__m128 a, __m128 b)
-{
-	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 1, 0, 2)));
-	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a, a, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b, b, _MM_SHUFFLE(3, 0, 2, 1)));
-	return _mm_sub_ps(left, right);
-}
-
-inline __m128 SSENormalizeMultiplierSSE2(__m128 v)
-{
-	const __m128 sq = _mm_mul_ps(v, v);
-	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
-	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
-	const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));
-
-	const __m128 rt = _mm_rsqrt_ss(res);
-	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
-}
-
-#if _M_SSE >= 0x401
-#include <smmintrin.h>
-
-inline __m128 SSENormalizeMultiplierSSE4(__m128 v)
-{
-	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
-}
-
-inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	if (useSSE4)
-		return SSENormalizeMultiplierSSE4(v);
-	return SSENormalizeMultiplierSSE2(v);
-}
-#else
-inline __m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
-{
-	return SSENormalizeMultiplierSSE2(v);
-}
-#endif
-
-#endif
-
 static void CopyQuadIndex(u16 *&indices, GEPatchPrimType type, const int idx0, const int idx1, const int idx2, const int idx3) {
 	if (type == GE_PATCHPRIM_LINES) {
 		*(indices++) = idx0;
@@ -280,30 +236,6 @@ static void TessellateSplinePatchHardware(u8 *&dest, u16 *indices, int &count, c
 	BuildIndex(indices, count, spatch.tess_u, spatch.tess_v, spatch.primType);
 }
 
-static inline void AccumulateWeighted(Vec3f &out, const Vec3f &in, const Vec4f &w) {
-#ifdef _M_SSE
-	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
-#else
-	out += in * w.x;
-#endif
-}
-
-static inline void AccumulateWeighted(Vec4f &out, const Vec4f &in, const Vec4f &w) {
-#ifdef _M_SSE
-	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
-#else
-	out += in * w;
-#endif
-}
-
-static inline void AccumulateWeighted(Vec2f &out, const Vec2f &in, const Vec4f &w) {
-#ifdef _M_SSE
-	out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, w.vec));
-#else
-	out += in * w;
-#endif
-}
-
 template <bool origNrm, bool origCol, bool origTc, bool useSSE4>
 static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const SplinePatchLocal &spatch, u32 origVertType, int quality, int maxVertices) {
 	// Full (mostly) correct tessellation of spline patches.
@@ -368,13 +300,11 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 			Vec4f vert_color(0, 0, 0, 0);
 			Vec3f vert_pos;
 			vert_pos.SetZero();
-			Vec3f vert_nrm;
 			Vec3f du, dv;
-			du.SetZero();
-			dv.SetZero();
 			Vec2f vert_tex;
 			if (origNrm) {
-				vert_nrm.SetZero();
+				du.SetZero();
+				dv.SetZero();
 			}
 			if (origCol) {
 				vert_color.SetZero();
@@ -417,11 +347,6 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 					float f = u_spline * v_spline;
 
 					if (f > 0.0f) {
-#ifdef _M_SSE
-						Vec4f fv(_mm_set_ps1(f));
-#else
-						Vec4f fv = Vec4f::AssignToAll(f);
-#endif
 						int idx = spatch.count_u * (iv + jj) + (iu + ii);
 						/*
 						if (idx >= max_idx) {
@@ -430,30 +355,23 @@ static void SplinePatchFullQuality(u8 *&dest, u16 *indices, int &count, const Sp
 							OutputDebugStringA(temp);
 							Crash();
 						}*/
-						AccumulateWeighted(vert_pos, spatch.pos[idx], fv);
+						vert_pos += spatch.pos[idx] * f;
 						if (origTc) {
-							AccumulateWeighted(vert_tex, spatch.tex[idx], fv);
+							vert_tex += spatch.tex[idx] * f;
 						}
 						if (origCol) {
-							AccumulateWeighted(vert_color, spatch.col[idx], fv);
+							vert_color += spatch.col[idx] * f;
 						}
 						if (origNrm) {
-							AccumulateWeighted(du, spatch.pos[idx], Vec4f::AssignToAll(u_derivs[ii] * v_weights[jj]));
-							AccumulateWeighted(dv, spatch.pos[idx], Vec4f::AssignToAll(u_weights[ii] * v_derivs[jj]));
+							du += spatch.pos[idx] * (u_derivs[ii] * v_weights[jj]);
+							dv += spatch.pos[idx] * (u_weights[ii] * v_derivs[jj]);
 						}
 					}
 				}
 			}
 			vert->pos = vert_pos;
 			if (origNrm) {
-				vert_nrm = Cross(du, dv);
-#ifdef _M_SSE
-				const __m128 normalize = SSENormalizeMultiplier(useSSE4, vert_nrm.vec);
-				vert_nrm.vec = _mm_mul_ps(vert_nrm.vec, normalize);
-#else
-				vert_nrm.Normalize();
-#endif
-				vert->nrm = vert_nrm;
+				vert->nrm = Cross(du, dv).Normalized(useSSE4);
 			} else {
 				vert->nrm.SetZero();
 				vert->nrm.z = 1.0f;
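The SplineCommon.cpp side of this change is a pure simplification: the weighted accumulation now reads vert_pos += spatch.pos[idx] * f instead of going through the removed AccumulateWeighted() helpers, relying on the float-specialized operators added to Math3D.h below to keep the same _mm_mul_ps/_mm_add_ps instruction shape. A minimal standalone sketch of that pattern (hypothetical V3 type, not PPSSPP code):

    #include <xmmintrin.h>

    struct V3 { __m128 vec; };  // x, y, z in the low three lanes

    // out += in * f: splat the scalar weight, then one multiply and one add,
    // the same sequence the Vec3<float> operator specializations produce.
    static inline void AccumulateWeighted(V3 &out, const V3 &in, float f) {
        out.vec = _mm_add_ps(out.vec, _mm_mul_ps(in.vec, _mm_set_ps1(f)));
    }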
diff --git a/GPU/Math3D.cpp b/GPU/Math3D.cpp
index f43f41ade606..c46a4e520d41 100644
--- a/GPU/Math3D.cpp
+++ b/GPU/Math3D.cpp
@@ -102,11 +102,49 @@ float Vec3<float>::Distance2To(Vec3<float> &other)
 	return Vec3<float>(other-(*this)).Length2();
 }
 
+#if defined(_M_SSE)
+__m128 SSENormalizeMultiplierSSE2(__m128 v)
+{
+	const __m128 sq = _mm_mul_ps(v, v);
+	const __m128 r2 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 1));
+	const __m128 r3 = _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0, 0, 0, 2));
+	const __m128 res = _mm_add_ss(r3, _mm_add_ss(r2, sq));
+
+	const __m128 rt = _mm_rsqrt_ss(res);
+	return _mm_shuffle_ps(rt, rt, _MM_SHUFFLE(0, 0, 0, 0));
+}
+
+#if _M_SSE >= 0x401
+__m128 SSENormalizeMultiplierSSE4(__m128 v)
+{
+	return _mm_rsqrt_ps(_mm_dp_ps(v, v, 0xFF));
+}
+
+__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
+{
+	if (useSSE4)
+		return SSENormalizeMultiplierSSE4(v);
+	return SSENormalizeMultiplierSSE2(v);
+}
+#else
+__m128 SSENormalizeMultiplier(bool useSSE4, __m128 v)
+{
+	return SSENormalizeMultiplierSSE2(v);
+}
+#endif
 template<>
-Vec3<float> Vec3<float>::Normalized() const
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
+{
+	const __m128 normalize = SSENormalizeMultiplier(useSSE4, vec);
+	return _mm_mul_ps(normalize, vec);
+}
+#else
+template<>
+Vec3<float> Vec3<float>::Normalized(bool useSSE4) const
 {
 	return (*this) / Length();
 }
+#endif
 
 template<>
 float Vec3<float>::Normalize()
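Both normalize paths above compute the scalar 1/sqrt(x*x + y*y + z*z) and splat it across all four lanes, so Normalized() finishes with a single _mm_mul_ps; the SSE4.1 variant folds the dot product into one _mm_dp_ps. Note that _mm_rsqrt_ss/_mm_rsqrt_ps are approximations with roughly 12 bits of precision, which the old SplineCommon.cpp code already accepted for vertex normals. If more accuracy were ever needed, one Newton-Raphson step would refine the estimate; a sketch of that refinement, not part of this patch:

    #include <xmmintrin.h>

    // One Newton-Raphson iteration on an rsqrt estimate r for d = dot(v, v):
    //   r' = r * (1.5 - 0.5 * d * r * r)
    static inline __m128 RefineRsqrt(__m128 d, __m128 r) {
        const __m128 rr = _mm_mul_ps(r, r);
        const __m128 half_d = _mm_mul_ps(_mm_set_ps1(0.5f), d);
        return _mm_mul_ps(r, _mm_sub_ps(_mm_set_ps1(1.5f), _mm_mul_ps(half_d, rr)));
    }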
diff --git a/GPU/Math3D.h b/GPU/Math3D.h
index 420de4e91aeb..04743436eaed 100644
--- a/GPU/Math3D.h
+++ b/GPU/Math3D.h
@@ -25,6 +25,9 @@
 
 #if defined(_M_SSE)
 #include <emmintrin.h>
+#if _M_SSE >= 0x401
+#include <smmintrin.h>
+#endif
 #endif
 
 namespace Math3D {
@@ -293,7 +296,7 @@ class Vec3
 	void SetLength(const float l);
 	Vec3 WithLength(const float l) const;
 	float Distance2To(Vec3 &other);
-	Vec3 Normalized() const;
+	Vec3 Normalized(bool useSSE4 = false) const;
 	float Normalize(); // returns the previous length, which is often useful
 
 	T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
@@ -1081,6 +1084,94 @@ __forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
 	*(u32 *)rgba = ToRGBA();
 }
 
+#if defined(_M_SSE)
+// Specialized for SIMD optimization
+
+// Vec2 operation
+template<>
+inline void Vec2<float>::operator += (const Vec2<float> &other)
+{
+	vec = _mm_add_ps(vec, other.vec);
+}
+
+template<>
+inline Vec2<float> Vec2<float>::operator + (const Vec2<float> &other) const
+{
+	return Vec2<float>(_mm_add_ps(vec, other.vec));
+}
+
+template<>
+inline Vec2<float> Vec2<float>::operator * (const Vec2<float> &other) const
+{
+	return Vec2<float>(_mm_mul_ps(vec, other.vec));
+}
+
+template<> template<>
+inline Vec2<float> Vec2<float>::operator * (const float &other) const
+{
+	return Vec2<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+}
+
+// Vec3 operation
+template<>
+inline void Vec3<float>::operator += (const Vec3<float> &other)
+{
+	vec = _mm_add_ps(vec, other.vec);
+}
+
+template<>
+inline Vec3<float> Vec3<float>::operator + (const Vec3<float> &other) const
+{
+	return Vec3<float>(_mm_add_ps(vec, other.vec));
+}
+
+template<>
+inline Vec3<float> Vec3<float>::operator * (const Vec3<float> &other) const
+{
+	return Vec3<float>(_mm_mul_ps(vec, other.vec));
+}
+
+template<> template<>
+inline Vec3<float> Vec3<float>::operator * (const float &other) const
+{
+	return Vec3<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+}
+
+// Vec4 operation
+template<>
+inline void Vec4<float>::operator += (const Vec4<float> &other)
+{
+	vec = _mm_add_ps(vec, other.vec);
+}
+
+template<>
+inline Vec4<float> Vec4<float>::operator + (const Vec4<float> &other) const
+{
+	return Vec4<float>(_mm_add_ps(vec, other.vec));
+}
+
+template<>
+inline Vec4<float> Vec4<float>::operator * (const Vec4<float> &other) const
+{
+	return Vec4<float>(_mm_mul_ps(vec, other.vec));
+}
+
+template<> template<>
+inline Vec4<float> Vec4<float>::operator * (const float &other) const
+{
+	return Vec4<float>(_mm_mul_ps(vec, _mm_set_ps1(other)));
+}
+
+// Vec3 cross product
+template<>
+inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
+{
+	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 1, 0, 2)));
+	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(a.vec, a.vec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(b.vec, b.vec, _MM_SHUFFLE(3, 0, 2, 1)));
+	return _mm_sub_ps(left, right);
+}
+#endif
+
 };  // namespace Math3D
 
 // linear interpolation via float: 0.0=begin, 1.0=end
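The Cross() specialization uses the standard two-shuffle formulation: left = a.yzx * b.zxy and right = a.zxy * b.yzx, so left - right equals (a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x), already in xyz order with no final shuffle needed. With these specializations in place, call sites such as the spline tessellator's Cross(du, dv).Normalized(useSSE4) stay generic while compiling down to shuffles, multiplies, and one rsqrt. A small usage sketch, assuming Math3D's Vec3f typedef and its component constructor:

    using namespace Math3D;

    Vec3f du(1.0f, 0.0f, 0.0f);
    Vec3f dv(0.0f, 1.0f, 0.0f);
    // Unit normal of the plane spanned by du and dv: (0, 0, 1).
    // Pass true to take the SSE4.1 _mm_dp_ps path on capable CPUs.
    Vec3f n = Cross(du, dv).Normalized();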