Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement vertex range culling #11393

Merged
merged 5 commits into from
Sep 18, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions GPU/Common/ShaderCommon.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,14 +87,15 @@ enum : uint64_t {

DIRTY_BEZIERSPLINE = 1ULL << 32,
DIRTY_TEXCLAMP = 1ULL << 33,
DIRTY_CULLRANGE = 1ULL << 34,

DIRTY_DEPAL = 1ULL << 34,
DIRTY_DEPAL = 1ULL << 35,

// space for 5 more uniform dirty flags. Remember to update DIRTY_ALL_UNIFORMS.

DIRTY_BONE_UNIFORMS = 0xFF000000ULL,

DIRTY_ALL_UNIFORMS = 0x7FFFFFFFFULL,
DIRTY_ALL_UNIFORMS = 0xFFFFFFFFFULL,
DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,

// Other dirty elements that aren't uniforms!
Expand Down
47 changes: 47 additions & 0 deletions GPU/Common/ShaderUniforms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,49 @@ static void ConvertProjMatrixToD3D11(Matrix4x4 &in) {
in.translateAndScale(trans, scale);
}

// Computes the vertex range culling bounds that are uploaded to the vertex shader.
//
// minValues / maxValues: output arrays of 4 floats each. On return:
//   [0..2] hold the sorted min / max bounds for x, y and z, obtained by mapping the
//          PSP's representable coordinate limits back through the current viewport.
//   minValues[3] is 1.0f when depth clamp is enabled in gstate, otherwise 0.0f.
//   maxValues[3] is NAN, the value a shader can write to w to discard a vertex.
// flipViewport: when true, the Y height scale is negated before it is applied
//   (used for D3D11 and GLES non-buffered).
// hasNegZ: when true, the Z bounds are returned as computed; when false they are
//   additionally remapped with (z * 0.5 + 0.5).
void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ) {
	// Account for the projection viewport adjustment when viewport is too large.
	auto reverseViewportX = [](float x) {
		float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
		return pspViewport * (1.0f / gstate_c.vpWidthScale);
	};
	auto reverseViewportY = [flipViewport](float y) {
		float heightScale = gstate_c.vpHeightScale;
		if (flipViewport) {
			// For D3D11 and GLES non-buffered.
			heightScale = -heightScale;
		}
		float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
		// Must use the local (possibly negated) scale; using gstate_c.vpHeightScale here
		// would silently ignore flipViewport.
		return pspViewport * (1.0f / heightScale);
	};
	auto reverseViewportZ = [hasNegZ](float z) {
		float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
		// Differs from GLES: depth is 0 to 1, not -1 to 1.
		float realViewport = (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale);
		return hasNegZ ? realViewport : (realViewport * 0.5f + 0.5f);
	};
	// Returns the two values ordered as (smaller, larger); a negative viewport scale
	// can reverse the order of the mapped limits.
	auto sortPair = [](float a, float b) {
		return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
	};

	// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
	// Any vertex outside this range (unless depth clamp enabled) is discarded.
	auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
	auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
	auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
	// Since we have space in w, use it to pass the depth clamp flag. We also pass NAN in
	// the max w slot so the shader has a value to assign to w when discarding a vertex.
	float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;

	minValues[0] = x.first;
	minValues[1] = y.first;
	minValues[2] = z.first;
	minValues[3] = clampEnable;
	maxValues[0] = x.second;
	maxValues[1] = y.second;
	maxValues[2] = z.second;
	maxValues[3] = NAN;
}

void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport) {
if (dirtyUniforms & DIRTY_TEXENV) {
Uint8x3ToFloat4(ub->texEnvColor, gstate.texenvcolor);
Expand Down Expand Up @@ -192,6 +235,10 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
ub->depthRange[3] = viewZInvScale;
}

if (dirtyUniforms & DIRTY_CULLRANGE) {
CalcCullRange(ub->cullRangeMin, ub->cullRangeMax, flipViewport, false);
}

if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v);
}
Expand Down
10 changes: 9 additions & 1 deletion GPU/Common/ShaderUniforms.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ enum : uint64_t {
};

// TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
// 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
// Every line here is a 4-float.
struct UB_VS_FS_Base {
float proj[16];
Expand All @@ -32,6 +32,8 @@ struct UB_VS_FS_Base {
float matAmbient[4];
uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt; // 4 params packed into one.
int pad2; int pad3;
float cullRangeMin[4];
float cullRangeMax[4];
// Fragment data
float fogColor[4];
float texEnvColor[4];
Expand All @@ -58,6 +60,8 @@ R"( mat4 proj_mtx;
uint depal_mask_shift_off_fmt;
int pad2;
int pad3;
vec4 cullRangeMin;
vec4 cullRangeMax;
vec3 fogcolor;
vec3 texenv;
ivec4 alphacolorref;
Expand All @@ -84,6 +88,8 @@ R"( float4x4 u_proj;
uint u_depal_mask_shift_off_fmt;
int pad2;
int pad3;
float4 u_cullRangeMin;
float4 u_cullRangeMax;
float3 u_fogcolor;
float3 u_texenv;
uint4 u_alphacolorref;
Expand Down Expand Up @@ -175,6 +181,8 @@ static const char *cb_vs_bonesStr =
R"( float4x3 u_bone[8];
)";

void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ);

void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport);
void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms);
void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms);
Expand Down
12 changes: 10 additions & 2 deletions GPU/Directx9/ShaderManagerDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/ShaderUniforms.h"
#include "GPU/Directx9/ShaderManagerDX9.h"
#include "GPU/Directx9/DrawEngineDX9.h"
#include "GPU/Directx9/FramebufferDX9.h"
Expand Down Expand Up @@ -314,7 +315,7 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) {
}

const uint64_t vsUniforms = DIRTY_PROJMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_WORLDMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX |
DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE |
DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE | DIRTY_CULLRANGE |
DIRTY_AMBIENT | DIRTY_MATAMBIENTALPHA | DIRTY_MATSPECULAR | DIRTY_MATDIFFUSE | DIRTY_MATEMISSIVE | DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3;

void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
Expand Down Expand Up @@ -425,7 +426,7 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
VSSetFloatArray(CONST_VS_UVSCALEOFFSET, uvscaleoff, 4);
}

if (dirtyUniforms & DIRTY_DEPTHRANGE) {
if (dirtyUniforms & DIRTY_DEPTHRANGE) {
// Depth is [0, 1] mapping to [minz, maxz], not too hard.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
Expand All @@ -447,6 +448,13 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
VSSetFloatUniform4(CONST_VS_DEPTHRANGE, data);
}
if (dirtyUniforms & DIRTY_CULLRANGE) {
float minValues[4], maxValues[4];
CalcCullRange(minValues, maxValues, false, false);
VSSetFloatUniform4(CONST_VS_CULLRANGEMIN, minValues);
VSSetFloatUniform4(CONST_VS_CULLRANGEMAX, maxValues);
}

// Lighting
if (dirtyUniforms & DIRTY_AMBIENT) {
VSSetColorUniform3Alpha(CONST_VS_AMBIENT, gstate.ambientcolor, gstate.getAmbientA());
Expand Down
37 changes: 27 additions & 10 deletions GPU/Directx9/VertexShaderGeneratorDX9.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,10 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
if (!isModeThrough && gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, "float4 u_depthRange : register(c%i);\n", CONST_VS_DEPTHRANGE);
}
if (!isModeThrough) {
WRITE(p, "float4 u_cullRangeMin : register(c%i);\n", CONST_VS_CULLRANGEMIN);
WRITE(p, "float4 u_cullRangeMax : register(c%i);\n", CONST_VS_CULLRANGEMAX);
}
} else {
WRITE(p, "cbuffer base : register(b0) {\n%s};\n", cb_baseStr);
WRITE(p, "cbuffer lights: register(b1) {\n%s};\n", cb_vs_lightsStr);
Expand Down Expand Up @@ -370,22 +374,22 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
}
if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
if (isModeThrough) {
WRITE(p, " Out.gl_Position = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
WRITE(p, " float4 outPos = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
} else {
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
} else {
WRITE(p, " Out.gl_Position = mul(u_proj, float4(In.position.xyz, 1.0));\n");
WRITE(p, " float4 outPos = mul(u_proj, float4(In.position.xyz, 1.0));\n");
}
}
} else {
if (isModeThrough) {
WRITE(p, " Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
WRITE(p, " float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
} else {
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
} else {
WRITE(p, " Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj);\n");
WRITE(p, " float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj);\n");
}
}
}
Expand Down Expand Up @@ -577,16 +581,16 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(u_proj, viewPos));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(u_proj, viewPos));\n");
} else {
WRITE(p, " Out.gl_Position = mul(u_proj, viewPos);\n");
WRITE(p, " float4 outPos = mul(u_proj, viewPos);\n");
}
} else {
// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " Out.gl_Position = depthRoundZVP(mul(viewPos, u_proj));\n");
WRITE(p, " float4 outPos = depthRoundZVP(mul(viewPos, u_proj));\n");
} else {
WRITE(p, " Out.gl_Position = mul(viewPos, u_proj);\n");
WRITE(p, " float4 outPos = mul(viewPos, u_proj);\n");
}
}

Expand Down Expand Up @@ -811,6 +815,19 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
}
}

if (!isModeThrough) {
WRITE(p, " float3 projPos = outPos.xyz / outPos.w;\n");
// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
WRITE(p, " if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
WRITE(p, " if (%s || %s) {\n", outMin, outMax);
WRITE(p, " outPos.w = u_cullRangeMax.w;\n");
WRITE(p, " }\n");
WRITE(p, " }\n");
}
WRITE(p, " Out.gl_Position = outPos;\n");

WRITE(p, " return Out;\n");
WRITE(p, "}\n");
}
Expand Down
2 changes: 2 additions & 0 deletions GPU/Directx9/VertexShaderGeneratorDX9.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace DX9 {
CONST_VS_BONE6 = 71,
CONST_VS_BONE7 = 74,
CONST_VS_BONE8 = 77,
CONST_VS_CULLRANGEMIN = 80,
CONST_VS_CULLRANGEMAX = 81,
};

};
13 changes: 11 additions & 2 deletions GPU/GLES/ShaderManagerGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,9 +42,10 @@
#include "GPU/Math3D.h"
#include "GPU/GPUState.h"
#include "GPU/ge_constants.h"
#include "GPU/Common/ShaderUniforms.h"
#include "GPU/GLES/ShaderManagerGLES.h"
#include "GPU/GLES/DrawEngineGLES.h"
#include "FramebufferManagerGLES.h"
#include "GPU/GLES/FramebufferManagerGLES.h"

Shader::Shader(GLRenderManager *render, const char *code, const std::string &desc, uint32_t glShaderType, bool useHWTransform, uint32_t attrMask, uint64_t uniformMask)
: render_(render), failed_(false), useHWTransform_(useHWTransform), attrMask_(attrMask), uniformMask_(uniformMask) {
Expand Down Expand Up @@ -116,6 +117,8 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
else
numBones = 0;
queries.push_back({ &u_depthRange, "u_depthRange" });
queries.push_back({ &u_cullRangeMin, "u_cullRangeMin" });
queries.push_back({ &u_cullRangeMax, "u_cullRangeMax" });

#ifdef USE_BONE_ARRAY
queries.push_back({ &u_bone, "u_bone" });
Expand Down Expand Up @@ -455,7 +458,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
if (dirty & DIRTY_TEXMATRIX) {
SetMatrix4x3(render_, &u_texmtx, gstate.tgenMatrix);
}
if ((dirty & DIRTY_DEPTHRANGE) && u_depthRange != -1) {
if (dirty & DIRTY_DEPTHRANGE) {
// Since depth is [-1, 1] mapping to [minz, maxz], this is easyish.
float vpZScale = gstate.getViewportZScale();
float vpZCenter = gstate.getViewportZCenter();
Expand All @@ -481,6 +484,12 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
SetFloatUniform4(render_, &u_depthRange, data);
}
if (dirty & DIRTY_CULLRANGE) {
float minValues[4], maxValues[4];
CalcCullRange(minValues, maxValues, g_Config.iRenderingMode == FB_NON_BUFFERED_MODE, true);
SetFloatUniform4(render_, &u_cullRangeMin, minValues);
SetFloatUniform4(render_, &u_cullRangeMax, maxValues);
}

if (dirty & DIRTY_STENCILREPLACEVALUE) {
float f = (float)gstate.getStencilTestRef() * (1.0f / 255.0f);
Expand Down
2 changes: 2 additions & 0 deletions GPU/GLES/ShaderManagerGLES.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ class LinkedShader {
int u_texmtx;
int u_world;
int u_depthRange; // x,y = viewport xscale/xcenter. z,w=clipping minz/maxz (?)
int u_cullRangeMin;
int u_cullRangeMax;

#ifdef USE_BONE_ARRAY
int u_bone; // array, size is numBones
Expand Down
34 changes: 29 additions & 5 deletions GPU/GLES/VertexShaderGeneratorGLES.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ enum DoLightComputation {
//
// Now, the regular machinery will take over and do the calculation again.
//
// Depth is not clipped to the viewport, but does clip to "minz" and "maxz". It may also be clamped
// to 0 and 65535 if a depth clamping/clipping flag is set (x/y clipping is performed only if depth
// needs to be clamped.)
//
// All this above is for full transform mode.
// In through mode, the Z coordinate just goes straight through and there is no perspective division.
// We simulate this of course with pretty much an identity matrix. Rounding Z becomes very easy.
Expand Down Expand Up @@ -335,6 +339,12 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
*uniformMask |= DIRTY_DEPTHRANGE;
}

if (!isModeThrough) {
WRITE(p, "uniform highp vec4 u_cullRangeMin;\n");
WRITE(p, "uniform highp vec4 u_cullRangeMax;\n");
*uniformMask |= DIRTY_CULLRANGE;
}

WRITE(p, "%s%s lowp vec4 v_color0;\n", shading, varying);
if (lmode) {
WRITE(p, "%s%s lowp vec3 v_color1;\n", shading, varying);
Expand Down Expand Up @@ -472,13 +482,13 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
WRITE(p, " v_fogdepth = position.w;\n");
}
if (isModeThrough) {
WRITE(p, " gl_Position = u_proj_through * vec4(position.xyz, 1.0);\n");
WRITE(p, " vec4 outPos = u_proj_through * vec4(position.xyz, 1.0);\n");
} else {
// The viewport is used in this case, so need to compensate for that.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " gl_Position = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
WRITE(p, " vec4 outPos = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
} else {
WRITE(p, " gl_Position = u_proj * vec4(position.xyz, 1.0);\n");
WRITE(p, " vec4 outPos = u_proj * vec4(position.xyz, 1.0);\n");
}
}
} else {
Expand Down Expand Up @@ -671,9 +681,9 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,

// Final view and projection transforms.
if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
WRITE(p, " gl_Position = depthRoundZVP(u_proj * viewPos);\n");
WRITE(p, " vec4 outPos = depthRoundZVP(u_proj * viewPos);\n");
} else {
WRITE(p, " gl_Position = u_proj * viewPos;\n");
WRITE(p, " vec4 outPos = u_proj * viewPos;\n");
}

// TODO: Declare variables for dots for shade mapping if needed.
Expand Down Expand Up @@ -898,5 +908,19 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
if (enableFog)
WRITE(p, " v_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n");
}

if (!isModeThrough) {
WRITE(p, " vec3 projPos = outPos.xyz / outPos.w;\n");
// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
WRITE(p, " if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
WRITE(p, " if (%s || %s) {\n", outMin, outMax);
WRITE(p, " outPos.w = u_cullRangeMax.w;\n");
WRITE(p, " }\n");
WRITE(p, " }\n");
}
WRITE(p, " gl_Position = outPos;\n");

WRITE(p, "}\n");
}
Loading