From ab3a466621cdd68c51e5d12fefad6a1be3c3dce9 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 16 Sep 2018 22:52:43 -0700
Subject: [PATCH 1/5] GLES: Implement vertex range culling.

Based on tests, skips triangles with any point outside the 4096x4096 box,
except when depth clamping would engage.
---
 GPU/Common/ShaderCommon.h              |  5 ++--
 GPU/GLES/ShaderManagerGLES.cpp         | 38 ++++++++++++++++++++++++++
 GPU/GLES/ShaderManagerGLES.h           |  2 ++
 GPU/GLES/VertexShaderGeneratorGLES.cpp | 30 ++++++++++++++++----
 GPU/GPUCommon.cpp                      | 14 +++++-----
 5 files changed, 75 insertions(+), 14 deletions(-)

diff --git a/GPU/Common/ShaderCommon.h b/GPU/Common/ShaderCommon.h
index c26185885435..1ef4ba51e4ef 100644
--- a/GPU/Common/ShaderCommon.h
+++ b/GPU/Common/ShaderCommon.h
@@ -87,14 +87,15 @@ enum : uint64_t {
 
 	DIRTY_BEZIERSPLINE = 1ULL << 32,
 	DIRTY_TEXCLAMP = 1ULL << 33,
+	DIRTY_CULLRANGE = 1ULL << 34,
 
-	DIRTY_DEPAL = 1ULL << 34,
+	DIRTY_DEPAL = 1ULL << 35,
 
 	// space for 5 more uniform dirty flags. Remember to update DIRTY_ALL_UNIFORMS.
 
 	DIRTY_BONE_UNIFORMS = 0xFF000000ULL,
 
-	DIRTY_ALL_UNIFORMS = 0x7FFFFFFFFULL,
+	DIRTY_ALL_UNIFORMS = 0xFFFFFFFFFULL,
 	DIRTY_ALL_LIGHTS = DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3,
 
 	// Other dirty elements that aren't uniforms!
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index 484819332e6b..0f957329c379 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -116,6 +116,8 @@ LinkedShader::LinkedShader(GLRenderManager *render, VShaderID VSID, Shader *vs,
 	else
 		numBones = 0;
 	queries.push_back({ &u_depthRange, "u_depthRange" });
+	queries.push_back({ &u_cullRangeMin, "u_cullRangeMin" });
+	queries.push_back({ &u_cullRangeMax, "u_cullRangeMax" });
 
 #ifdef USE_BONE_ARRAY
 	queries.push_back({ &u_bone, "u_bone" });
@@ -481,6 +483,42 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
 		float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
 		SetFloatUniform4(render_, &u_depthRange, data);
 	}
+	if (dirty & DIRTY_CULLRANGE) {
+		// Account for the projection viewport adjustment when viewport is too large.
+		auto reverseViewportX = [](float x) {
+			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
+			return (pspViewport - gstate_c.vpXOffset) * (1.0f / gstate_c.vpWidthScale);
+		};
+		auto reverseViewportY = [](float y) {
+			float heightScale = gstate_c.vpHeightScale;
+			if (g_Config.iRenderingMode == FB_NON_BUFFERED_MODE) {
+				// GL upside down is a pain as usual.
+				heightScale = -heightScale;
+			}
+			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
+			return (pspViewport - gstate_c.vpYOffset) * (1.0f / heightScale);
+		};
+		auto reverseViewportZ = [](float z) {
+			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
+			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale);
+		};
+		auto sortPair = [](float a, float b) {
+			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
+		};
+
+		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
+		// Any vertex outside this range (unless depth clamp enabled) is discarded.
+		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
+		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
+		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
+		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
+
+		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		SetFloatUniform4(render_, &u_cullRangeMin, minValues);
+		float maxValues[4]{ x.second, y.second, z.second, NAN };
+		SetFloatUniform4(render_, &u_cullRangeMax, maxValues);
+	}
 
 	if (dirty & DIRTY_STENCILREPLACEVALUE) {
 		float f = (float)gstate.getStencilTestRef() * (1.0f / 255.0f);
diff --git a/GPU/GLES/ShaderManagerGLES.h b/GPU/GLES/ShaderManagerGLES.h
index fdcd2ce98b0d..5dbfe43b4e25 100644
--- a/GPU/GLES/ShaderManagerGLES.h
+++ b/GPU/GLES/ShaderManagerGLES.h
@@ -71,6 +71,8 @@ class LinkedShader {
 	int u_texmtx;
 	int u_world;
 	int u_depthRange;   // x,y = viewport xscale/xcenter. z,w=clipping minz/maxz (?)
+	int u_cullRangeMin;
+	int u_cullRangeMax;
 
 #ifdef USE_BONE_ARRAY
 	int u_bone;  // array, size is numBones
diff --git a/GPU/GLES/VertexShaderGeneratorGLES.cpp b/GPU/GLES/VertexShaderGeneratorGLES.cpp
index c3a6427d3b7d..bc6291a73c1e 100644
--- a/GPU/GLES/VertexShaderGeneratorGLES.cpp
+++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp
@@ -335,6 +335,12 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 		*uniformMask |= DIRTY_DEPTHRANGE;
 	}
 
+	if (!isModeThrough) {
+		WRITE(p, "uniform highp vec4 u_cullRangeMin;\n");
+		WRITE(p, "uniform highp vec4 u_cullRangeMax;\n");
+		*uniformMask |= DIRTY_CULLRANGE;
+	}
+
 	WRITE(p, "%s%s lowp vec4 v_color0;\n", shading, varying);
 	if (lmode) {
 		WRITE(p, "%s%s lowp vec3 v_color1;\n", shading, varying);
@@ -472,13 +478,13 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 			WRITE(p, "  v_fogdepth = position.w;\n");
 		}
 		if (isModeThrough)	{
-			WRITE(p, "  gl_Position = u_proj_through * vec4(position.xyz, 1.0);\n");
+			WRITE(p, "  vec4 outPos = u_proj_through * vec4(position.xyz, 1.0);\n");
 		} else {
 			// The viewport is used in this case, so need to compensate for that.
 			if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "  gl_Position = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
+				WRITE(p, "  vec4 outPos = depthRoundZVP(u_proj * vec4(position.xyz, 1.0));\n");
 			} else {
-				WRITE(p, "  gl_Position = u_proj * vec4(position.xyz, 1.0);\n");
+				WRITE(p, "  vec4 outPos = u_proj * vec4(position.xyz, 1.0);\n");
 			}
 		}
 	} else {
@@ -671,9 +677,9 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 
 		// Final view and projection transforms.
 		if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-			WRITE(p, "  gl_Position = depthRoundZVP(u_proj * viewPos);\n");
+			WRITE(p, "  vec4 outPos = depthRoundZVP(u_proj * viewPos);\n");
 		} else {
-			WRITE(p, "  gl_Position = u_proj * viewPos;\n");
+			WRITE(p, "  vec4 outPos = u_proj * viewPos;\n");
 		}
 
 		// TODO: Declare variables for dots for shade mapping if needed.
@@ -898,5 +904,19 @@ void GenerateVertexShader(const VShaderID &id, char *buffer, uint32_t *attrMask,
 		if (enableFog)
 			WRITE(p, "  v_fogdepth = (viewPos.z + u_fogcoef.x) * u_fogcoef.y;\n");
 	}
+
+	if (!isModeThrough) {
+		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
+		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
+		WRITE(p, "  if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
+		const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
+		const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
+		WRITE(p, "    if (%s || %s) {\n", outMin, outMax);
+		WRITE(p, "      outPos.w = u_cullRangeMax.w;\n");
+		WRITE(p, "    }\n");
+		WRITE(p, "  }\n");
+	}
+	WRITE(p, "  gl_Position = outPos;\n");
+
 	WRITE(p, "}\n");
 }
diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index fed71166aed3..ad9d0458cb05 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -188,13 +188,13 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	// Viewport.
 	{ GE_CMD_OFFSETX, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE },
 	{ GE_CMD_OFFSETY, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTXSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTYSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTXCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTZSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_VIEWPORTZCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_DEPTHCLAMPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_RASTER_STATE },
+	{ GE_CMD_VIEWPORTXSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_VIEWPORTYSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_VIEWPORTXCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_VIEWPORTYCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_VIEWPORTZSCALE, FLAG_FLUSHBEFOREONCHANGE,  DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_VIEWPORTZCENTER, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_CULLRANGE | DIRTY_DEPTHRANGE | DIRTY_PROJMATRIX | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_DEPTHCLAMPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_RASTER_STATE },
 
 	// Z clip
 	{ GE_CMD_MINZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_VIEWPORTSCISSOR_STATE },

From 639a3f406d3a4b04c6b8119c30a7a3fd2b15c66e Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 16 Sep 2018 23:40:30 -0700
Subject: [PATCH 2/5] D3D9: Implement vertex range culling.

---
 GPU/Directx9/ShaderManagerDX9.cpp         | 43 +++++++++++++++++++++--
 GPU/Directx9/VertexShaderGeneratorDX9.cpp | 37 +++++++++++++------
 GPU/Directx9/VertexShaderGeneratorDX9.h   |  2 ++
 GPU/GLES/ShaderManagerGLES.cpp            |  2 +-
 GPU/GLES/VertexShaderGeneratorGLES.cpp    |  4 +++
 5 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/GPU/Directx9/ShaderManagerDX9.cpp b/GPU/Directx9/ShaderManagerDX9.cpp
index a10e56603f06..1b90a7d62152 100644
--- a/GPU/Directx9/ShaderManagerDX9.cpp
+++ b/GPU/Directx9/ShaderManagerDX9.cpp
@@ -314,7 +314,7 @@ void ShaderManagerDX9::PSUpdateUniforms(u64 dirtyUniforms) {
 }
 
 const uint64_t vsUniforms = DIRTY_PROJMATRIX | DIRTY_PROJTHROUGHMATRIX | DIRTY_WORLDMATRIX | DIRTY_VIEWMATRIX | DIRTY_TEXMATRIX |
-DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE |
+DIRTY_FOGCOEF | DIRTY_BONE_UNIFORMS | DIRTY_UVSCALEOFFSET | DIRTY_DEPTHRANGE | DIRTY_CULLRANGE |
 DIRTY_AMBIENT | DIRTY_MATAMBIENTALPHA | DIRTY_MATSPECULAR | DIRTY_MATDIFFUSE | DIRTY_MATEMISSIVE | DIRTY_LIGHT0 | DIRTY_LIGHT1 | DIRTY_LIGHT2 | DIRTY_LIGHT3;
 
 void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
@@ -425,7 +425,7 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 		VSSetFloatArray(CONST_VS_UVSCALEOFFSET, uvscaleoff, 4);
 	}
 
-	if (dirtyUniforms & DIRTY_DEPTHRANGE)	{
+	if (dirtyUniforms & DIRTY_DEPTHRANGE) {
 		// Depth is [0, 1] mapping to [minz, maxz], not too hard.
 		float vpZScale = gstate.getViewportZScale();
 		float vpZCenter = gstate.getViewportZCenter();
@@ -447,6 +447,45 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 		float data[4] = { viewZScale, viewZCenter, viewZCenter, viewZInvScale };
 		VSSetFloatUniform4(CONST_VS_DEPTHRANGE, data);
 	}
+	if (dirtyUniforms & DIRTY_CULLRANGE) {
+		// Account for the projection viewport adjustment when viewport is too large.
+		auto reverseViewportX = [](float x) {
+			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
+			return (pspViewport - gstate_c.vpXOffset) * (1.0f / gstate_c.vpWidthScale);
+		};
+		auto reverseViewportY = [](float y) {
+			float yOffset = gstate_c.vpYOffset;
+			if (g_Config.iRenderingMode == FB_NON_BUFFERED_MODE) {
+				// GL upside down is a pain as usual.
+				// TODO: Is this right?
+				yOffset = -yOffset;
+			}
+			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
+			return (pspViewport - yOffset) * (1.0f / gstate_c.vpHeightScale);
+		};
+		auto reverseViewportZ = [](float z) {
+			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
+			// Differs from GLES: depth is 0 to 1, not -1 to 1.
+			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale) * 0.5f + 0.5f;
+		};
+		auto sortPair = [](float a, float b) {
+			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
+		};
+
+		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
+		// Any vertex outside this range (unless depth clamp enabled) is discarded.
+		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
+		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
+		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
+		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
+
+		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		VSSetFloatUniform4(CONST_VS_CULLRANGEMIN, minValues);
+		float maxValues[4]{ x.second, y.second, z.second, NAN };
+		VSSetFloatUniform4(CONST_VS_CULLRANGEMAX, maxValues);
+	}
+
 	// Lighting
 	if (dirtyUniforms & DIRTY_AMBIENT) {
 		VSSetColorUniform3Alpha(CONST_VS_AMBIENT, gstate.ambientcolor, gstate.getAmbientA());
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
index 056bcc158093..8cebd2dfdc15 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@@ -176,6 +176,10 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		if (!isModeThrough && gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
 			WRITE(p, "float4 u_depthRange : register(c%i);\n", CONST_VS_DEPTHRANGE);
 		}
+		if (!isModeThrough) {
+			WRITE(p, "float4 u_cullRangeMin : register(c%i);\n", CONST_VS_CULLRANGEMIN);
+			WRITE(p, "float4 u_cullRangeMax : register(c%i);\n", CONST_VS_CULLRANGEMAX);
+		}
 	} else {
 		WRITE(p, "cbuffer base : register(b0) {\n%s};\n", cb_baseStr);
 		WRITE(p, "cbuffer lights: register(b1) {\n%s};\n", cb_vs_lightsStr);
@@ -370,22 +374,22 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		}
 		if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
 			if (isModeThrough) {
-				WRITE(p, "  Out.gl_Position = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
+				WRITE(p, "  float4 outPos = mul(u_proj_through, float4(In.position.xyz, 1.0));\n");
 			} else {
 				if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-					WRITE(p, "  Out.gl_Position = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
+					WRITE(p, "  float4 outPos = depthRoundZVP(mul(u_proj, float4(In.position.xyz, 1.0)));\n");
 				} else {
-					WRITE(p, "  Out.gl_Position = mul(u_proj, float4(In.position.xyz, 1.0));\n");
+					WRITE(p, "  float4 outPos = mul(u_proj, float4(In.position.xyz, 1.0));\n");
 				}
 			}
 		} else {
 			if (isModeThrough) {
-				WRITE(p, "  Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
+				WRITE(p, "  float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj_through);\n");
 			} else {
 				if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-					WRITE(p, "  Out.gl_Position = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
+					WRITE(p, "  float4 outPos = depthRoundZVP(mul(float4(In.position.xyz, 1.0), u_proj));\n");
 				} else {
-					WRITE(p, "  Out.gl_Position = mul(float4(In.position.xyz, 1.0), u_proj);\n");
+					WRITE(p, "  float4 outPos = mul(float4(In.position.xyz, 1.0), u_proj);\n");
 				}
 			}
 		}
@@ -577,16 +581,16 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		if (lang == HLSL_D3D11 || lang == HLSL_D3D11_LEVEL9) {
 			// Final view and projection transforms.
 			if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "  Out.gl_Position = depthRoundZVP(mul(u_proj, viewPos));\n");
+				WRITE(p, "  float4 outPos = depthRoundZVP(mul(u_proj, viewPos));\n");
 			} else {
-				WRITE(p, "  Out.gl_Position = mul(u_proj, viewPos);\n");
+				WRITE(p, "  float4 outPos = mul(u_proj, viewPos);\n");
 			}
 		} else {
 			// Final view and projection transforms.
 			if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "  Out.gl_Position = depthRoundZVP(mul(viewPos, u_proj));\n");
+				WRITE(p, "  float4 outPos = depthRoundZVP(mul(viewPos, u_proj));\n");
 			} else {
-				WRITE(p, "  Out.gl_Position = mul(viewPos, u_proj);\n");
+				WRITE(p, "  float4 outPos = mul(viewPos, u_proj);\n");
 			}
 		}
 
@@ -811,6 +815,19 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		}
 	}
 
+	if (lang == HLSL_DX9 && !isModeThrough) {
+		WRITE(p, "  float3 projPos = outPos.xyz / outPos.w;\n");
+		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
+		WRITE(p, "  if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
+		const char *outMin = "projPos.x < u_cullRangeMin.x || projPos.y < u_cullRangeMin.y || projPos.z < u_cullRangeMin.z";
+		const char *outMax = "projPos.x > u_cullRangeMax.x || projPos.y > u_cullRangeMax.y || projPos.z > u_cullRangeMax.z";
+		WRITE(p, "    if (%s || %s) {\n", outMin, outMax);
+		WRITE(p, "      outPos.w = u_cullRangeMax.w;\n");
+		WRITE(p, "    }\n");
+		WRITE(p, "  }\n");
+	}
+	WRITE(p, "  Out.gl_Position = outPos;\n");
+
 	WRITE(p, "  return Out;\n");
 	WRITE(p, "}\n");
 }
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.h b/GPU/Directx9/VertexShaderGeneratorDX9.h
index e33567992cc9..aecc113ea367 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.h
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.h
@@ -53,6 +53,8 @@ namespace DX9 {
 		CONST_VS_BONE6 = 71,
 		CONST_VS_BONE7 = 74,
 		CONST_VS_BONE8 = 77,
+		CONST_VS_CULLRANGEMIN = 80,
+		CONST_VS_CULLRANGEMAX = 81,
 	};
 
 };
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index 0f957329c379..2ae8c1376b46 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -457,7 +457,7 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
 	if (dirty & DIRTY_TEXMATRIX) {
 		SetMatrix4x3(render_, &u_texmtx, gstate.tgenMatrix);
 	}
-	if ((dirty & DIRTY_DEPTHRANGE) && u_depthRange != -1) {
+	if (dirty & DIRTY_DEPTHRANGE) {
 		// Since depth is [-1, 1] mapping to [minz, maxz], this is easyish.
 		float vpZScale = gstate.getViewportZScale();
 		float vpZCenter = gstate.getViewportZCenter();
diff --git a/GPU/GLES/VertexShaderGeneratorGLES.cpp b/GPU/GLES/VertexShaderGeneratorGLES.cpp
index bc6291a73c1e..c09f879a55b9 100644
--- a/GPU/GLES/VertexShaderGeneratorGLES.cpp
+++ b/GPU/GLES/VertexShaderGeneratorGLES.cpp
@@ -87,6 +87,10 @@ enum DoLightComputation {
 //
 // Now, the regular machinery will take over and do the calculation again.
 //
+// Depth is not clipped to the viewport, but does clip to "minz" and "maxz".  It may also be clamped
+// to 0 and 65535 if a depth clamping/clipping flag is set (x/y clipping is performed only if depth
+// needs to be clamped.)
+//
 // All this above is for full transform mode.
 // In through mode, the Z coordinate just goes straight through and there is no perspective division.
 // We simulate this of course with pretty much an identity matrix. Rounding Z becomes very easy.

From 44ba31fbc67198086c3a65cb3b6ef88872ee5c92 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Sun, 16 Sep 2018 23:57:20 -0700
Subject: [PATCH 3/5] Vulkan: Implement vertex range culling.

Also D3D11, since they are very similar.
---
 GPU/Common/ShaderUniforms.cpp              | 38 ++++++++++++++++++++++
 GPU/Common/ShaderUniforms.h                |  8 ++++-
 GPU/Directx9/VertexShaderGeneratorDX9.cpp  |  2 +-
 GPU/Vulkan/VertexShaderGeneratorVulkan.cpp | 24 +++++++++++---
 4 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp
index e2dee45c2826..5489df59b6e8 100644
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@@ -192,6 +192,44 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 		ub->depthRange[3] = viewZInvScale;
 	}
 
+	if (dirtyUniforms & DIRTY_CULLRANGE) {
+		// Account for the projection viewport adjustment when viewport is too large.
+		auto reverseViewportX = [](float x) {
+			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
+			return pspViewport * (1.0f / gstate_c.vpWidthScale);
+		};
+		auto reverseViewportY = [flipViewport](float y) {
+			float heightScale = gstate_c.vpHeightScale;
+			if (flipViewport) {
+				// For D3D11.
+				heightScale = -heightScale;
+			}
+			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
+			return pspViewport * (1.0f / gstate_c.vpHeightScale);
+		};
+		auto reverseViewportZ = [](float z) {
+			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
+			// Differs from GLES: depth is 0 to 1, not -1 to 1.
+			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale) * 0.5f + 0.5f;
+		};
+		auto sortPair = [](float a, float b) {
+			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
+		};
+
+		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
+		// Any vertex outside this range (unless depth clamp enabled) is discarded.
+		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
+		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
+		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
+		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
+
+		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		memcpy(ub->cullRangeMin, minValues, sizeof(ub->cullRangeMin));
+		float maxValues[4]{ x.second, y.second, z.second, NAN };
+		memcpy(ub->cullRangeMax, maxValues, sizeof(ub->cullRangeMax));
+	}
+
 	if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
 		ub->spline_counts = BytesToUint32(gstate_c.spline_count_u, gstate_c.spline_count_v, gstate_c.spline_type_u, gstate_c.spline_type_v);
 	}
diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h
index dd1e659604b4..5c5c29f16904 100644
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@@ -18,7 +18,7 @@ enum : uint64_t {
 };
 
 // TODO: Split into two structs, one for software transform and one for hardware transform, to save space.
-// 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
+// Currently 512 bytes. Probably can't get to 256 (nVidia's UBO alignment).
 // Every line here is a 4-float.
 struct UB_VS_FS_Base {
 	float proj[16];
@@ -32,6 +32,8 @@ struct UB_VS_FS_Base {
 	float matAmbient[4];
 	uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt;  // 4 params packed into one.
 	int pad2; int pad3;
+	float cullRangeMin[4];
+	float cullRangeMax[4];
 	// Fragment data
 	float fogColor[4];
 	float texEnvColor[4];
@@ -58,6 +60,8 @@ R"(  mat4 proj_mtx;
   uint depal_mask_shift_off_fmt;
   int pad2;
   int pad3;
+  vec4 cullRangeMin;
+  vec4 cullRangeMax;
   vec3 fogcolor;
   vec3 texenv;
   ivec4 alphacolorref;
@@ -84,6 +88,8 @@ R"(  float4x4 u_proj;
   uint u_depal_mask_shift_off_fmt;
   int pad2;
   int pad3;
+  float4 u_cullRangeMin;
+  float4 u_cullRangeMax;
   float3 u_fogcolor;
   float3 u_texenv;
   uint4 u_alphacolorref;
diff --git a/GPU/Directx9/VertexShaderGeneratorDX9.cpp b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
index 8cebd2dfdc15..1cd0f383c4fa 100644
--- a/GPU/Directx9/VertexShaderGeneratorDX9.cpp
+++ b/GPU/Directx9/VertexShaderGeneratorDX9.cpp
@@ -815,7 +815,7 @@ void GenerateVertexShaderHLSL(const VShaderID &id, char *buffer, ShaderLanguage
 		}
 	}
 
-	if (lang == HLSL_DX9 && !isModeThrough) {
+	if (!isModeThrough) {
 		WRITE(p, "  float3 projPos = outPos.xyz / outPos.w;\n");
 		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
 		WRITE(p, "  if (u_cullRangeMin.w <= 0.0f || (projPos.z >= u_cullRangeMin.z && projPos.z <= u_cullRangeMax.z)) {\n");
diff --git a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
index e901c4390043..c180631e9e27 100644
--- a/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
+++ b/GPU/Vulkan/VertexShaderGeneratorVulkan.cpp
@@ -317,13 +317,13 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 			WRITE(p, "  v_fogdepth = position.w;\n");
 		}
 		if (isModeThrough) {
-			WRITE(p, "  gl_Position = base.proj_through_mtx * vec4(position.xyz, 1.0);\n");
+			WRITE(p, "  vec4 outPos = base.proj_through_mtx * vec4(position.xyz, 1.0);\n");
 		} else {
 			// The viewport is used in this case, so need to compensate for that.
 			if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-				WRITE(p, "  gl_Position = depthRoundZVP(base.proj_mtx * vec4(position.xyz, 1.0));\n");
+				WRITE(p, "  vec4 outPos = depthRoundZVP(base.proj_mtx * vec4(position.xyz, 1.0));\n");
 			} else {
-				WRITE(p, "  gl_Position = base.proj_mtx * vec4(position.xyz, 1.0);\n");
+				WRITE(p, "  vec4 outPos = base.proj_mtx * vec4(position.xyz, 1.0);\n");
 			}
 		}
 	} else {
@@ -472,9 +472,9 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 
 		// Final view and projection transforms.
 		if (gstate_c.Supports(GPU_ROUND_DEPTH_TO_16BIT)) {
-			WRITE(p, "  gl_Position = depthRoundZVP(base.proj_mtx * viewPos);\n");
+			WRITE(p, "  vec4 outPos = depthRoundZVP(base.proj_mtx * viewPos);\n");
 		} else {
-			WRITE(p, "  gl_Position = base.proj_mtx * viewPos;\n");
+			WRITE(p, "  vec4 outPos = base.proj_mtx * viewPos;\n");
 		}
 
 		// TODO: Declare variables for dots for shade mapping if needed.
@@ -694,6 +694,20 @@ bool GenerateVulkanGLSLVertexShader(const VShaderID &id, char *buffer) {
 		if (enableFog)
 			WRITE(p, "  v_fogdepth = (viewPos.z + base.fogcoef.x) * base.fogcoef.y;\n");
 	}
+
+	if (!isModeThrough) {
+		WRITE(p, "  vec3 projPos = outPos.xyz / outPos.w;\n");
+		// Vertex range culling doesn't happen when depth is clamped, so only do this if in range.
+		WRITE(p, "  if (base.cullRangeMin.w <= 0.0f || (projPos.z >= base.cullRangeMin.z && projPos.z <= base.cullRangeMax.z)) {\n");
+		const char *outMin = "projPos.x < base.cullRangeMin.x || projPos.y < base.cullRangeMin.y || projPos.z < base.cullRangeMin.z";
+		const char *outMax = "projPos.x > base.cullRangeMax.x || projPos.y > base.cullRangeMax.y || projPos.z > base.cullRangeMax.z";
+		WRITE(p, "    if (%s || %s) {\n", outMin, outMax);
+		WRITE(p, "      outPos.w = base.cullRangeMax.w;\n");
+		WRITE(p, "    }\n");
+		WRITE(p, "  }\n");
+	}
+	WRITE(p, "  gl_Position = outPos;\n");
+
 	WRITE(p, "}\n");
 	return true;
 }

From 985982764502c29591c3e006078357f0a3448663 Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Mon, 17 Sep 2018 21:43:29 -0700
Subject: [PATCH 4/5] Vulkan: Avoid depth clamp with clip range.

Vulkan clamps to the clip range, not the full range.  So when clipping, we
don't really want to clamp at all.  Unfortunately, when one side is
clipping, we can't do it exactly right.

But many games clip depth, like Dissidia.  Fixes #11260.
---
 GPU/GPUCommon.cpp                 |  4 ++--
 GPU/Vulkan/StateMappingVulkan.cpp | 15 +++++++++++----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/GPU/GPUCommon.cpp b/GPU/GPUCommon.cpp
index ad9d0458cb05..a9f24a31b0d7 100644
--- a/GPU/GPUCommon.cpp
+++ b/GPU/GPUCommon.cpp
@@ -197,8 +197,8 @@ const CommonCommandTableEntry commonCommandTable[] = {
 	{ GE_CMD_DEPTHCLAMPENABLE, FLAG_FLUSHBEFOREONCHANGE, DIRTY_VIEWPORTSCISSOR_STATE | DIRTY_CULLRANGE | DIRTY_RASTER_STATE },
 
 	// Z clip
-	{ GE_CMD_MINZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
-	{ GE_CMD_MAXZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_MINZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE },
+	{ GE_CMD_MAXZ, FLAG_FLUSHBEFOREONCHANGE, DIRTY_DEPTHRANGE | DIRTY_RASTER_STATE | DIRTY_VIEWPORTSCISSOR_STATE },
 
 	// Region
 	{ GE_CMD_REGION1, FLAG_FLUSHBEFOREONCHANGE, DIRTY_FRAMEBUF | DIRTY_TEXTURE_PARAMS | DIRTY_VIEWPORTSCISSOR_STATE },
diff --git a/GPU/Vulkan/StateMappingVulkan.cpp b/GPU/Vulkan/StateMappingVulkan.cpp
index 68bc711a1da5..282b602dd82e 100644
--- a/GPU/Vulkan/StateMappingVulkan.cpp
+++ b/GPU/Vulkan/StateMappingVulkan.cpp
@@ -239,15 +239,22 @@ void DrawEngineVulkan::ConvertStateToVulkanKey(FramebufferManagerVulkan &fbManag
 	}
 
 	if (gstate_c.IsDirty(DIRTY_RASTER_STATE)) {
-		if (gstate.isModeClear()) {
+		if (gstate.isModeClear() || gstate.isModeThrough()) {
 			key.cullMode = VK_CULL_MODE_NONE;
-			// TODO: Or does it always clamp?
+			// TODO: Might happen in clear mode if not through...
 			key.depthClampEnable = false;
 		} else {
 			// Set cull
-			bool wantCull = !gstate.isModeThrough() && prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled();
+			bool wantCull = prim != GE_PRIM_RECTANGLES && gstate.isCullEnabled();
 			key.cullMode = wantCull ? (gstate.getCullMode() ? VK_CULL_MODE_FRONT_BIT : VK_CULL_MODE_BACK_BIT) : VK_CULL_MODE_NONE;
-			key.depthClampEnable = gstate.isDepthClampEnabled() && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP);
+			if (gstate.getDepthRangeMin() == 0 || gstate.getDepthRangeMax() == 65535) {
+				// TODO: Still has a bug where we clamp to depth range if one is not the full range.
+				// But the alternate is not clamping in either direction...
+				key.depthClampEnable = gstate.isDepthClampEnabled() && gstate_c.Supports(GPU_SUPPORTS_DEPTH_CLAMP);
+			} else {
+				// We just want to clip in this case, the clamp would be clipped anyway.
+				key.depthClampEnable = false;
+			}
 		}
 	}
 

From 52baec21a85f1f74ce6cc7e0019dd315779463ff Mon Sep 17 00:00:00 2001
From: "Unknown W. Brackets" <checkins@unknownbrackets.org>
Date: Mon, 17 Sep 2018 22:27:25 -0700
Subject: [PATCH 5/5] GPU: Refactor cull range calculation together.

---
 GPU/Common/ShaderUniforms.cpp     | 79 +++++++++++++++++--------------
 GPU/Common/ShaderUniforms.h       |  2 +
 GPU/Directx9/ShaderManagerDX9.cpp | 37 ++-------------
 GPU/GLES/ShaderManagerGLES.cpp    | 37 ++-------------
 4 files changed, 53 insertions(+), 102 deletions(-)

diff --git a/GPU/Common/ShaderUniforms.cpp b/GPU/Common/ShaderUniforms.cpp
index 5489df59b6e8..c6ffcf23c51e 100644
--- a/GPU/Common/ShaderUniforms.cpp
+++ b/GPU/Common/ShaderUniforms.cpp
@@ -26,6 +26,49 @@ static void ConvertProjMatrixToD3D11(Matrix4x4 &in) {
 	in.translateAndScale(trans, scale);
 }
 
+// Computes the cull range uniforms by mapping the PSP's drawable limits back through the viewport transform.
+// minValues/maxValues receive sorted x, y, z bounds; minValues[3] is the depth clamp flag, maxValues[3] is NAN.
+// flipViewport negates the Y scale; hasNegZ leaves Z untouched, otherwise Z is remapped with * 0.5 + 0.5.
+void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ) {
+	// Account for the projection viewport adjustment when viewport is too large.
+	auto reverseViewportX = [](float x) {
+		float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
+		return pspViewport * (1.0f / gstate_c.vpWidthScale);
+	};
+	auto reverseViewportY = [flipViewport](float y) {
+		// Negated for D3D11 and GLES non-buffered.  This local (not gstate_c's raw value) must be the divisor.
+		const float heightScale = flipViewport ? -gstate_c.vpHeightScale : gstate_c.vpHeightScale;
+		float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
+		return pspViewport * (1.0f / heightScale);
+	};
+	auto reverseViewportZ = [hasNegZ](float z) {
+		float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
+		float realViewport = (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale);
+		// With hasNegZ (GLES) depth stays -1 to 1, otherwise it is remapped to 0 to 1.
+		return hasNegZ ? realViewport : (realViewport * 0.5f + 0.5f);
+	};
+	auto sortPair = [](float a, float b) {
+		return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
+	};
+
+	// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
+	// Any vertex outside this range (unless depth clamp enabled) is discarded.
+	auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
+	auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
+	auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
+	// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
+	float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
+
+	minValues[0] = x.first;
+	minValues[1] = y.first;
+	minValues[2] = z.first;
+	minValues[3] = clampEnable;
+	maxValues[0] = x.second;
+	maxValues[1] = y.second;
+	maxValues[2] = z.second;
+	maxValues[3] = NAN;
+}
+
 void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport) {
 	if (dirtyUniforms & DIRTY_TEXENV) {
 		Uint8x3ToFloat4(ub->texEnvColor, gstate.texenvcolor);
@@ -193,41 +236,7 @@ void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipView
 	}
 
 	if (dirtyUniforms & DIRTY_CULLRANGE) {
-		// Account for the projection viewport adjustment when viewport is too large.
-		auto reverseViewportX = [](float x) {
-			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
-			return pspViewport * (1.0f / gstate_c.vpWidthScale);
-		};
-		auto reverseViewportY = [flipViewport](float y) {
-			float heightScale = gstate_c.vpHeightScale;
-			if (flipViewport) {
-				// For D3D11.
-				heightScale = -heightScale;
-			}
-			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
-			return pspViewport * (1.0f / gstate_c.vpHeightScale);
-		};
-		auto reverseViewportZ = [](float z) {
-			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
-			// Differs from GLES: depth is 0 to 1, not -1 to 1.
-			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale) * 0.5f + 0.5f;
-		};
-		auto sortPair = [](float a, float b) {
-			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
-		};
-
-		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
-		// Any vertex outside this range (unless depth clamp enabled) is discarded.
-		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
-		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
-		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
-		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
-		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
-
-		float minValues[4]{ x.first, y.first, z.first, clampEnable };
-		memcpy(ub->cullRangeMin, minValues, sizeof(ub->cullRangeMin));
-		float maxValues[4]{ x.second, y.second, z.second, NAN };
-		memcpy(ub->cullRangeMax, maxValues, sizeof(ub->cullRangeMax));
+		CalcCullRange(ub->cullRangeMin, ub->cullRangeMax, flipViewport, false);
 	}
 
 	if (dirtyUniforms & DIRTY_BEZIERSPLINE) {
diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h
index 5c5c29f16904..be8d2e69f397 100644
--- a/GPU/Common/ShaderUniforms.h
+++ b/GPU/Common/ShaderUniforms.h
@@ -181,6 +181,8 @@ static const char *cb_vs_bonesStr =
 R"(	float4x3 u_bone[8];
 )";
 
+void CalcCullRange(float minValues[4], float maxValues[4], bool flipViewport, bool hasNegZ);
+
 void BaseUpdateUniforms(UB_VS_FS_Base *ub, uint64_t dirtyUniforms, bool flipViewport);
 void LightUpdateUniforms(UB_VS_Lights *ub, uint64_t dirtyUniforms);
 void BoneUpdateUniforms(UB_VS_Bones *ub, uint64_t dirtyUniforms);
diff --git a/GPU/Directx9/ShaderManagerDX9.cpp b/GPU/Directx9/ShaderManagerDX9.cpp
index 1b90a7d62152..bdc7ce328ff8 100644
--- a/GPU/Directx9/ShaderManagerDX9.cpp
+++ b/GPU/Directx9/ShaderManagerDX9.cpp
@@ -36,6 +36,7 @@
 #include "GPU/Math3D.h"
 #include "GPU/GPUState.h"
 #include "GPU/ge_constants.h"
+#include "GPU/Common/ShaderUniforms.h"
 #include "GPU/Directx9/ShaderManagerDX9.h"
 #include "GPU/Directx9/DrawEngineDX9.h"
 #include "GPU/Directx9/FramebufferDX9.h"
@@ -448,41 +449,9 @@ void ShaderManagerDX9::VSUpdateUniforms(u64 dirtyUniforms) {
 		VSSetFloatUniform4(CONST_VS_DEPTHRANGE, data);
 	}
 	if (dirtyUniforms & DIRTY_CULLRANGE) {
-		// Account for the projection viewport adjustment when viewport is too large.
-		auto reverseViewportX = [](float x) {
-			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
-			return (pspViewport - gstate_c.vpXOffset) * (1.0f / gstate_c.vpWidthScale);
-		};
-		auto reverseViewportY = [](float y) {
-			float yOffset = gstate_c.vpYOffset;
-			if (g_Config.iRenderingMode == FB_NON_BUFFERED_MODE) {
-				// GL upside down is a pain as usual.
-				// TODO: Is this right?
-				yOffset = -yOffset;
-			}
-			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
-			return (pspViewport - yOffset) * (1.0f / gstate_c.vpHeightScale);
-		};
-		auto reverseViewportZ = [](float z) {
-			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
-			// Differs from GLES: depth is 0 to 1, not -1 to 1.
-			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale) * 0.5f + 0.5f;
-		};
-		auto sortPair = [](float a, float b) {
-			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
-		};
-
-		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
-		// Any vertex outside this range (unless depth clamp enabled) is discarded.
-		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
-		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
-		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
-		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
-		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
-
-		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		float minValues[4], maxValues[4];
+		CalcCullRange(minValues, maxValues, false, false);
 		VSSetFloatUniform4(CONST_VS_CULLRANGEMIN, minValues);
-		float maxValues[4]{ x.second, y.second, z.second, NAN };
 		VSSetFloatUniform4(CONST_VS_CULLRANGEMAX, maxValues);
 	}
 
diff --git a/GPU/GLES/ShaderManagerGLES.cpp b/GPU/GLES/ShaderManagerGLES.cpp
index 2ae8c1376b46..c47b6a184350 100644
--- a/GPU/GLES/ShaderManagerGLES.cpp
+++ b/GPU/GLES/ShaderManagerGLES.cpp
@@ -42,9 +42,10 @@
 #include "GPU/Math3D.h"
 #include "GPU/GPUState.h"
 #include "GPU/ge_constants.h"
+#include "GPU/Common/ShaderUniforms.h"
 #include "GPU/GLES/ShaderManagerGLES.h"
 #include "GPU/GLES/DrawEngineGLES.h"
-#include "FramebufferManagerGLES.h"
+#include "GPU/GLES/FramebufferManagerGLES.h"
 
 Shader::Shader(GLRenderManager *render, const char *code, const std::string &desc, uint32_t glShaderType, bool useHWTransform, uint32_t attrMask, uint64_t uniformMask)
 	  : render_(render), failed_(false), useHWTransform_(useHWTransform), attrMask_(attrMask), uniformMask_(uniformMask) {
@@ -484,39 +485,9 @@ void LinkedShader::UpdateUniforms(u32 vertType, const ShaderID &vsid) {
 		SetFloatUniform4(render_, &u_depthRange, data);
 	}
 	if (dirty & DIRTY_CULLRANGE) {
-		// Account for the projection viewport adjustment when viewport is too large.
-		auto reverseViewportX = [](float x) {
-			float pspViewport = (x - gstate.getViewportXCenter()) * (1.0f / gstate.getViewportXScale());
-			return (pspViewport - gstate_c.vpXOffset) * (1.0f / gstate_c.vpWidthScale);
-		};
-		auto reverseViewportY = [](float y) {
-			float heightScale = gstate_c.vpHeightScale;
-			if (g_Config.iRenderingMode == FB_NON_BUFFERED_MODE) {
-				// GL upside down is a pain as usual.
-				heightScale = -heightScale;
-			}
-			float pspViewport = (y - gstate.getViewportYCenter()) * (1.0f / gstate.getViewportYScale());
-			return (pspViewport - gstate_c.vpYOffset) * (1.0f / heightScale);
-		};
-		auto reverseViewportZ = [](float z) {
-			float pspViewport = (z - gstate.getViewportZCenter()) * (1.0f / gstate.getViewportZScale());
-			return (pspViewport - gstate_c.vpZOffset) * (1.0f / gstate_c.vpDepthScale);
-		};
-		auto sortPair = [](float a, float b) {
-			return a > b ? std::make_pair(b, a) : std::make_pair(a, b);
-		};
-
-		// The PSP seems to use 0.12.4 for X and Y, and 0.16.0 for Z.
-		// Any vertex outside this range (unless depth clamp enabled) is discarded.
-		auto x = sortPair(reverseViewportX(0.0f), reverseViewportX(4096.0f));
-		auto y = sortPair(reverseViewportY(0.0f), reverseViewportY(4096.0f));
-		auto z = sortPair(reverseViewportZ(0.0f), reverseViewportZ(65535.5f));
-		// Since we have space in w, use it to pass the depth clamp flag.  We also pass NAN for w "discard".
-		float clampEnable = gstate.isDepthClampEnabled() ? 1.0f : 0.0f;
-
-		float minValues[4]{ x.first, y.first, z.first, clampEnable };
+		float minValues[4], maxValues[4];
+		CalcCullRange(minValues, maxValues, g_Config.iRenderingMode == FB_NON_BUFFERED_MODE, true);
 		SetFloatUniform4(render_, &u_cullRangeMin, minValues);
-		float maxValues[4]{ x.second, y.second, z.second, NAN };
 		SetFloatUniform4(render_, &u_cullRangeMax, maxValues);
 	}