From 381a99ec975a14ef882f44c5082be859fbe12752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Henrik=20Rydg=C3=A5rd?= Date: Wed, 10 May 2023 16:14:33 +0200 Subject: [PATCH] Restore fragmentshader ID flags double and texalpha. Add a ubershader mode that uses dynamic state. --- GPU/Common/FragmentShaderGenerator.cpp | 37 ++++++++++++++++++++------ GPU/Common/ShaderId.cpp | 17 +++++++++++- GPU/Common/ShaderId.h | 16 ++++++----- GPU/Common/ShaderUniforms.h | 13 +++++---- GPU/D3D11/GPU_D3D11.cpp | 2 ++ GPU/Directx9/GPU_DX9.cpp | 3 +++ GPU/GLES/DrawEngineGLES.cpp | 3 +++ GPU/GLES/GPU_GLES.cpp | 4 +++ GPU/GPUCommonHW.cpp | 7 +++++ GPU/GPUState.h | 2 +- GPU/Vulkan/GPU_Vulkan.cpp | 4 +++ 11 files changed, 85 insertions(+), 23 deletions(-) diff --git a/GPU/Common/FragmentShaderGenerator.cpp b/GPU/Common/FragmentShaderGenerator.cpp index b433284ddf7c..a529a4e30001 100644 --- a/GPU/Common/FragmentShaderGenerator.cpp +++ b/GPU/Common/FragmentShaderGenerator.cpp @@ -107,6 +107,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu bool colorTestAgainstZero = id.Bit(FS_BIT_COLOR_AGAINST_ZERO); bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ); + bool ubershader = id.Bit(FS_BIT_UBERSHADER); + // ubershader-controlled bits. If ubershader is on, these will not be used below (and will be false). + bool useTexAlpha = id.Bit(FS_BIT_TEXALPHA); + bool enableColorDouble = id.Bit(FS_BIT_DOUBLE_COLOR); + if (texture3D && arrayTexture) { *errorString = "Invalid combination of 3D texture and array texture, shouldn't happen"; return false; @@ -264,7 +269,9 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu if (texFunc == GE_TEXFUNC_BLEND) { WRITE(p, "float3 u_texenv : register(c%i);\n", CONST_PS_TEXENV); } - WRITE(p, "float2 u_texNoAlphaMul : register(c%i);\n", CONST_PS_TEX_NO_ALPHA_MUL); + if (ubershader) { + WRITE(p, "float2 u_texNoAlphaMul : register(c%i);\n", CONST_PS_TEX_NO_ALPHA_MUL); + } } if (enableFog) { WRITE(p, "float3 u_fogcolor : register(c%i);\n", CONST_PS_FOGCOLOR); @@ -363,7 +370,9 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu WRITE(p, "uniform sampler2D tex;\n"); } *uniformMask |= DIRTY_TEX_ALPHA_MUL; - WRITE(p, "uniform vec2 u_texNoAlphaMul;\n"); + if (ubershader) { + WRITE(p, "uniform vec2 u_texNoAlphaMul;\n"); + } } if (readFramebufferTex) { @@ -842,7 +851,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu WRITE(p, " vec4 p = v_color0;\n"); if (texFunc != GE_TEXFUNC_REPLACE) { - WRITE(p, " t.a = max(t.a, u_texNoAlphaMul.x);\n"); + if (ubershader) { + WRITE(p, " t.a = max(t.a, u_texNoAlphaMul.x);\n"); + } else if (!useTexAlpha) { + WRITE(p, " t.a = 1.0;\n"); + } } switch (texFunc) { @@ -857,7 +870,11 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu break; case GE_TEXFUNC_REPLACE: WRITE(p, " vec4 r = t;\n"); - WRITE(p, " r.a = mix(r.a, p.a, u_texNoAlphaMul.x);\n"); + if (ubershader) { + WRITE(p, " r.a = mix(r.a, p.a, u_texNoAlphaMul.x);\n"); + } else if (!useTexAlpha) { + WRITE(p, " r.a = p.a;\n"); + } WRITE(p, " vec4 v = r%s;\n", secondary); break; case GE_TEXFUNC_ADD: @@ -876,10 +893,14 @@ bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLangu *uniformMask |= DIRTY_TEX_ALPHA_MUL; // We only need a clamp if the color will be further processed. Otherwise the hardware color conversion will clamp for us. - if (enableFog || enableColorTest || replaceBlend != REPLACE_BLEND_NO || simulateLogicOpType != LOGICOPTYPE_NORMAL || colorWriteMask || blueToAlpha) { - WRITE(p, " v.rgb = clamp(v.rgb * u_texNoAlphaMul.y, 0.0, 1.0);\n"); - } else { - WRITE(p, " v.rgb *= u_texNoAlphaMul.y;\n"); + if (ubershader) { + if (enableFog || enableColorTest || replaceBlend != REPLACE_BLEND_NO || simulateLogicOpType != LOGICOPTYPE_NORMAL || colorWriteMask || blueToAlpha) { + WRITE(p, " v.rgb = clamp(v.rgb * u_texNoAlphaMul.y, 0.0, 1.0);\n"); + } else { + WRITE(p, " v.rgb *= u_texNoAlphaMul.y;\n"); + } + } else if (enableColorDouble) { + p.C(" v.rgb = clamp(v.rgb * 2.0, 0.0, 1.0);\n"); } } else { // No texture mapping diff --git a/GPU/Common/ShaderId.cpp b/GPU/Common/ShaderId.cpp index b435cd2d7485..864075e9e35a 100644 --- a/GPU/Common/ShaderId.cpp +++ b/GPU/Common/ShaderId.cpp @@ -190,8 +190,11 @@ std::string FragmentShaderDesc(const FShaderID &id) { if (id.Bit(FS_BIT_DO_TEXTURE_PROJ)) desc << "TexProj "; if (id.Bit(FS_BIT_ENABLE_FOG)) desc << "Fog "; if (id.Bit(FS_BIT_LMODE)) desc << "LM "; + if (id.Bit(FS_BIT_TEXALPHA)) desc << "TexAlpha "; + if (id.Bit(FS_BIT_DOUBLE_COLOR)) desc << "Double "; if (id.Bit(FS_BIT_FLATSHADE)) desc << "Flat "; if (id.Bit(FS_BIT_BGRA_TEXTURE)) desc << "BGRA "; + if (id.Bit(FS_BIT_UBERSHADER)) desc << "FragUber "; switch ((ShaderDepalMode)id.Bits(FS_BIT_SHADER_DEPAL_MODE, 2)) { case ShaderDepalMode::OFF: break; case ShaderDepalMode::NORMAL: desc << "Depal "; break; @@ -285,9 +288,14 @@ void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pip bool enableFog = gstate.isFogEnabled() && !isModeThrough; bool enableAlphaTest = gstate.isAlphaTestEnabled() && !IsAlphaTestTriviallyTrue(); bool enableColorTest = gstate.isColorTestEnabled() && !IsColorTestTriviallyTrue(); + bool enableColorDouble = gstate.isColorDoublingEnabled(); bool doTextureProjection = (gstate.getUVGenMode() == GE_TEXMAP_TEXTURE_MATRIX && MatrixNeedsProjection(gstate.tgenMatrix, gstate.getUVProjMode())); bool doFlatShading = gstate.getShadeMode() == GE_SHADE_FLAT; + bool enableTexAlpha = gstate.isTextureAlphaUsed(); + + bool uberShader = gstate_c.Use(GPU_USE_FRAGMENT_UBERSHADER); + ShaderDepalMode shaderDepalMode = gstate_c.shaderDepalMode; bool colorWriteMask = pipelineState.maskState.applyFramebufferRead; @@ -329,7 +337,14 @@ void ComputeFragmentShaderID(FShaderID *id_out, const ComputedPipelineState &pip id.SetBit(FS_BIT_TEST_DISCARD_TO_ZERO, !NeedsTestDiscard()); } - id.SetBit(FS_BIT_ENABLE_FOG, enableFog); + id.SetBit(FS_BIT_ENABLE_FOG, enableFog); // TODO: Will be moved back to the ubershader. + + id.SetBit(FS_BIT_UBERSHADER, uberShader); + if (!uberShader) { + id.SetBit(FS_BIT_TEXALPHA, enableTexAlpha); + id.SetBit(FS_BIT_DOUBLE_COLOR, enableColorDouble); + } + id.SetBit(FS_BIT_DO_TEXTURE_PROJ, doTextureProjection); // 2 bits diff --git a/GPU/Common/ShaderId.h b/GPU/Common/ShaderId.h index 59bf37fc3219..830abd0440fc 100644 --- a/GPU/Common/ShaderId.h +++ b/GPU/Common/ShaderId.h @@ -68,12 +68,12 @@ enum FShaderBit : uint8_t { FS_BIT_CLEARMODE = 0, FS_BIT_DO_TEXTURE = 1, FS_BIT_TEXFUNC = 2, // 3 bits - FS_BIT_DO_TEXTURE_PROJ = 5, + FS_BIT_DOUBLE_COLOR = 5, // Not used with FS_BIT_UBERSHADER FS_BIT_3D_TEXTURE = 6, FS_BIT_SHADER_TEX_CLAMP = 7, FS_BIT_CLAMP_S = 8, FS_BIT_CLAMP_T = 9, - FS_BIT_FLATSHADE = 10, + FS_BIT_TEXALPHA = 10, // Not used with FS_BIT_UBERSHADER FS_BIT_LMODE = 11, FS_BIT_ALPHA_TEST = 12, FS_BIT_ALPHA_TEST_FUNC = 13, // 3 bits @@ -81,9 +81,9 @@ enum FShaderBit : uint8_t { FS_BIT_COLOR_TEST = 17, FS_BIT_COLOR_TEST_FUNC = 18, // 2 bits FS_BIT_COLOR_AGAINST_ZERO = 20, - FS_BIT_ENABLE_FOG = 21, - FS_BIT_SAMPLE_ARRAY_TEXTURE = 22, // For multiview, framebuffers are array textures and we need to sample the two layers correctly. - FS_BIT_STEREO = 23, + FS_BIT_ENABLE_FOG = 21, // Not used with FS_BIT_UBERSHADER + FS_BIT_DO_TEXTURE_PROJ = 22, + // 1 free bit FS_BIT_STENCIL_TO_ALPHA = 24, // 2 bits FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE = 26, // 4 bits (ReplaceAlphaType) FS_BIT_SIMULATE_LOGIC_OP_TYPE = 30, // 2 bits @@ -91,13 +91,17 @@ enum FShaderBit : uint8_t { FS_BIT_BLENDEQ = 35, // 3 bits FS_BIT_BLENDFUNC_A = 38, // 4 bits FS_BIT_BLENDFUNC_B = 42, // 4 bits - FS_BIT_USE_FRAMEBUFFER_FETCH = 46, + FS_BIT_FLATSHADE = 46, FS_BIT_BGRA_TEXTURE = 47, FS_BIT_TEST_DISCARD_TO_ZERO = 48, FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL = 49, FS_BIT_COLOR_WRITEMASK = 50, FS_BIT_REPLACE_LOGIC_OP = 51, // 4 bits. GE_LOGIC_COPY means no-op/off. FS_BIT_SHADER_DEPAL_MODE = 55, // 2 bits (ShaderDepalMode) + FS_BIT_SAMPLE_ARRAY_TEXTURE = 57, // For multiview, framebuffers are array textures and we need to sample the two layers correctly. + FS_BIT_STEREO = 58, + FS_BIT_USE_FRAMEBUFFER_FETCH = 59, + FS_BIT_UBERSHADER = 60, }; static inline FShaderBit operator +(FShaderBit bit, int i) { diff --git a/GPU/Common/ShaderUniforms.h b/GPU/Common/ShaderUniforms.h index 176524e3d7d6..77c8ebc8de40 100644 --- a/GPU/Common/ShaderUniforms.h +++ b/GPU/Common/ShaderUniforms.h @@ -33,13 +33,13 @@ struct alignas(16) UB_VS_FS_Base { uint32_t spline_counts; uint32_t depal_mask_shift_off_fmt; // 4 params packed into one. uint32_t colorWriteMask; float mipBias; // Fragment data + float texNoAlpha; float texMul; float padding[2]; // this vec4 will hold ubershader stuff. We won't use integer flags in the fragment shader. float fogColor[3]; uint32_t alphaColorRef; float texEnvColor[3]; uint32_t colorTestMask; - float blendFixA[3]; float stencilReplaceValue; - float blendFixB[3]; float rotation; float texClamp[4]; float texClampOffset[2]; float fogCoef[2]; - float texNoAlpha; float texMul; float padding[2]; + float blendFixA[3]; float stencilReplaceValue; + float blendFixB[3]; float rotation; // VR stuff is to go here, later. For normal drawing, we can then get away // with just uploading the first 448 bytes of the struct (up to and including fogCoef). }; @@ -59,14 +59,13 @@ R"( mat4 u_proj; uint u_depal_mask_shift_off_fmt; uint u_colorWriteMask; float u_mipBias; + vec2 u_texNoAlphaMul; float pad1; float pad2; vec3 u_fogcolor; uint u_alphacolorref; vec3 u_texenv; uint u_alphacolormask; + vec4 u_texclamp; + vec2 u_texclampoff; vec2 u_fogcoef; vec3 u_blendFixA; float u_stencilReplaceValue; vec3 u_blendFixB; float u_rotation; - vec4 u_texclamp; - vec2 u_texclampoff; - vec2 u_fogcoef; - vec2 u_texNoAlphaMul; float pad1; float pad2; )"; // 512 bytes. Would like to shrink more. Some colors only have 8-bit precision and we expand diff --git a/GPU/D3D11/GPU_D3D11.cpp b/GPU/D3D11/GPU_D3D11.cpp index c92b8ad2f5e3..8b8a430e6d28 100644 --- a/GPU/D3D11/GPU_D3D11.cpp +++ b/GPU/D3D11/GPU_D3D11.cpp @@ -108,6 +108,8 @@ u32 GPU_D3D11::CheckGPUFeatures() const { features |= GPU_USE_16BIT_FORMATS; } + features |= GPU_USE_FRAGMENT_UBERSHADER; + return CheckGPUFeaturesLate(features); } diff --git a/GPU/Directx9/GPU_DX9.cpp b/GPU/Directx9/GPU_DX9.cpp index 1e099d356345..199438482912 100644 --- a/GPU/Directx9/GPU_DX9.cpp +++ b/GPU/Directx9/GPU_DX9.cpp @@ -101,6 +101,9 @@ u32 GPU_DX9::CheckGPUFeatures() const { // So we cannot incorrectly use the viewport transform as the depth range on Direct3D. features |= GPU_USE_ACCURATE_DEPTH; + // DX9 GPUs probably benefit more than they lose from this. Though, might be a vendor check. + features |= GPU_USE_FRAGMENT_UBERSHADER; + return CheckGPUFeaturesLate(features); } diff --git a/GPU/GLES/DrawEngineGLES.cpp b/GPU/GLES/DrawEngineGLES.cpp index b49f1104b29a..9fbaf74ba75c 100644 --- a/GPU/GLES/DrawEngineGLES.cpp +++ b/GPU/GLES/DrawEngineGLES.cpp @@ -479,6 +479,9 @@ void DrawEngineGLES::DoFlush() { gpuStats.numDrawCalls += numDrawCalls; gpuStats.numVertsSubmitted += vertexCountInDrawCalls_; + // TODO: When the next flush has the same vertex format, we can continue with the same offset in the vertex buffer, + // and start indexing from a higher value. This is very friendly to OpenGL (where we can't rely on baseindex if we + // wanted to avoid rebinding the vertex input every time). indexGen.Reset(); decodedVerts_ = 0; numDrawCalls = 0; diff --git a/GPU/GLES/GPU_GLES.cpp b/GPU/GLES/GPU_GLES.cpp index 963a10018cdf..47ed2699c6ae 100644 --- a/GPU/GLES/GPU_GLES.cpp +++ b/GPU/GLES/GPU_GLES.cpp @@ -197,6 +197,10 @@ u32 GPU_GLES::CheckGPUFeatures() const { } } + if (gl_extensions.GLES3) { + features |= GPU_USE_FRAGMENT_UBERSHADER; + } + return features; } diff --git a/GPU/GPUCommonHW.cpp b/GPU/GPUCommonHW.cpp index 85bbe1e53053..809f8402d3ed 100644 --- a/GPU/GPUCommonHW.cpp +++ b/GPU/GPUCommonHW.cpp @@ -464,6 +464,13 @@ void GPUCommonHW::UpdateCmdInfo() { cmdInfo_[GE_CMD_MATERIALUPDATE].RemoveDirty(DIRTY_LIGHT_CONTROL); cmdInfo_[GE_CMD_MATERIALUPDATE].AddDirty(DIRTY_VERTEXSHADER_STATE); } + + if (gstate_c.Use(GPU_USE_FRAGMENT_UBERSHADER)) { + // Texfunc controls both texalpha and doubling. The rest is not dynamic yet so can't remove fragment shader dirtying. + cmdInfo_[GE_CMD_TEXFUNC].AddDirty(DIRTY_TEX_ALPHA_MUL); + } else { + cmdInfo_[GE_CMD_TEXFUNC].RemoveDirty(DIRTY_TEX_ALPHA_MUL); + } } void GPUCommonHW::BeginFrame() { diff --git a/GPU/GPUState.h b/GPU/GPUState.h index 9f7e46432e9a..632fa1e86dc9 100644 --- a/GPU/GPUState.h +++ b/GPU/GPUState.h @@ -478,7 +478,7 @@ enum { GPU_USE_VS_RANGE_CULLING = FLAG_BIT(3), GPU_USE_BLEND_MINMAX = FLAG_BIT(4), GPU_USE_LOGIC_OP = FLAG_BIT(5), - // Bit 6 is free. + GPU_USE_FRAGMENT_UBERSHADER = FLAG_BIT(6), GPU_USE_TEXTURE_NPOT = FLAG_BIT(7), GPU_USE_ANISOTROPY = FLAG_BIT(8), GPU_USE_CLEAR_RAM_HACK = FLAG_BIT(9), diff --git a/GPU/Vulkan/GPU_Vulkan.cpp b/GPU/Vulkan/GPU_Vulkan.cpp index 31ddcb09131a..959faaf6661a 100644 --- a/GPU/Vulkan/GPU_Vulkan.cpp +++ b/GPU/Vulkan/GPU_Vulkan.cpp @@ -297,6 +297,10 @@ u32 GPU_Vulkan::CheckGPUFeatures() const { features &= ~GPU_USE_FRAMEBUFFER_FETCH; // } + // Only a few low-power GPUs should probably avoid this. + // Let's figure that out later. + features |= GPU_USE_FRAGMENT_UBERSHADER; + // Attempt to workaround #17386 if (draw_->GetBugs().Has(Draw::Bugs::UNIFORM_INDEXING_BROKEN)) { features &= ~GPU_USE_LIGHT_UBERSHADER;