Skip to content

Commit

Permalink
Merge branch 'dev/adunn/cpu_ibo_memoization' into 'main'
Browse files Browse the repository at this point in the history
CPU Perf: Main thread, IBO memoization

See merge request lightspeedrtx/dxvk-remix-nv!966
  • Loading branch information
AlexDunn committed Aug 30, 2024
2 parents dd39314 + 9ca2dcb commit fcadec8
Show file tree
Hide file tree
Showing 7 changed files with 135 additions and 15 deletions.
1 change: 1 addition & 0 deletions RtxOptions.md
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,7 @@ Tables below enumerate all the options and their defaults set by RTX Remix. Note
|rtx.enableFogColorRemap|bool|False|A flag to enable or disable remapping fixed function fog's color\. Only takes effect when fog remapping in general is enabled\.<br>Enables or disables remapping functionality relating to the color parameter of fixed function fog with the exception of the multiscattering scale \(as this scale can be set to 0 to disable it\)\.<br>This allows dynamic changes to the game's fog color to be reflected somewhat in the volumetrics system\. Overrides the specified volumetric transmittance color\.|
|rtx.enableFogMaxDistanceRemap|bool|True|A flag to enable or disable remapping fixed function fog's max distance\. Only takes effect when fog remapping in general is enabled\.<br>Enables or disables remapping functionality relating to the max distance parameter of fixed function fog\.<br>This allows dynamic changes to the game's fog max distance to be reflected somewhat in the volumetrics system\. Overrides the specified volumetric transmittance measurement distance\.|
|rtx.enableFogRemap|bool|False|A flag to enable or disable fixed function fog remapping\. Only takes effect when volumetrics are enabled\.<br>Typically many old games used fixed function fog for various effects and while sometimes this fog can be replaced with proper volumetrics globally, other times require some amount of dynamic behavior controlled by the game\.<br>When enabled this option allows for remapping of fixed function fog parameters from the game to volumetric parameters to accommodate this dynamic need\.|
|rtx.enableIndexBufferMemoization|bool|True|CPU performance optimization, should generally be enabled\. Will reduce main thread time by caching processIndexBuffer operations and reusing when possible, this will come at the expense of some CPU RAM\.|
|rtx.enableIndirectTranslucentShadows|bool|False|Include OBJECT\_MASK\_TRANSLUCENT into secondary visibility rays\.|
|rtx.enableInstanceDebuggingTools|bool|False|NOTE: This will disable temporal correlation for instances, but allow the use of instance developer debug tools|
|rtx.enableMultiStageTextureFactorBlending|bool|True|Support texture factor blending in stage 1~7\. Currently only support 1 additional blending stage, more than 1 additional blending stages will be ignored\.|
Expand Down
10 changes: 10 additions & 0 deletions src/d3d9/d3d9_common_buffer.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "d3d9_device_child.h"
#include "d3d9_format.h"
#include "../dxvk/dxvk_buffer.h"
#include "../util/util_memoization.h"

namespace dxvk {

Expand Down Expand Up @@ -208,6 +209,15 @@ namespace dxvk {
return m_seq;
}

// NV-DXVK start: Implement memoization for some expensive CPU operations
// Result of one index buffer processing pass, cached per byte range of this buffer.
struct RemixIndexBufferMemoizationData {
// Buffer slice produced by the processing pass.
DxvkBufferSlice slice;
// Minimum index value reported by the processing pass.
uint32_t min;
// Maximum index value reported by the processing pass.
uint32_t max;
};
// Memoizer type that stores one RemixIndexBufferMemoizationData per cached range.
using RemixIboMemoizer = MemoryRegionMemoizer<RemixIndexBufferMemoizationData>;
// Per-buffer cache instance; public so code outside this class can query and invalidate it.
RemixIboMemoizer remixMemoization;
// NV-DXVK end

private:

Rc<DxvkBuffer> CreateBuffer() const;
Expand Down
10 changes: 10 additions & 0 deletions src/d3d9/d3d9_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4952,6 +4952,10 @@ namespace dxvk {

pResource->SetWrittenByGPU(false);
pResource->GPUReadingRange().Clear();

// NV-DXVK start: Implement memoization for some expensive CPU operations
pResource->remixMemoization.invalidateAll();
// NV-DXVK end
}
else {
// Use map pointer from previous map operation. This
Expand Down Expand Up @@ -4982,6 +4986,12 @@ namespace dxvk {
pResource->SetWrittenByGPU(false);
pResource->GPUReadingRange().Clear();
}

// NV-DXVK start: Implement memoization for some expensive CPU operations
if (!readOnly) {
pResource->remixMemoization.invalidate(offset, size);
}
// NV-DXVK end
}

uint8_t* data = reinterpret_cast<uint8_t*>(physSlice.mapPtr);
Expand Down
46 changes: 33 additions & 13 deletions src/d3d9/d3d9_rtx.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ namespace dxvk {
}

template<typename T>
void D3D9Rtx::copyIndices(const uint32_t indexCount, T* pIndicesDst, const T* pIndices, uint32_t& minIndex, uint32_t& maxIndex) {
void D3D9Rtx::copyIndices(const uint32_t indexCount, T*& pIndicesDst, T* pIndices, uint32_t& minIndex, uint32_t& maxIndex) {
ScopedCpuProfileZone();

assert(indexCount >= 3);
Expand All @@ -82,26 +82,45 @@ namespace dxvk {
}

template<typename T>
DxvkBufferSlice D3D9Rtx::processIndexBuffer(const uint32_t indexCount, const uint32_t startIndex, const DxvkBufferSliceHandle& indexSlice, uint32_t& minIndex, uint32_t& maxIndex) {
DxvkBufferSlice D3D9Rtx::processIndexBuffer(const uint32_t indexCount, const uint32_t startIndex, const IndexContext& indexCtx, uint32_t& minIndex, uint32_t& maxIndex) {
ScopedCpuProfileZone();

const uint32_t indexStride = sizeof(T);
const size_t numIndexBytes = indexCount * indexStride;
const size_t indexOffset = indexStride * startIndex;

// Get our slice of the staging ring buffer
const DxvkBufferSlice& stagingSlice = m_rtStagingData.alloc(CACHE_LINE_SIZE, numIndexBytes);
auto processing = [this, &indexCtx, indexCount](const size_t offset, const size_t size) -> D3D9CommonBuffer::RemixIndexBufferMemoizationData {
D3D9CommonBuffer::RemixIndexBufferMemoizationData result;

// Acquire prevents the staging allocator from re-using this memory
stagingSlice.buffer()->acquire(DxvkAccess::Read);
// Get our slice of the staging ring buffer
result.slice = m_rtStagingData.alloc(CACHE_LINE_SIZE, size);

const uint8_t* pBaseIndex = (uint8_t*) indexSlice.mapPtr + indexOffset;
// Acquire prevents the staging allocator from re-using this memory
result.slice.buffer()->acquire(DxvkAccess::Read);

const uint8_t* pBaseIndex = (uint8_t*) indexCtx.indexBuffer.mapPtr + offset;

T* pIndices = (T*) pBaseIndex;
T* pIndicesDst = (T*) result.slice.mapPtr(0);
copyIndices<T>(indexCount, pIndicesDst, pIndices, result.min, result.max);

T* pIndices = (T*) pBaseIndex;
T* pIndicesDst = (T*) stagingSlice.mapPtr(0);
copyIndices<T>(indexCount, pIndicesDst, pIndices, minIndex, maxIndex);
return result;
};

if (enableIndexBufferMemoization() && indexCtx.ibo != nullptr) {
// If we have an index buffer, we can utilize memoization
D3D9CommonBuffer::RemixIboMemoizer& memoization = indexCtx.ibo->remixMemoization;
const auto result = memoization.memoize(indexOffset, numIndexBytes, processing);
minIndex = result.min;
maxIndex = result.max;
return result.slice;
}

return stagingSlice;
// No index buffer (so no memoization) - this could be a DrawPrimitiveUP call (where IB data is passed inline)
const auto result = processing(indexOffset, numIndexBytes);
minIndex = result.min;
maxIndex = result.max;
return result.slice;
}

void D3D9Rtx::prepareVertexCapture(const int vertexIndexOffset) {
Expand Down Expand Up @@ -538,9 +557,9 @@ namespace dxvk {
geoData.indexCount = GetVertexCount(drawContext.PrimitiveType, drawContext.PrimitiveCount);

if (indexContext.indexType == VK_INDEX_TYPE_UINT16)
geoData.indexBuffer = RasterBuffer(processIndexBuffer<uint16_t>(geoData.indexCount, drawContext.StartIndex, indexContext.indexBuffer, minIndex, maxIndex), 0, 2, indexContext.indexType);
geoData.indexBuffer = RasterBuffer(processIndexBuffer<uint16_t>(geoData.indexCount, drawContext.StartIndex, indexContext, minIndex, maxIndex), 0, 2, indexContext.indexType);
else
geoData.indexBuffer = RasterBuffer(processIndexBuffer<uint32_t>(geoData.indexCount, drawContext.StartIndex, indexContext.indexBuffer, minIndex, maxIndex), 0, 4, indexContext.indexType);
geoData.indexBuffer = RasterBuffer(processIndexBuffer<uint32_t>(geoData.indexCount, drawContext.StartIndex, indexContext, minIndex, maxIndex), 0, 4, indexContext.indexType);

// Unlikely, but invalid
if (maxIndex == minIndex) {
Expand Down Expand Up @@ -987,6 +1006,7 @@ namespace dxvk {
D3D9CommonBuffer* ibo = GetCommonBuffer(d3d9State().indices);
assert(ibo != nullptr);

indices.ibo = ibo;
indices.indexBuffer = ibo->GetMappedSlice();
indices.indexType = DecodeIndexType(ibo->Desc()->Format);
}
Expand Down
6 changes: 4 additions & 2 deletions src/d3d9/d3d9_rtx.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ namespace dxvk {
RTX_OPTION("rtx", bool, useVertexCapture, true, "When enabled, injects code into the original vertex shader to capture final shaded vertex positions. Is useful for games using simple vertex shaders, that still also set the fixed function transform matrices.");
RTX_OPTION("rtx", bool, useVertexCapturedNormals, true, "When enabled, vertex normals are read from the input assembler and used in raytracing. This doesn't always work as normals can be in any coordinate space, but can help sometimes.");
RTX_OPTION("rtx", bool, useWorldMatricesForShaders, true, "When enabled, Remix will utilize the world matrices being passed from the game via D3D9 fixed function API, even when running with shaders. Sometimes games pass these matrices and they are useful, however for some games they are very unreliable, and should be filtered out. If you're seeing precision related issues with shader vertex capture, try disabling this setting.");
RTX_OPTION("rtx", bool, enableIndexBufferMemoization, true, "CPU performance optimization, should generally be enabled. Will reduce main thread time by caching processIndexBuffer operations and reusing when possible, this will come at the expense of some CPU RAM.");

// Copy of the parameters issued to D3D9 on DrawXXX
struct DrawContext {
Expand Down Expand Up @@ -227,6 +228,7 @@ namespace dxvk {

// Bundles the index data handed to processIndexBuffer for a draw.
struct IndexContext {
// Vulkan index element type; stays VK_INDEX_TYPE_NONE_KHR until one is assigned.
VkIndexType indexType = VK_INDEX_TYPE_NONE_KHR;
// Index buffer object the indices come from, or nullptr when there is none.
// processIndexBuffer only uses memoization when this is non-null.
D3D9CommonBuffer* ibo = nullptr;
// Slice handle whose mapPtr is read as the raw index data.
DxvkBufferSliceHandle indexBuffer;
};

Expand All @@ -246,10 +248,10 @@ namespace dxvk {
const Direct3DState9& d3d9State() const;

template<typename T>
static void copyIndices(const uint32_t indexCount, T* pIndicesDst, const T* pIndices, uint32_t& minIndex, uint32_t& maxIndex);
static void copyIndices(const uint32_t indexCount, T*& pIndicesDst, T* pIndices, uint32_t& minIndex, uint32_t& maxIndex);

template<typename T>
DxvkBufferSlice processIndexBuffer(const uint32_t indexCount, const uint32_t startIndex, const DxvkBufferSliceHandle& indexSlice, uint32_t& minIndex, uint32_t& maxIndex);
DxvkBufferSlice processIndexBuffer(const uint32_t indexCount, const uint32_t startIndex, const IndexContext& indexCtx, uint32_t& minIndex, uint32_t& maxIndex);

void prepareVertexCapture(const int vertexIndexOffset);

Expand Down
2 changes: 2 additions & 0 deletions src/util/meson.build
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ util_src = files([
'xxHash/xxhash.c',
'xxHash/xxhash.h',

'util_memoization.h',

'util_messagechannel.cpp',
'util_messagechannel.h',

Expand Down
75 changes: 75 additions & 0 deletions src/util/util_memoization.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#pragma once

#include <cstddef>
#include <functional>
#include <map>
#include <utility>

namespace dxvk {
  // Caches the result of a computation performed over byte ranges of a memory region.
  //
  // Each cached result is associated with a half-open range [start, start + size).
  // A lookup only hits when both the start and the size match a cached range exactly.
  // Storing a new range first evicts every cached range it overlaps, and invalidate()
  // evicts every cached range that overlaps a region the caller reports as modified.
  //
  // T is the result type. Results are copied into the cache and returned by value,
  // so T must be copyable. The class performs no synchronization of its own.
  template<typename T>
  class MemoryRegionMemoizer {
  private:
    // Half-open byte range [start, end).
    struct Range {
      size_t start = 0;
      size_t end = 0;
      Range() = default;
      Range(size_t s, size_t e) : start(s), end(e) { }

      // True when point lies inside [start, end).
      bool contains(size_t point) const {
        return start <= point && point < end;
      }

      // True when the two half-open ranges share at least one byte.
      // Ranges that merely touch (one ends where the other starts) do not overlap.
      bool overlaps(const Range& other) const {
        return start < other.end && other.start < end;
      }
    };

    // A cached result together with the range it was computed for.
    struct CacheEntry {
      Range range;
      T result;
    };

    // Keyed by range start; each entry's key always equals its range.start.
    std::map<size_t, CacheEntry> cache;

  public:
    // Returns the cached result for [start, start + size) if that exact range is
    // cached. Otherwise evicts every cached range overlapping it, calls
    // func(start, size), stores the result and returns it.
    template<typename Func>
    T memoize(size_t start, size_t size, Func&& func) {
      const Range currentRange(start, start + size);

      // The key already guarantees the start matches, so only the end needs checking.
      const auto hit = cache.find(start);
      if (hit != cache.end() && hit->second.range.end == currentRange.end) {
        return hit->second.result;
      }

      // No exact match: drop anything overlapping the requested range.
      // invalidate() takes (start, size), not (start, end).
      invalidate(start, size);

      T result = std::invoke(std::forward<Func>(func), start, size);
      cache.insert_or_assign(start, CacheEntry{ currentRange, result });
      return result;
    }

    // Evicts every cached range that overlaps [start, start + size).
    void invalidate(size_t start, size_t size) {
      const Range invalidRange(start, start + size);

      // Entries are ordered by start. Step back one entry so that a range which
      // begins before `start` but extends into the invalid region is examined too.
      auto it = cache.lower_bound(start);
      if (it != cache.begin()) {
        --it;
      }

      // Entries starting at or beyond the end of the invalid region cannot overlap it.
      while (it != cache.end() && it->second.range.start < invalidRange.end) {
        if (it->second.range.overlaps(invalidRange)) {
          it = cache.erase(it);
        } else {
          ++it;
        }
      }
    }

    // Evicts every cached result.
    void invalidateAll() {
      cache.clear();
    }
  };
}

0 comments on commit fcadec8

Please sign in to comment.