From 9ed43a60a3397957f44a47ade7efcedbab5bf9f8 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Wed, 19 Feb 2025 15:47:45 +0100 Subject: [PATCH 01/44] Revert "[dxvk] Fix lack of forward progress guarantee in presenter" This reverts commit efeb15edbd6dc913030f0846cbc1b587f6fb7c5d. --- src/dxvk/dxvk_presenter.cpp | 61 ++++++++++++++----------------------- src/dxvk/dxvk_presenter.h | 1 - 2 files changed, 23 insertions(+), 39 deletions(-) diff --git a/src/dxvk/dxvk_presenter.cpp b/src/dxvk/dxvk_presenter.cpp index 79e10ad66..3297d14a0 100644 --- a/src/dxvk/dxvk_presenter.cpp +++ b/src/dxvk/dxvk_presenter.cpp @@ -259,16 +259,9 @@ namespace dxvk { return; if (m_device->features().khrPresentWait.presentWait) { - bool canSignal = false; - - { std::unique_lock lock(m_frameMutex); - - m_lastSignaled = frameId; - canSignal = m_lastCompleted >= frameId; - } - - if (canSignal) - m_signal->signal(frameId); + std::lock_guard lock(m_frameMutex); + m_lastSignaled = frameId; + m_frameCond.notify_one(); } else { m_fpsLimiter.delay(tracker); m_signal->signal(frameId); @@ -1210,26 +1203,25 @@ namespace dxvk { void Presenter::runFrameThread() { env::setThreadName("dxvk-frame"); - while (true) { - PresenterFrame frame = { }; + std::unique_lock lock(m_frameMutex); + while (true) { // Wait for all GPU work for this frame to complete in order to maintain // ordering guarantees of the frame signal w.r.t. 
objects being released - { std::unique_lock lock(m_frameMutex); + m_frameCond.wait(lock, [this] { + return !m_frameQueue.empty() && m_frameQueue.front().frameId <= m_lastSignaled; + }); - m_frameCond.wait(lock, [this] { - return !m_frameQueue.empty(); - }); + // Use a frame ID of 0 as an exit condition + PresenterFrame frame = m_frameQueue.front(); - // Use a frame ID of 0 as an exit condition - frame = m_frameQueue.front(); - - if (!frame.frameId) { - m_frameQueue.pop(); - return; - } + if (!frame.frameId) { + m_frameQueue.pop(); + return; } + lock.unlock(); + // If the present operation has succeeded, actually wait for it to complete. // Don't bother with it on MAILBOX / IMMEDIATE modes since doing so would // restrict us to the display refresh rate on some platforms (XWayland). @@ -1246,28 +1238,21 @@ namespace dxvk { if (frame.tracker) frame.tracker->notifyGpuPresentEnd(frame.frameId); - // Apply FPS limiter here to align it as closely with scanout as we can, + // Apply FPS limtier here to align it as closely with scanout as we can, // and delay signaling the frame latency event to emulate behaviour of a // low refresh rate display as closely as we can. m_fpsLimiter.delay(frame.tracker); frame.tracker = nullptr; - // Wake up any thread that may be waiting for the queue to become empty - bool canSignal = false; - - { std::unique_lock lock(m_frameMutex); - - m_frameQueue.pop(); - m_frameDrain.notify_one(); - - m_lastCompleted = frame.frameId; - canSignal = m_lastSignaled >= frame.frameId; - } - // Always signal even on error, since failures here // are transparent to the front-end. 
- if (canSignal) - m_signal->signal(frame.frameId); + m_signal->signal(frame.frameId); + + // Wake up any thread that may be waiting for the queue to become empty + lock.lock(); + + m_frameQueue.pop(); + m_frameDrain.notify_one(); } } diff --git a/src/dxvk/dxvk_presenter.h b/src/dxvk/dxvk_presenter.h index 8e403b244..afbe465c3 100644 --- a/src/dxvk/dxvk_presenter.h +++ b/src/dxvk/dxvk_presenter.h @@ -315,7 +315,6 @@ namespace dxvk { std::queue m_frameQueue; uint64_t m_lastSignaled = 0u; - uint64_t m_lastCompleted = 0u; alignas(CACHE_LINE_SIZE) FpsLimiter m_fpsLimiter; From 978d7cb65be45a868e3e82f714637aef8822d211 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 19:47:50 +0100 Subject: [PATCH 02/44] [dxvk] Add more convenience methods to track buffer barriers --- src/dxvk/dxvk_context.cpp | 75 +++++++++++++++++++++++++++++++++++---- src/dxvk/dxvk_context.h | 29 +++++++++++++++ 2 files changed, 97 insertions(+), 7 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index ec9fa0761..3b3aaa97e 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -936,13 +936,8 @@ namespace dxvk { m_queryManager.endQueries(m_cmd, VK_QUERY_TYPE_PIPELINE_STATISTICS); - accessBuffer(DxvkCmdBuffer::ExecBuffer, - *m_state.id.argBuffer.buffer(), - m_state.id.argBuffer.offset() + offset, - sizeof(VkDispatchIndirectCommand), - VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, - VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT); - + accessDrawBuffer(offset, 1, 0, sizeof(VkDispatchIndirectCommand)); + this->trackDrawBuffer(); } } @@ -7659,6 +7654,18 @@ namespace dxvk { } + void DxvkContext::accessImage( + DxvkCmdBuffer cmdBuffer, + const DxvkImageView& imageView, + VkPipelineStageFlags2 srcStages, + VkAccessFlags2 srcAccess) { + accessImage(cmdBuffer, *imageView.image(), + imageView.imageSubresources(), + imageView.image()->info().layout, + srcStages, srcAccess); + } + + void DxvkContext::accessImage( DxvkCmdBuffer cmdBuffer, DxvkImage& image, @@ 
-7773,6 +7780,35 @@ namespace dxvk { } + void DxvkContext::accessBuffer( + DxvkCmdBuffer cmdBuffer, + const DxvkBufferSlice& bufferSlice, + VkPipelineStageFlags2 srcStages, + VkAccessFlags2 srcAccess) { + accessBuffer(cmdBuffer, + *bufferSlice.buffer(), + bufferSlice.offset(), + bufferSlice.length(), + srcStages, srcAccess); + } + + + void DxvkContext::accessBuffer( + DxvkCmdBuffer cmdBuffer, + const DxvkBufferSlice& bufferSlice, + VkPipelineStageFlags2 srcStages, + VkAccessFlags2 srcAccess, + VkPipelineStageFlags2 dstStages, + VkAccessFlags2 dstAccess) { + accessBuffer(cmdBuffer, + *bufferSlice.buffer(), + bufferSlice.offset(), + bufferSlice.length(), + srcStages, srcAccess, + dstStages, dstAccess); + } + + void DxvkContext::accessBuffer( DxvkCmdBuffer cmdBuffer, DxvkBufferView& bufferView, @@ -7802,6 +7838,31 @@ namespace dxvk { } + void DxvkContext::accessDrawBuffer( + VkDeviceSize offset, + uint32_t count, + uint32_t stride, + uint32_t size) { + uint32_t dataSize = count ? (count - 1u) * stride + size : 0u; + + accessBuffer(DxvkCmdBuffer::ExecBuffer, + *m_state.id.argBuffer.buffer(), + m_state.id.argBuffer.offset() + offset, dataSize, + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR); + } + + + void DxvkContext::accessDrawCountBuffer( + VkDeviceSize offset) { + accessBuffer(DxvkCmdBuffer::ExecBuffer, + *m_state.id.cntBuffer.buffer(), + m_state.id.cntBuffer.offset() + offset, sizeof(uint32_t), + VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR); + } + + void DxvkContext::flushPendingAccesses( DxvkBuffer& buffer, VkDeviceSize offset, diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index b841549b5..24bcb1f0a 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1878,6 +1878,12 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess); + void accessImage( + DxvkCmdBuffer cmdBuffer, + const DxvkImageView& imageView, + VkPipelineStageFlags2 
srcStages, + VkAccessFlags2 srcAccess); + void accessImage( DxvkCmdBuffer cmdBuffer, DxvkImage& image, @@ -1907,6 +1913,20 @@ namespace dxvk { VkPipelineStageFlags2 dstStages, VkAccessFlags2 dstAccess); + void accessBuffer( + DxvkCmdBuffer cmdBuffer, + const DxvkBufferSlice& bufferSlice, + VkPipelineStageFlags2 srcStages, + VkAccessFlags2 srcAccess); + + void accessBuffer( + DxvkCmdBuffer cmdBuffer, + const DxvkBufferSlice& bufferSlice, + VkPipelineStageFlags2 srcStages, + VkAccessFlags2 srcAccess, + VkPipelineStageFlags2 dstStages, + VkAccessFlags2 dstAccess); + void accessBuffer( DxvkCmdBuffer cmdBuffer, DxvkBufferView& bufferView, @@ -1921,6 +1941,15 @@ namespace dxvk { VkPipelineStageFlags2 dstStages, VkAccessFlags2 dstAccess); + void accessDrawBuffer( + VkDeviceSize offset, + uint32_t count, + uint32_t stride, + uint32_t size); + + void accessDrawCountBuffer( + VkDeviceSize offset); + void flushPendingAccesses( DxvkBuffer& buffer, VkDeviceSize offset, From a7c1e7a2a066269b535f36d8138d89cb4450d681 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 19:53:56 +0100 Subject: [PATCH 03/44] [dxvk] Add resource flag to track graphics pipeline side effects --- src/dxvk/dxvk_sparse.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/src/dxvk/dxvk_sparse.h b/src/dxvk/dxvk_sparse.h index 7eb9385a9..d6e71ca17 100644 --- a/src/dxvk/dxvk_sparse.h +++ b/src/dxvk/dxvk_sparse.h @@ -580,6 +580,27 @@ namespace dxvk { m_trackId = 0u; } + /** + * \brief Checks whether the buffer has been used for gfx stores + * + * \returns \c true if any graphics pipeline has written this + * resource via transform feedback or a storage descriptor. + */ + bool hasGfxStores() const { + return m_hasGfxStores; + } + + /** + * \brief Tracks graphics pipeline side effects + * + * Must be called whenever the resource is written via graphics + * pipeline storage descriptors or transform feedback. + * \returns \c true if side effects were already tracked. 
+ */ + bool trackGfxStores() { + return std::exchange(m_hasGfxStores, true); + } + /** * \brief Queries sparse page table * @@ -622,6 +643,8 @@ namespace dxvk { uint64_t m_trackId = { 0u }; uint64_t m_cookie = { 0u }; + bool m_hasGfxStores = false; + static constexpr uint64_t getIncrement(DxvkAccess access) { return uint64_t(1u) << (uint32_t(access) * 20u); } From 21eb682b39bfd8330d29f0634e8b622216c3339b Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 19:54:25 +0100 Subject: [PATCH 04/44] [dxvk] Track indirect draw buffer access --- src/dxvk/dxvk_context.cpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 3b3aaa97e..0ff67c855 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -967,6 +967,9 @@ namespace dxvk { descriptor.buffer.buffer, descriptor.buffer.offset + offset, count, stride); + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) + accessDrawBuffer(offset, count, stride, sizeof(VkDrawIndirectCommand)); } } @@ -986,6 +989,12 @@ namespace dxvk { cntDescriptor.buffer.buffer, cntDescriptor.buffer.offset + countOffset, maxCount, stride); + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) + accessDrawBuffer(offset, maxCount, stride, sizeof(VkDrawIndirectCommand)); + + if (unlikely(m_state.id.cntBuffer.buffer()->hasGfxStores())) + accessDrawCountBuffer(countOffset); } } @@ -1016,6 +1025,9 @@ namespace dxvk { descriptor.buffer.buffer, descriptor.buffer.offset + offset, count, stride); + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) + accessDrawBuffer(offset, count, stride, sizeof(VkDrawIndexedIndirectCommand)); } } @@ -1035,6 +1047,12 @@ namespace dxvk { cntDescriptor.buffer.buffer, cntDescriptor.buffer.offset + countOffset, maxCount, stride); + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) + accessDrawBuffer(offset, maxCount, stride, sizeof(VkDrawIndexedIndirectCommand)); + + if 
(unlikely(m_state.id.cntBuffer.buffer()->hasGfxStores())) + accessDrawCountBuffer(countOffset); } } From 24b58e5858ef8b132678c3d5e6b3823849157f13 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 20:02:37 +0100 Subject: [PATCH 05/44] [dxvk] Track index buffer access --- src/dxvk/dxvk_context.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 0ff67c855..f36a40d81 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -6310,6 +6310,11 @@ namespace dxvk { m_state.vi.indexType); } + if (unlikely(m_state.vi.indexBuffer.buffer()->hasGfxStores())) { + accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.vi.indexBuffer, + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT); + } + m_cmd->track(m_state.vi.indexBuffer.buffer(), DxvkAccess::Read); return true; } From 96337f11d45b13129703ad944215f5068bf94a27 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 20:02:50 +0100 Subject: [PATCH 06/44] [dxvk] Track vertex buffer access --- src/dxvk/dxvk_context.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index f36a40d81..b775b9531 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -6352,6 +6352,11 @@ namespace dxvk { newDynamicStrides &= strides[i] >= m_state.vi.vertexExtents[i]; } + if (unlikely(m_state.vi.vertexBuffers[binding].buffer()->hasGfxStores())) { + accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.vi.vertexBuffers[binding], + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); + } + m_cmd->track(m_state.vi.vertexBuffers[binding].buffer(), DxvkAccess::Read); } else { buffers[i] = VK_NULL_HANDLE; From be9391ded55d8d507d0f9f2b0336cb5b6a52a0a1 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 20:02:57 +0100 Subject: [PATCH 07/44] [dxvk] Track transform feedback buffer access --- src/dxvk/dxvk_context.cpp | 29 
++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index b775b9531..0934c0562 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -5530,8 +5530,14 @@ namespace dxvk { ctrBuffers[i] = physSlice.handle; ctrOffsets[i] = physSlice.offset; - if (physSlice.handle != VK_NULL_HANDLE) - m_cmd->track(m_state.xfb.activeCounters[i].buffer(), DxvkAccess::Read); + if (physSlice.handle) { + accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.xfb.activeCounters[i], + VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT, + VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | + VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); + + m_cmd->track(m_state.xfb.activeCounters[i].buffer(), DxvkAccess::Write); + } } m_cmd->cmdBeginTransformFeedback( @@ -5556,9 +5562,6 @@ namespace dxvk { ctrBuffers[i] = physSlice.handle; ctrOffsets[i] = physSlice.offset; - if (physSlice.handle != VK_NULL_HANDLE) - m_cmd->track(m_state.xfb.activeCounters[i].buffer(), DxvkAccess::Write); - m_state.xfb.activeCounters[i] = DxvkBufferSlice(); } @@ -6407,14 +6410,18 @@ namespace dxvk { xfbOffsets[i] = physSlice.offset; xfbLengths[i] = physSlice.length; - if (physSlice.handle == VK_NULL_HANDLE) + if (!physSlice.handle) xfbBuffers[i] = m_common->dummyResources().bufferHandle(); - - if (physSlice.handle != VK_NULL_HANDLE) { - const Rc& buffer = m_state.xfb.buffers[i].buffer(); + + if (physSlice.handle) { + Rc buffer = m_state.xfb.buffers[i].buffer(); buffer->setXfbVertexStride(gsInfo.xfbStrides[i]); - - m_cmd->track(buffer, DxvkAccess::Write); + + accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.xfb.buffers[i], + VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT, + VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); + + m_cmd->track(std::move(buffer), DxvkAccess::Write); } } From 009f8ee35682965e755cc840f1e32d5cb83ef6b5 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 20:29:17 +0100 Subject: 
[PATCH 08/44] [dxvk] Emit barriers when updating shader resources --- src/dxvk/dxvk_context.cpp | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 0934c0562..4e153a30d 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -899,9 +899,6 @@ namespace dxvk { uint32_t y, uint32_t z) { if (this->commitComputeState()) { - this->commitComputeBarriers(); - this->commitComputeBarriers(); - m_queryManager.beginQueries(m_cmd, VK_QUERY_TYPE_PIPELINE_STATISTICS); @@ -924,9 +921,6 @@ namespace dxvk { sizeof(VkDispatchIndirectCommand), DxvkAccess::Read); if (this->commitComputeState()) { - this->commitComputeBarriers(); - this->commitComputeBarriers(); - m_queryManager.beginQueries(m_cmd, VK_QUERY_TYPE_PIPELINE_STATISTICS); @@ -5918,6 +5912,9 @@ namespace dxvk { descriptorInfo.image.imageView = viewHandle; descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.imageView->image()->hasGfxStores())) + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.imageView->image(), DxvkAccess::Read); } else { descriptorInfo.image.sampler = VK_NULL_HANDLE; @@ -5939,6 +5936,9 @@ namespace dxvk { descriptorInfo.image.imageView = viewHandle; descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || res.imageView->image()->hasGfxStores()) + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.imageView->image(), (binding.access & vk::AccessWriteMask) ? 
DxvkAccess::Write : DxvkAccess::Read); } else { @@ -5961,6 +5961,9 @@ namespace dxvk { descriptorInfo.image.imageView = viewHandle; descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.imageView->image()->hasGfxStores())) + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.sampler); m_cmd->track(res.imageView->image(), DxvkAccess::Read); } else { @@ -5976,6 +5979,9 @@ namespace dxvk { if (res.bufferView != nullptr) { descriptorInfo.texelBuffer = res.bufferView->handle(); + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferView->buffer()->hasGfxStores())) + accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.bufferView->buffer(), DxvkAccess::Read); } else { descriptorInfo.texelBuffer = VK_NULL_HANDLE; @@ -5988,6 +5994,9 @@ namespace dxvk { if (res.bufferView != nullptr) { descriptorInfo.texelBuffer = res.bufferView->handle(); + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || res.bufferView->buffer()->hasGfxStores()) + accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.bufferView->buffer(), (binding.access & vk::AccessWriteMask) ? 
DxvkAccess::Write : DxvkAccess::Read); } else { @@ -6001,6 +6010,9 @@ namespace dxvk { if (res.bufferSlice.length()) { descriptorInfo = res.bufferSlice.getDescriptor(); + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferSlice.buffer()->hasGfxStores())) + accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.bufferSlice.buffer(), DxvkAccess::Read); } else { descriptorInfo.buffer.buffer = VK_NULL_HANDLE; @@ -6015,6 +6027,9 @@ namespace dxvk { if (res.bufferSlice.length()) { descriptorInfo = res.bufferSlice.getDescriptor(); + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferSlice.buffer()->hasGfxStores())) + accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access); + m_cmd->track(res.bufferSlice.buffer(), (binding.access & vk::AccessWriteMask) ? DxvkAccess::Write : DxvkAccess::Read); } else { @@ -6606,7 +6621,9 @@ namespace dxvk { if (unlikely(!this->updateComputePipelineState())) return false; } - + + this->commitComputeBarriers(); + if (m_descriptorState.hasDirtyComputeSets()) this->updateComputeShaderResources(); From 53b076be61dae90ab006c8337630df6b0259a7a5 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 21:24:46 +0100 Subject: [PATCH 09/44] [dxvk] Rework graphics barrier tracking Avoids having to insert redundant barriers when the app does UAV rendering. 
--- src/dxvk/dxvk_context.cpp | 303 ++++++++++++++++++---------------- src/dxvk/dxvk_context.h | 11 +- src/dxvk/dxvk_context_state.h | 5 +- src/dxvk/dxvk_pipelayout.h | 2 +- src/vulkan/vulkan_util.h | 5 + 5 files changed, 181 insertions(+), 145 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 4e153a30d..ed04ef949 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -5170,7 +5170,8 @@ namespace dxvk { void DxvkContext::spillRenderPass(bool suspend) { if (m_flags.test(DxvkContextFlag::GpRenderPassBound)) { - m_flags.clr(DxvkContextFlag::GpRenderPassBound); + m_flags.clr(DxvkContextFlag::GpRenderPassBound, + DxvkContextFlag::GpRenderPassSideEffects); this->pauseTransformFeedback(); @@ -5659,24 +5660,11 @@ namespace dxvk { DxvkGraphicsPipelineFlags newFlags = newPipeline->flags(); DxvkGraphicsPipelineFlags diffFlags = oldFlags ^ newFlags; - DxvkGraphicsPipelineFlags hazardMask( - DxvkGraphicsPipelineFlag::HasTransformFeedback, - DxvkGraphicsPipelineFlag::HasStorageDescriptors); - m_state.gp.flags = newFlags; - if ((diffFlags & hazardMask) != 0) { - // Force-update vertex/index buffers for hazard checks - m_flags.set(DxvkContextFlag::GpDirtyIndexBuffer, - DxvkContextFlag::GpDirtyVertexBuffers, - DxvkContextFlag::GpDirtyXfbBuffers, - DxvkContextFlag::DirtyDrawBuffer); - - // This is necessary because we'll only do hazard - // tracking if the active pipeline has side effects - if (!m_barrierControl.test(DxvkBarrierControl::IgnoreGraphicsBarriers)) - this->spillRenderPass(true); - } + if (newFlags.any(DxvkGraphicsPipelineFlag::HasTransformFeedback, + DxvkGraphicsPipelineFlag::HasStorageDescriptors)) + m_flags.set(DxvkContextFlag::GpRenderPassSideEffects); if (diffFlags.test(DxvkGraphicsPipelineFlag::HasSampleMaskExport)) m_flags.set(DxvkContextFlag::GpDirtyMultisampleState); @@ -6644,32 +6632,33 @@ namespace dxvk { if (m_flags.test(DxvkContextFlag::GpDirtyFramebuffer)) this->updateFramebuffer(); - if 
(!m_flags.test(DxvkContextFlag::GpRenderPassBound)) - this->startRenderPass(); - - if (m_state.gp.flags.any( - DxvkGraphicsPipelineFlag::HasStorageDescriptors, - DxvkGraphicsPipelineFlag::HasTransformFeedback)) { - this->commitGraphicsBarriers(); - + if (m_flags.test(DxvkContextFlag::GpXfbActive)) { // If transform feedback is active and there is a chance that we might // need to rebind the pipeline, we need to end transform feedback and // issue a barrier. End the render pass to do that. Ignore dirty vertex // buffers here since non-dynamic vertex strides are such an extreme // edge case that it's likely irrelevant in practice. - if (m_flags.test(DxvkContextFlag::GpXfbActive) - && m_flags.any(DxvkContextFlag::GpDirtyPipelineState, - DxvkContextFlag::GpDirtySpecConstants)) + if (m_flags.any(DxvkContextFlag::GpDirtyPipelineState, + DxvkContextFlag::GpDirtySpecConstants, + DxvkContextFlag::GpDirtyXfbBuffers)) this->spillRenderPass(true); - - // This can only happen if the render pass was active before, - // so we'll never begin the render pass twice in one draw - if (!m_flags.test(DxvkContextFlag::GpRenderPassBound)) - this->startRenderPass(); - - this->commitGraphicsBarriers(); } + if (m_flags.test(DxvkContextFlag::GpRenderPassSideEffects)) { + // If either the current pipeline has side effects or if there are pending + // writes from previous draws, check for hazards. This also tracks any + // resources written for the first time, but does not emit any barriers + // on its own so calling this outside a render pass is safe. This also + // implicitly dirties all state for which we need to track resource access. + if (this->checkGraphicsHazards()) + this->spillRenderPass(true); + } + + // Start the render pass. This must happen before any render state + // is set up so that we can safely use secondary command buffers. 
+ if (!m_flags.test(DxvkContextFlag::GpRenderPassBound)) + this->startRenderPass(); + if (m_flags.test(DxvkContextFlag::GpDirtyIndexBuffer) && Indexed) { if (unlikely(!this->updateIndexBufferBinding())) return false; @@ -6771,81 +6760,122 @@ namespace dxvk { } } } + + + template + bool DxvkContext::checkResourceHazards( + const DxvkBindingLayout& layout, + uint32_t setMask) { + constexpr bool IsGraphics = BindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS; + + // For graphics, if we are not currently inside a render pass, we'll issue + // a barrier anyway so checking hazards is not meaningful. Avoid some overhead + // and only track written resources in that case. + bool requiresBarrier = IsGraphics && !m_flags.test(DxvkContextFlag::GpRenderPassBound); + + for (auto setIndex : bit::BitMask(setMask)) { + uint32_t bindingCount = layout.getBindingCount(setIndex); + + for (uint32_t j = 0; j < bindingCount; j++) { + const DxvkBindingInfo& binding = layout.getBinding(setIndex, j); + const DxvkShaderResourceSlot& slot = m_rc[binding.resourceBinding]; + + // Skip read-only bindings if we already know that we need a barrier + if (requiresBarrier && !(binding.access & vk::AccessWriteMask)) + continue; + + switch (binding.descriptorType) { + case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { + if (slot.bufferView) { + if (!IsGraphics || slot.bufferView->buffer()->hasGfxStores()) + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, util::pipelineStages(binding.stage), binding.access); + else if (binding.access & vk::AccessWriteMask) + requiresBarrier |= !slot.bufferView->buffer()->trackGfxStores(); + } + } break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { + if (slot.bufferView && (!IsGraphics || slot.bufferView->buffer()->hasGfxStores())) + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, util::pipelineStages(binding.stage), binding.access); + } break; + + case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { + if (slot.bufferSlice.length() && (!IsGraphics || 
slot.bufferSlice.buffer()->hasGfxStores())) + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, util::pipelineStages(binding.stage), binding.access); + } break; + + case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { + if (slot.bufferSlice.length()) { + if (!IsGraphics || slot.bufferSlice.buffer()->hasGfxStores()) + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, util::pipelineStages(binding.stage), binding.access); + else if (binding.access & vk::AccessWriteMask) + requiresBarrier |= !slot.bufferSlice.buffer()->trackGfxStores(); + } + } break; + + case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { + if (slot.imageView) { + if (!IsGraphics || slot.imageView->image()->hasGfxStores()) + requiresBarrier |= checkImageViewBarrier(slot.imageView, util::pipelineStages(binding.stage), binding.access); + else if (binding.access & vk::AccessWriteMask) + requiresBarrier |= !slot.imageView->image()->trackGfxStores(); + } + } break; + + case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: + case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { + if (slot.imageView && (!IsGraphics || slot.imageView->image()->hasGfxStores())) + requiresBarrier |= checkImageViewBarrier(slot.imageView, util::pipelineStages(binding.stage), binding.access); + } break; + + default: + /* nothing to do */; + } + + // We don't need to do any extra tracking for compute here, exit early + if (requiresBarrier && !IsGraphics) + return true; + } + } + + return requiresBarrier; + } - template - void DxvkContext::commitGraphicsBarriers() { + template + bool DxvkContext::checkGraphicsHazards() { if (m_barrierControl.test(DxvkBarrierControl::IgnoreGraphicsBarriers)) - return; + return false; - constexpr auto storageBufferAccess = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT; - constexpr auto storageImageAccess = VK_ACCESS_SHADER_WRITE_BIT; + // Check shader resources on every draw to handle WAW hazards, and to make + // sure that writes are handled properly. 
If the pipeline does not have any + // storage descriptors, we only need to check dirty resources. + const auto& layout = m_state.gp.pipeline->getBindings()->layout(); - bool requiresBarrier = false; + uint32_t setMask = layout.getSetMask(); - // Check the draw buffer for indirect draw calls - if (m_flags.test(DxvkContextFlag::DirtyDrawBuffer) && Indirect) { - std::array slices = {{ - &m_state.id.argBuffer, - &m_state.id.cntBuffer, - }}; + if (!m_state.gp.flags.test(DxvkGraphicsPipelineFlag::HasStorageDescriptors)) + setMask &= m_descriptorState.getDirtyGraphicsSets(); - for (uint32_t i = 0; i < slices.size() && !requiresBarrier; i++) { - if ((slices[i]->length()) - && (slices[i]->buffer()->info().access & storageBufferAccess)) { - requiresBarrier = this->checkBufferBarrier(*slices[i], - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, - VK_ACCESS_INDIRECT_COMMAND_READ_BIT); - } - } - } + bool requiresBarrier = checkResourceHazards(layout, setMask); - // Read-only stage, so we only have to check this if - // the bindngs have actually changed between draws - if (m_flags.test(DxvkContextFlag::GpDirtyIndexBuffer) && !requiresBarrier && Indexed) { - const auto& indexBufferSlice = m_state.vi.indexBuffer; - - if ((indexBufferSlice.length()) - && (indexBufferSlice.bufferInfo().access & storageBufferAccess)) { - requiresBarrier = this->checkBufferBarrier(indexBufferSlice, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_INDEX_READ_BIT); - } - } - - // Same here, also ignore unused vertex bindings - if (m_flags.test(DxvkContextFlag::GpDirtyVertexBuffers)) { - uint32_t bindingCount = m_state.gp.state.il.bindingCount(); - - for (uint32_t i = 0; i < bindingCount && !requiresBarrier; i++) { - uint32_t binding = m_state.gp.state.ilBindings[i].binding(); - const auto& vertexBufferSlice = m_state.vi.vertexBuffers[binding]; - - if ((vertexBufferSlice.length()) - && (vertexBufferSlice.bufferInfo().access & storageBufferAccess)) { - requiresBarrier = this->checkBufferBarrier(vertexBufferSlice, 
- VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); - } - } - } - - // Transform feedback buffer writes won't overlap, so we - // also only need to check those when they are rebound + // Transform feedback buffer writes won't overlap, so we also only need to + // check those if dirty. if (m_flags.test(DxvkContextFlag::GpDirtyXfbBuffers) && m_state.gp.flags.test(DxvkGraphicsPipelineFlag::HasTransformFeedback)) { - for (uint32_t i = 0; i < MaxNumXfbBuffers && !requiresBarrier; i++) { + for (uint32_t i = 0; i < MaxNumXfbBuffers; i++) { const auto& xfbBufferSlice = m_state.xfb.buffers[i]; const auto& xfbCounterSlice = m_state.xfb.activeCounters[i]; if (xfbBufferSlice.length()) { - requiresBarrier = this->checkBufferBarrier(xfbBufferSlice, - VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); + requiresBarrier |= !xfbBufferSlice.buffer()->trackGfxStores(); + requiresBarrier |= checkBufferBarrier(xfbBufferSlice, + VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); if (xfbCounterSlice.length()) { - requiresBarrier |= this->checkBufferBarrier(xfbCounterSlice, - VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT | + requiresBarrier |= !xfbCounterSlice.buffer()->trackGfxStores(); + requiresBarrier |= checkBufferBarrier(xfbCounterSlice, VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); @@ -6854,56 +6884,53 @@ namespace dxvk { } } - // Check shader resources on every draw to handle WAW hazards - auto layout = m_state.gp.pipeline->getBindings()->layout(); + // From now on, we only have read-only resources to check and can + // exit early if we find a hazard. 
+ if (requiresBarrier) + return true; - for (uint32_t i = 0; i < DxvkDescriptorSets::SetCount && !requiresBarrier; i++) { - uint32_t bindingCount = layout.getBindingCount(i); + // Check the draw buffer for indirect draw calls + if (m_flags.test(DxvkContextFlag::DirtyDrawBuffer) && Indirect) { + std::array slices = {{ + &m_state.id.argBuffer, + &m_state.id.cntBuffer, + }}; - for (uint32_t j = 0; j < bindingCount && !requiresBarrier; j++) { - const DxvkBindingInfo& binding = layout.getBinding(i, j); - const DxvkShaderResourceSlot& slot = m_rc[binding.resourceBinding]; - - switch (binding.descriptorType) { - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - if ((slot.bufferSlice.length()) - && (slot.bufferSlice.bufferInfo().access & storageBufferAccess)) { - requiresBarrier = this->checkBufferBarrier(slot.bufferSlice, - util::pipelineStages(binding.stage), binding.access); - } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - if ((slot.bufferView != nullptr) - && (slot.bufferView->buffer()->info().access & storageBufferAccess)) { - requiresBarrier = this->checkBufferViewBarrier(slot.bufferView, - util::pipelineStages(binding.stage), binding.access); - } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - if ((slot.imageView != nullptr) - && (slot.imageView->image()->info().access & storageImageAccess)) { - requiresBarrier = this->checkImageViewBarrier(slot.imageView, - util::pipelineStages(binding.stage), binding.access); - } - break; - - default: - /* nothing to do */; + for (uint32_t i = 0; i < slices.size(); i++) { + if (slices[i]->length() && slices[i]->buffer()->hasGfxStores()) { + if (checkBufferBarrier(*slices[i], VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT)) + return true; } } } - // External subpass dependencies serve as full memory - // and 
execution barriers, so we can use this to allow - // inter-stage synchronization. - if (requiresBarrier) - this->spillRenderPass(true); + // Read-only stage, so we only have to check this if + // the bindngs have actually changed between draws + if (m_flags.test(DxvkContextFlag::GpDirtyIndexBuffer) && Indexed) { + const auto& indexBufferSlice = m_state.vi.indexBuffer; + + if (indexBufferSlice.length() && indexBufferSlice.buffer()->hasGfxStores()) { + if (checkBufferBarrier(indexBufferSlice, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT)) + return true; + } + } + + // Same here, also ignore unused vertex bindings + if (m_flags.test(DxvkContextFlag::GpDirtyVertexBuffers)) { + uint32_t bindingCount = m_state.gp.state.il.bindingCount(); + + for (uint32_t i = 0; i < bindingCount; i++) { + uint32_t binding = m_state.gp.state.ilBindings[i].binding(); + const auto& vertexBufferSlice = m_state.vi.vertexBuffers[binding]; + + if (vertexBufferSlice.length() && vertexBufferSlice.buffer()->hasGfxStores()) { + if (checkBufferBarrier(vertexBufferSlice, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)) + return true; + } + } + } + + return false; } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 24bcb1f0a..c70b1701d 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1757,13 +1757,16 @@ namespace dxvk { template bool commitGraphicsState(); + template + bool checkResourceHazards( + const DxvkBindingLayout& layout, + uint32_t setMask); + template void commitComputeBarriers(); - void commitComputePostBarriers(); - - template - void commitGraphicsBarriers(); + template + bool checkGraphicsHazards(); template bool checkBufferBarrier( diff --git a/src/dxvk/dxvk_context_state.h b/src/dxvk/dxvk_context_state.h index 2a753fee3..6e565e6a1 100644 --- a/src/dxvk/dxvk_context_state.h +++ b/src/dxvk/dxvk_context_state.h @@ -20,10 +20,11 @@ namespace dxvk { * of the graphics and compute pipelines * has changed and/or 
needs to be updated. */ - enum class DxvkContextFlag : uint32_t { + enum class DxvkContextFlag : uint64_t { GpRenderPassBound, ///< Render pass is currently bound GpRenderPassSuspended, ///< Render pass is currently suspended GpRenderPassSecondaryCmd, ///< Render pass uses secondary command buffer + GpRenderPassSideEffects, ///< Render pass has side effects GpXfbActive, ///< Transform feedback is enabled GpDirtyFramebuffer, ///< Framebuffer binding is out of date GpDirtyPipeline, ///< Graphics pipeline binding is out of date @@ -59,7 +60,7 @@ namespace dxvk { Count }; - static_assert(uint32_t(DxvkContextFlag::Count) <= 32u); + static_assert(uint32_t(DxvkContextFlag::Count) <= 64u); using DxvkContextFlags = Flags; diff --git a/src/dxvk/dxvk_pipelayout.h b/src/dxvk/dxvk_pipelayout.h index cccfb336e..5fb05c6b2 100644 --- a/src/dxvk/dxvk_pipelayout.h +++ b/src/dxvk/dxvk_pipelayout.h @@ -590,4 +590,4 @@ namespace dxvk { }; -} \ No newline at end of file +} diff --git a/src/vulkan/vulkan_util.h b/src/vulkan/vulkan_util.h index 2995c4f7d..f872d5801 100644 --- a/src/vulkan/vulkan_util.h +++ b/src/vulkan/vulkan_util.h @@ -40,6 +40,11 @@ namespace dxvk::vk { = VK_ACCESS_HOST_READ_BIT | VK_ACCESS_HOST_WRITE_BIT; + constexpr static VkAccessFlags AccessGfxSideEffectMask + = VK_ACCESS_SHADER_WRITE_BIT + | VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT + | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT; + constexpr static VkPipelineStageFlags StageDeviceMask = VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT From 3bbae86ec9587162e0addaa9e1f1ae09dad9d501 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 21:42:30 +0100 Subject: [PATCH 10/44] [dxvk] Rework compute barrier tracking --- src/dxvk/dxvk_context.cpp | 76 ++++++++++----------------------------- src/dxvk/dxvk_context.h | 3 +- 2 files changed, 19 insertions(+), 60 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index ed04ef949..7a8deeb86 100644 --- 
a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -6610,7 +6610,13 @@ namespace dxvk { return false; } - this->commitComputeBarriers(); + if (this->checkComputeHazards()) { + this->flushBarriers(); + + // Dirty descriptors if this hasn't happened yet for + // whatever reason in order to re-emit barriers + m_descriptorState.dirtyStages(VK_SHADER_STAGE_COMPUTE_BIT); + } if (m_descriptorState.hasDirtyComputeSets()) this->updateComputeShaderResources(); @@ -6705,63 +6711,6 @@ namespace dxvk { } - template - void DxvkContext::commitComputeBarriers() { - const auto& layout = m_state.cp.pipeline->getBindings()->layout(); - - // Exit early if we're only checking for hazards and - // if the barrier set is empty, to avoid some overhead. - if (!DoEmit && m_barrierTracker.empty()) - return; - - for (uint32_t i = 0; i < DxvkDescriptorSets::CsSetCount; i++) { - uint32_t bindingCount = layout.getBindingCount(i); - - for (uint32_t j = 0; j < bindingCount; j++) { - const DxvkBindingInfo& binding = layout.getBinding(i, j); - const DxvkShaderResourceSlot& slot = m_rc[binding.resourceBinding]; - - bool requiresBarrier = false; - - switch (binding.descriptorType) { - case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: - if (likely(slot.bufferSlice.length())) { - requiresBarrier = this->checkBufferBarrier(slot.bufferSlice, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, binding.access); - } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: - case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: - if (likely(slot.bufferView != nullptr)) { - requiresBarrier = this->checkBufferViewBarrier(slot.bufferView, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, binding.access); - } - break; - - case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: - case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: - case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: - if (likely(slot.imageView != nullptr)) { - requiresBarrier = this->checkImageViewBarrier(slot.imageView, - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
binding.access); - } - break; - - default: - /* nothing to do */; - } - - if (requiresBarrier) { - flushBarriers(); - return; - } - } - } - } - - template bool DxvkContext::checkResourceHazards( const DxvkBindingLayout& layout, @@ -6842,6 +6791,17 @@ namespace dxvk { } + bool DxvkContext::checkComputeHazards() { + // Exit early if we know that there cannot be any hazards to avoid + // some overhead after barriers are flushed. This is common. + if (m_barrierTracker.empty()) + return false; + + const auto& layout = m_state.cp.pipeline->getBindings()->layout(); + return checkResourceHazards(layout, layout.getSetMask()); + } + + template bool DxvkContext::checkGraphicsHazards() { if (m_barrierControl.test(DxvkBarrierControl::IgnoreGraphicsBarriers)) diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index c70b1701d..d1131032f 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1762,8 +1762,7 @@ namespace dxvk { const DxvkBindingLayout& layout, uint32_t setMask); - template - void commitComputeBarriers(); + bool checkComputeHazards(); template bool checkGraphicsHazards(); From 04d2609a914f99b37f5df4ac5432bcb779eca9b5 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 21:49:51 +0100 Subject: [PATCH 11/44] [dxvk] Clean up shader resource hazard checking --- src/dxvk/dxvk_context.cpp | 91 +++++++++++---------------------------- src/dxvk/dxvk_context.h | 29 ++++++++----- 2 files changed, 43 insertions(+), 77 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 7a8deeb86..2d1ac6c54 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -6737,7 +6737,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { if (slot.bufferView) { if (!IsGraphics || slot.bufferView->buffer()->hasGfxStores()) - requiresBarrier |= checkBufferViewBarrier(slot.bufferView, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, 
binding.access); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.bufferView->buffer()->trackGfxStores(); } @@ -6745,18 +6745,18 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { if (slot.bufferView && (!IsGraphics || slot.bufferView->buffer()->hasGfxStores())) - requiresBarrier |= checkBufferViewBarrier(slot.bufferView, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, binding.access); } break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { if (slot.bufferSlice.length() && (!IsGraphics || slot.bufferSlice.buffer()->hasGfxStores())) - requiresBarrier |= checkBufferBarrier(slot.bufferSlice, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access); } break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { if (slot.bufferSlice.length()) { if (!IsGraphics || slot.bufferSlice.buffer()->hasGfxStores()) - requiresBarrier |= checkBufferBarrier(slot.bufferSlice, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.bufferSlice.buffer()->trackGfxStores(); } @@ -6765,7 +6765,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { if (slot.imageView) { if (!IsGraphics || slot.imageView->image()->hasGfxStores()) - requiresBarrier |= checkImageViewBarrier(slot.imageView, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.imageView->image()->trackGfxStores(); } @@ -6774,7 +6774,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { if (slot.imageView && (!IsGraphics || slot.imageView->image()->hasGfxStores())) - requiresBarrier |= 
checkImageViewBarrier(slot.imageView, util::pipelineStages(binding.stage), binding.access); + requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access); } break; default: @@ -6829,14 +6829,12 @@ namespace dxvk { if (xfbBufferSlice.length()) { requiresBarrier |= !xfbBufferSlice.buffer()->trackGfxStores(); - requiresBarrier |= checkBufferBarrier(xfbBufferSlice, - VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); + requiresBarrier |= checkBufferBarrier( + xfbBufferSlice, VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); if (xfbCounterSlice.length()) { requiresBarrier |= !xfbCounterSlice.buffer()->trackGfxStores(); - requiresBarrier |= checkBufferBarrier(xfbCounterSlice, - VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT, + requiresBarrier |= checkBufferBarrier(xfbCounterSlice, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); } @@ -6858,7 +6856,7 @@ namespace dxvk { for (uint32_t i = 0; i < slices.size(); i++) { if (slices[i]->length() && slices[i]->buffer()->hasGfxStores()) { - if (checkBufferBarrier(*slices[i], VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT, VK_ACCESS_INDIRECT_COMMAND_READ_BIT)) + if (checkBufferBarrier(*slices[i], VK_ACCESS_INDIRECT_COMMAND_READ_BIT)) return true; } } @@ -6870,7 +6868,7 @@ namespace dxvk { const auto& indexBufferSlice = m_state.vi.indexBuffer; if (indexBufferSlice.length() && indexBufferSlice.buffer()->hasGfxStores()) { - if (checkBufferBarrier(indexBufferSlice, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT)) + if (checkBufferBarrier(indexBufferSlice, VK_ACCESS_INDEX_READ_BIT)) return true; } } @@ -6884,7 +6882,7 @@ namespace dxvk { const auto& vertexBufferSlice = m_state.vi.vertexBuffers[binding]; if (vertexBufferSlice.length() && vertexBufferSlice.buffer()->hasGfxStores()) { - if (checkBufferBarrier(vertexBufferSlice, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)) + if 
(checkBufferBarrier(vertexBufferSlice, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)) return true; } } @@ -6894,73 +6892,34 @@ namespace dxvk { } - template + template bool DxvkContext::checkBufferBarrier( const DxvkBufferSlice& bufferSlice, - VkPipelineStageFlags stages, VkAccessFlags access) { - if constexpr (DoEmit) { - accessBuffer(DxvkCmdBuffer::ExecBuffer, - *bufferSlice.buffer(), bufferSlice.offset(), - bufferSlice.length(), stages, access); - return false; - } else { - return checkResourceBarrier([this, &bufferSlice] (DxvkAccess access) { - return resourceHasAccess(*bufferSlice.buffer(), - bufferSlice.offset(), bufferSlice.length(), access); - }, stages, access); - } + return checkResourceBarrier([this, &bufferSlice] (DxvkAccess access) { + return resourceHasAccess(*bufferSlice.buffer(), + bufferSlice.offset(), bufferSlice.length(), access); + }, access); } - template + template bool DxvkContext::checkBufferViewBarrier( const Rc& bufferView, - VkPipelineStageFlags stages, VkAccessFlags access) { - if constexpr (DoEmit) { - accessBuffer(DxvkCmdBuffer::ExecBuffer, - *bufferView, stages, access); - return false; - } else { - return checkResourceBarrier([this, &bufferView] (DxvkAccess access) { - return resourceHasAccess(*bufferView, access); - }, stages, access); - } + return checkResourceBarrier([this, &bufferView] (DxvkAccess access) { + return resourceHasAccess(*bufferView, access); + }, access); } - template + template bool DxvkContext::checkImageViewBarrier( const Rc& imageView, - VkPipelineStageFlags stages, VkAccessFlags access) { - if constexpr (DoEmit) { - accessImage(DxvkCmdBuffer::ExecBuffer, - *imageView->image(), - imageView->imageSubresources(), - imageView->image()->info().layout, - stages, access); - return false; - } else { - return checkResourceBarrier([this, &imageView] (DxvkAccess access) { - return resourceHasAccess(*imageView, access); - }, stages, access); - } - } - - - bool DxvkContext::canIgnoreWawHazards(VkPipelineStageFlags stages) { - if 
(!m_barrierControl.test(DxvkBarrierControl::IgnoreWriteAfterWrite)) - return false; - - if (stages & VK_SHADER_STAGE_COMPUTE_BIT) { - VkPipelineStageFlags2 stageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT - | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; - return !m_execBarriers.hasPendingStages(~stageMask); - } - - return true; + return checkResourceBarrier([this, &imageView] (DxvkAccess access) { + return resourceHasAccess(*imageView, access); + }, access); } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index d1131032f..d0b72f0f6 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1767,26 +1767,34 @@ namespace dxvk { template bool checkGraphicsHazards(); - template + template bool checkBufferBarrier( const DxvkBufferSlice& bufferSlice, - VkPipelineStageFlags stages, VkAccessFlags access); - template + template bool checkBufferViewBarrier( const Rc& bufferView, - VkPipelineStageFlags stages, VkAccessFlags access); - template + template bool checkImageViewBarrier( const Rc& imageView, - VkPipelineStageFlags stages, VkAccessFlags access); - bool canIgnoreWawHazards( - VkPipelineStageFlags stages); + template + bool canIgnoreWawHazards() { + if (!m_barrierControl.test(DxvkBarrierControl::IgnoreWriteAfterWrite)) + return false; + + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { + VkPipelineStageFlags2 stageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT + | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; + return !m_execBarriers.hasPendingStages(~stageMask); + } + + return true; + } void emitMemoryBarrier( VkPipelineStageFlags srcStages, @@ -2011,10 +2019,9 @@ namespace dxvk { const Rc& image, DxvkAccess access); - template + template bool checkResourceBarrier( const Pred& pred, - VkPipelineStageFlags stages, VkAccessFlags access) { // Check for read-after-write first, this is common bool hasPendingWrite = pred(DxvkAccess::Write); @@ -2024,7 +2031,7 @@ namespace dxvk { // Check for a write-after-write hazard, but // ignore it if there are no 
reads involved. - bool ignoreWaW = canIgnoreWawHazards(stages); + bool ignoreWaW = canIgnoreWawHazards(); if (hasPendingWrite && !ignoreWaW) return true; From 48d145fff6231f46a65ecbdc4adb1ffd039682bf Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 23:36:38 +0100 Subject: [PATCH 12/44] [dxvk] Change debug color for pipelines with side effects --- src/dxvk/dxvk_context.cpp | 15 ++++++++++++++- src/dxvk/dxvk_context.h | 2 ++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 2d1ac6c54..ae353c996 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -5769,8 +5769,10 @@ namespace dxvk { } if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) { + uint32_t color = getGraphicsPipelineDebugColor(); + m_cmd->cmdInsertDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, - vk::makeLabel(0xa2dcf0, m_state.gp.pipeline->debugName())); + vk::makeLabel(color, m_state.gp.pipeline->debugName())); } m_flags.clr(DxvkContextFlag::GpDirtyPipelineState); @@ -5778,6 +5780,17 @@ namespace dxvk { } + uint32_t DxvkContext::getGraphicsPipelineDebugColor() const { + if (m_state.gp.flags.test(DxvkGraphicsPipelineFlag::HasStorageDescriptors)) + return 0xf0a2dc; + + if (m_state.gp.flags.test(DxvkGraphicsPipelineFlag::HasTransformFeedback)) + return 0xa2f0dc; + + return 0xa2dcf0; + } + + template void DxvkContext::resetSpecConstants( uint32_t newMask) { diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index d0b72f0f6..57d2fd45a 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1693,6 +1693,8 @@ namespace dxvk { bool updateGraphicsPipeline(); bool updateGraphicsPipelineState(DxvkGlobalPipelineBarrier srcBarrier); + uint32_t getGraphicsPipelineDebugColor() const; + template void resetSpecConstants( uint32_t newMask); From 18e5c12b6d9187a6e9c7dc904f5467aaa655f703 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Fri, 14 Feb 2025 23:48:19 +0100 Subject: 
[PATCH 13/44] [dxvk] Fix resource hazard checks Turns out we've had broken write-after-read checks for a while. --- src/dxvk/dxvk_context.h | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 57d2fd45a..5e5dc5a2b 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -2025,20 +2025,33 @@ namespace dxvk { bool checkResourceBarrier( const Pred& pred, VkAccessFlags access) { - // Check for read-after-write first, this is common + // If we're only reading the resource, only pending + // writes matter for synchronization purposes. bool hasPendingWrite = pred(DxvkAccess::Write); - if (access & vk::AccessReadMask) + if (!(access & vk::AccessWriteMask)) return hasPendingWrite; - // Check for a write-after-write hazard, but - // ignore it if there are no reads involved. - bool ignoreWaW = canIgnoreWawHazards(); + if (hasPendingWrite) { + // If there is a write-after-write hazard and synchronization + // for those is not explicitly disabled, insert a barrier. + if (!canIgnoreWawHazards()) + return true; - if (hasPendingWrite && !ignoreWaW) - return true; + // If write-after-write checking is disabled and we're on graphics, + // be aggressive about avoiding barriers and ignore any reads if we + // do find a write-after-write hazard. This essentially assumes that + // back-to-back read-modify-write operations are safe, but will still + // consider read-only or transform feedback operations as unsafe. + if (BindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) + return !(access & VK_ACCESS_SHADER_WRITE_BIT); - // Check whether there are any pending reads. + // On compute, if we are reading the resource, add a barrier. + if (access & vk::AccessReadMask) + return true; + } + + // Check if there are any pending reads to avoid write-after-read issues. 
return pred(DxvkAccess::Read); } From d37a13847a6503bcff4b23939d7354a2f13f6a65 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 00:46:01 +0100 Subject: [PATCH 14/44] [dxvk] Rename and repurpose ignoreGraphicsBarriers option Less nuclear approach that hopefully works just as well in practice. --- src/d3d11/d3d11_context_ext.cpp | 5 +---- src/d3d11/d3d11_device.h | 6 +++--- src/d3d11/d3d11_interfaces.h | 4 +++- src/d3d11/d3d11_options.cpp | 4 ++-- src/d3d11/d3d11_options.h | 4 ++-- src/dxvk/dxvk_context.cpp | 3 --- src/dxvk/dxvk_context.h | 6 +++++- src/dxvk/dxvk_context_state.h | 4 ++-- src/util/config/config.cpp | 7 +++---- 9 files changed, 21 insertions(+), 22 deletions(-) diff --git a/src/d3d11/d3d11_context_ext.cpp b/src/d3d11/d3d11_context_ext.cpp index e8c81e1dc..aac32b887 100644 --- a/src/d3d11/d3d11_context_ext.cpp +++ b/src/d3d11/d3d11_context_ext.cpp @@ -147,10 +147,7 @@ namespace dxvk { DxvkBarrierControlFlags flags = parent->GetOptionsBarrierControlFlags(); if (ControlFlags & D3D11_VK_BARRIER_CONTROL_IGNORE_WRITE_AFTER_WRITE) - flags.set(DxvkBarrierControl::IgnoreWriteAfterWrite); - - if (ControlFlags & D3D11_VK_BARRIER_CONTROL_IGNORE_GRAPHICS_UAV) - flags.set(DxvkBarrierControl::IgnoreGraphicsBarriers); + flags.set(DxvkBarrierControl::IgnoreComputeWriteAfterWrite, DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite); m_ctx->EmitCs([cFlags = flags] (DxvkContext* ctx) { ctx->setBarrierControl(cFlags); diff --git a/src/d3d11/d3d11_device.h b/src/d3d11/d3d11_device.h index 48356f8b4..27d1900a4 100644 --- a/src/d3d11/d3d11_device.h +++ b/src/d3d11/d3d11_device.h @@ -475,10 +475,10 @@ namespace dxvk { DxvkBarrierControlFlags barrierControl; if (m_d3d11Options.relaxedBarriers) - barrierControl.set(DxvkBarrierControl::IgnoreWriteAfterWrite); + barrierControl.set(DxvkBarrierControl::IgnoreComputeWriteAfterWrite); - if (m_d3d11Options.ignoreGraphicsBarriers) - barrierControl.set(DxvkBarrierControl::IgnoreGraphicsBarriers); + if 
(m_d3d11Options.relaxedBarriers || m_d3d11Options.relaxedGraphicsBarriers) + barrierControl.set(DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite); return barrierControl; } diff --git a/src/d3d11/d3d11_interfaces.h b/src/d3d11/d3d11_interfaces.h index c2288bd04..7c6b3a479 100644 --- a/src/d3d11/d3d11_interfaces.h +++ b/src/d3d11/d3d11_interfaces.h @@ -24,7 +24,9 @@ enum D3D11_VK_EXTENSION : uint32_t { */ enum D3D11_VK_BARRIER_CONTROL : uint32_t { D3D11_VK_BARRIER_CONTROL_IGNORE_WRITE_AFTER_WRITE = 1 << 0, - D3D11_VK_BARRIER_CONTROL_IGNORE_GRAPHICS_UAV = 1 << 1, + + // Removed: + // D3D11_VK_BARRIER_CONTROL_IGNORE_GRAPHICS_UAV = 1 << 1, }; diff --git a/src/d3d11/d3d11_options.cpp b/src/d3d11/d3d11_options.cpp index da8e2ca55..6b0705c14 100644 --- a/src/d3d11/d3d11_options.cpp +++ b/src/d3d11/d3d11_options.cpp @@ -17,7 +17,7 @@ namespace dxvk { this->zeroInitWorkgroupMemory = config.getOption("d3d11.zeroInitWorkgroupMemory", false); this->forceVolatileTgsmAccess = config.getOption("d3d11.forceVolatileTgsmAccess", false); this->relaxedBarriers = config.getOption("d3d11.relaxedBarriers", false); - this->ignoreGraphicsBarriers = config.getOption("d3d11.ignoreGraphicsBarriers", false); + this->relaxedGraphicsBarriers = config.getOption("d3d11.relaxedGraphicsBarriers", false); this->maxTessFactor = config.getOption("d3d11.maxTessFactor", 0); this->samplerAnisotropy = config.getOption("d3d11.samplerAnisotropy", -1); this->samplerLodBias = config.getOption("d3d11.samplerLodBias", 0.0f); @@ -61,4 +61,4 @@ namespace dxvk { this->shaderDumpPath = env::getEnvVar("DXVK_SHADER_DUMP_PATH"); } -} \ No newline at end of file +} diff --git a/src/d3d11/d3d11_options.h b/src/d3d11/d3d11_options.h index f41979d56..b1fe1e7af 100644 --- a/src/d3d11/d3d11_options.h +++ b/src/d3d11/d3d11_options.h @@ -43,7 +43,7 @@ namespace dxvk { /// /// May improve performance in some games, /// but might also cause rendering issues. 
- bool ignoreGraphicsBarriers = false; + bool relaxedGraphicsBarriers = false; /// Maximum tessellation factor. /// @@ -114,4 +114,4 @@ namespace dxvk { std::string shaderDumpPath; }; -} \ No newline at end of file +} diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index ae353c996..f2d1d6e3c 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -6817,9 +6817,6 @@ namespace dxvk { template bool DxvkContext::checkGraphicsHazards() { - if (m_barrierControl.test(DxvkBarrierControl::IgnoreGraphicsBarriers)) - return false; - // Check shader resources on every draw to handle WAW hazards, and to make // sure that writes are handled properly. If the pipeline does not have any // storage descriptors, we only need to check dirty resources. diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 5e5dc5a2b..ace683ebf 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1786,7 +1786,11 @@ namespace dxvk { template bool canIgnoreWawHazards() { - if (!m_barrierControl.test(DxvkBarrierControl::IgnoreWriteAfterWrite)) + constexpr auto controlFlag = BindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS + ? DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite + : DxvkBarrierControl::IgnoreComputeWriteAfterWrite; + + if (!m_barrierControl.test(controlFlag)) return false; if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { diff --git a/src/dxvk/dxvk_context_state.h b/src/dxvk/dxvk_context_state.h index 6e565e6a1..ba39e14fa 100644 --- a/src/dxvk/dxvk_context_state.h +++ b/src/dxvk/dxvk_context_state.h @@ -86,8 +86,8 @@ namespace dxvk { * synchronize implicitly. 
*/ enum class DxvkBarrierControl : uint32_t { - IgnoreWriteAfterWrite = 1, - IgnoreGraphicsBarriers = 2, + IgnoreComputeWriteAfterWrite = 0, + IgnoreGraphicsWriteAfterWrite = 1, }; using DxvkBarrierControlFlags = Flags; diff --git a/src/util/config/config.cpp b/src/util/config/config.cpp index 45e3f8c8d..9a65d478c 100644 --- a/src/util/config/config.cpp +++ b/src/util/config/config.cpp @@ -291,12 +291,11 @@ namespace dxvk { /* Final Fantasy XV: VXAO does thousands of * * draw calls with the same UAV bound */ { R"(\\ffxv_s\.exe$)", {{ - { "d3d11.ignoreGraphicsBarriers", "True" }, + { "d3d11.relaxedGraphicsBarriers", "True" }, }} }, /* God of War - relies on NVAPI/AMDAGS for * * barrier stuff, needs nvapi for DLSS */ { R"(\\GoW\.exe$)", {{ - { "d3d11.ignoreGraphicsBarriers", "True" }, { "d3d11.relaxedBarriers", "True" }, { "dxgi.hideNvidiaGpu", "False" }, { "dxgi.maxFrameLatency", "1" }, @@ -334,7 +333,7 @@ namespace dxvk { * presumably for culling, which doesn't play * * nicely with D3D11 without vendor libraries */ { R"(\\Stray-Win64-Shipping\.exe$)", {{ - { "d3d11.ignoreGraphicsBarriers", "True" }, + { "d3d11.relaxedGraphicsBarriers", "True" }, }} }, /* Metal Gear Solid V: Ground Zeroes * * Texture quality can break at high vram */ @@ -433,7 +432,7 @@ namespace dxvk { * and assumes that AMD GPUs do not expose * * native command lists for AGS usage */ { R"(\\granblue_fantasy_relink\.exe$)", {{ - { "d3d11.ignoreGraphicsBarriers", "True" }, + { "d3d11.relaxedGraphicsBarriers", "True" }, { "d3d11.exposeDriverCommandLists", "False" }, { "dxgi.hideNvidiaGpu", "False" }, }} }, From 07f7ccdc9637265312ecbf7be95c9621e0decd9d Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sun, 16 Feb 2025 13:53:46 +0100 Subject: [PATCH 15/44] [dxvk,d3d11] Fix draw buffer tracking for DrawAuto Not like anybody uses this feature, but we need to both check for hazards and make sure the SO counter actually gets tracked. Use the existing draw buffer mechanism for this. 
--- src/d3d11/d3d11_context.cpp | 16 ++++++++++++++-- src/dxvk/dxvk_context.cpp | 16 +++++++++------- src/dxvk/dxvk_context.h | 8 ++++---- 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index bdbbf25a6..0868ab9ed 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -1009,10 +1009,22 @@ namespace dxvk { if (!ctrBuf.defined()) return; - EmitCs([=] (DxvkContext* ctx) { - ctx->drawIndirectXfb(ctrBuf, + // We bind the SO counter as an indirect count buffer, + // so reset any tracking we may have been doing here. + m_state.id.reset(); + + EmitCs([=] (DxvkContext* ctx) mutable { + ctx->bindDrawBuffers(DxvkBufferSlice(), + Forwarder::move(ctrBuf)); + + ctx->drawIndirectXfb(0u, vtxBuf.buffer()->getXfbVertexStride(), vtxBuf.offset()); + + // Reset draw buffer right away so we don't + // keep the SO counter alive indefinitely + ctx->bindDrawBuffers(DxvkBufferSlice(), + DxvkBufferSlice()); }); } diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index f2d1d6e3c..95aa05d4e 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -1052,17 +1052,19 @@ namespace dxvk { void DxvkContext::drawIndirectXfb( - const DxvkBufferSlice& counterBuffer, + VkDeviceSize counterOffset, uint32_t counterDivisor, uint32_t counterBias) { - if (this->commitGraphicsState()) { - auto physSlice = counterBuffer.getSliceHandle(); + if (this->commitGraphicsState()) { + auto physSlice = m_state.id.cntBuffer.getSliceHandle(); m_cmd->cmdDrawIndirectVertexCount(1, 0, - physSlice.handle, - physSlice.offset, - counterBias, - counterDivisor); + physSlice.handle, physSlice.offset + counterOffset, + counterBias, counterDivisor); + + // The count will generally be written from streamout + if (likely(m_state.id.cntBuffer.buffer()->hasGfxStores())) + accessDrawCountBuffer(counterOffset); } } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index ace683ebf..a3aba78d2 100644 
--- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -832,14 +832,14 @@ namespace dxvk { uint32_t stride); /** - * \brief Transform feddback draw call - - * \param [in] counterBuffer Xfb counter buffer + * \brief Transform feedback draw call + * + * \param [in] counterOffset Draw count offset * \param [in] counterDivisor Vertex stride * \param [in] counterBias Counter bias */ void drawIndirectXfb( - const DxvkBufferSlice& counterBuffer, + VkDeviceSize counterOffset, uint32_t counterDivisor, uint32_t counterBias); From a2c9c0f7401b681d73d7c0b828b34c43aa13ee09 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sun, 16 Feb 2025 14:03:12 +0100 Subject: [PATCH 16/44] [d3d11] Use resource cookies for draw buffer tracking Avoids keeping draw buffers alive when the app stops using indirect draws. Unlikely to have caused issues in practice, but draw buffers are not part of the API state to begin with. --- src/d3d11/d3d11_context.cpp | 20 ++++++++++++-------- src/d3d11/d3d11_context_imm.cpp | 5 +++++ src/d3d11/d3d11_context_state.h | 10 +++++----- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 0868ab9ed..97af10d3a 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -4644,10 +4644,6 @@ namespace dxvk { ApplyRasterizerSampleCount(); ApplyViewportState(); - BindDrawBuffers( - m_state.id.argBuffer.ptr(), - m_state.id.cntBuffer.ptr()); - BindIndexBuffer( m_state.ia.indexBuffer.buffer.ptr(), m_state.ia.indexBuffer.offset, @@ -4686,6 +4682,11 @@ namespace dxvk { RestoreSamplers(); RestoreSamplers(); RestoreSamplers(); + + // Draw buffer bindings aren't persistent at the API level, and + // we can't meaningfully track them. Just reset this state here + // and reapply on the next indirect draw. 
+ SetDrawBuffers(nullptr, nullptr); } @@ -5012,10 +5013,13 @@ namespace dxvk { auto argBuffer = static_cast(pBufferForArgs); auto cntBuffer = static_cast(pBufferForCount); - if (m_state.id.argBuffer != argBuffer - || m_state.id.cntBuffer != cntBuffer) { - m_state.id.argBuffer = argBuffer; - m_state.id.cntBuffer = cntBuffer; + auto argBufferCookie = argBuffer ? argBuffer->GetCookie() : 0u; + auto cntBufferCookie = cntBuffer ? cntBuffer->GetCookie() : 0u; + + if (m_state.id.argBufferCookie != argBufferCookie + || m_state.id.cntBufferCookie != cntBufferCookie) { + m_state.id.argBufferCookie = argBufferCookie; + m_state.id.cntBufferCookie = cntBufferCookie; BindDrawBuffers(argBuffer, cntBuffer); } diff --git a/src/d3d11/d3d11_context_imm.cpp b/src/d3d11/d3d11_context_imm.cpp index e762dd9eb..e22201afa 100644 --- a/src/d3d11/d3d11_context_imm.cpp +++ b/src/d3d11/d3d11_context_imm.cpp @@ -865,6 +865,11 @@ namespace dxvk { Rc LatencyTracker) { D3D10DeviceLock lock = LockContext(); + // Don't keep draw buffers alive indefinitely. This cannot be + // done in ExecuteFlush because command recording itself might + // flush, so no state changes are allowed to happen there. + SetDrawBuffers(nullptr, nullptr); + EmitCs([ cTracker = std::move(LatencyTracker) ] (DxvkContext* ctx) { diff --git a/src/d3d11/d3d11_context_state.h b/src/d3d11/d3d11_context_state.h index fd48d8ee4..9dea34937 100644 --- a/src/d3d11/d3d11_context_state.h +++ b/src/d3d11/d3d11_context_state.h @@ -232,12 +232,12 @@ namespace dxvk { * argument and draw count buffer. 
*/ struct D3D11ContextStateID { - Com argBuffer = nullptr; - Com cntBuffer = nullptr; + uint64_t argBufferCookie = 0u; + uint64_t cntBufferCookie = 0u; void reset() { - argBuffer = nullptr; - cntBuffer = nullptr; + argBufferCookie = 0u; + cntBufferCookie = 0u; } }; @@ -347,4 +347,4 @@ namespace dxvk { uint32_t soCount; }; -} \ No newline at end of file +} From b03d457ffb3252bd20635d4dc073538a10bee074 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sun, 16 Feb 2025 18:13:09 +0100 Subject: [PATCH 17/44] [dxvk,d3d11] Improve explicit UAV overlap behaviour If the app explicitly enables UAV overlap, don't synchronize back-to-back read-modify-write operations to the same UAV either. --- src/d3d11/d3d11_context_ext.cpp | 6 ++-- src/d3d11/d3d11_device.h | 6 ++-- src/dxvk/dxvk_context.cpp | 27 +++++++++++++++++- src/dxvk/dxvk_context.h | 49 ++++++++++++++++++++------------- src/dxvk/dxvk_context_state.h | 9 ++++-- 5 files changed, 70 insertions(+), 27 deletions(-) diff --git a/src/d3d11/d3d11_context_ext.cpp b/src/d3d11/d3d11_context_ext.cpp index aac32b887..051610167 100644 --- a/src/d3d11/d3d11_context_ext.cpp +++ b/src/d3d11/d3d11_context_ext.cpp @@ -146,8 +146,10 @@ namespace dxvk { D3D11Device* parent = static_cast(m_ctx->GetParentInterface()); DxvkBarrierControlFlags flags = parent->GetOptionsBarrierControlFlags(); - if (ControlFlags & D3D11_VK_BARRIER_CONTROL_IGNORE_WRITE_AFTER_WRITE) - flags.set(DxvkBarrierControl::IgnoreComputeWriteAfterWrite, DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite); + if (ControlFlags & D3D11_VK_BARRIER_CONTROL_IGNORE_WRITE_AFTER_WRITE) { + flags.set(DxvkBarrierControl::ComputeAllowReadWriteOverlap, + DxvkBarrierControl::GraphicsAllowReadWriteOverlap); + } m_ctx->EmitCs([cFlags = flags] (DxvkContext* ctx) { ctx->setBarrierControl(cFlags); diff --git a/src/d3d11/d3d11_device.h b/src/d3d11/d3d11_device.h index 27d1900a4..041f758a3 100644 --- a/src/d3d11/d3d11_device.h +++ b/src/d3d11/d3d11_device.h @@ -472,13 +472,13 @@ namespace dxvk 
{ const Rc& Adapter); DxvkBarrierControlFlags GetOptionsBarrierControlFlags() { - DxvkBarrierControlFlags barrierControl; + DxvkBarrierControlFlags barrierControl = 0u; if (m_d3d11Options.relaxedBarriers) - barrierControl.set(DxvkBarrierControl::IgnoreComputeWriteAfterWrite); + barrierControl.set(DxvkBarrierControl::ComputeAllowWriteOnlyOverlap); if (m_d3d11Options.relaxedBarriers || m_d3d11Options.relaxedGraphicsBarriers) - barrierControl.set(DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite); + barrierControl.set(DxvkBarrierControl::GraphicsAllowReadWriteOverlap); return barrierControl; } diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 95aa05d4e..8a22d764c 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -2623,6 +2623,20 @@ namespace dxvk { void DxvkContext::setBarrierControl(DxvkBarrierControlFlags control) { + // If any currently relevant control flags change, play it safe and force + // a barrier the next time we encounter a write-after-write hazard, even + // if the same set of flags is restored by that time. Only check graphics + // flags inside a render pass to avoid performance regressions when an + // application uses this feature but we already have an app profile. + // Barriers get flushed when beginning or ending a render pass anyway. + DxvkBarrierControlFlags mask = m_flags.test(DxvkContextFlag::GpRenderPassBound) + ? 
DxvkBarrierControlFlags(DxvkBarrierControl::GraphicsAllowReadWriteOverlap) + : DxvkBarrierControlFlags(DxvkBarrierControl::ComputeAllowReadWriteOverlap, + DxvkBarrierControl::ComputeAllowWriteOnlyOverlap); + + if (!((m_barrierControl ^ control) & mask).isClear()) + m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); + m_barrierControl = control; } @@ -3740,12 +3754,14 @@ namespace dxvk { vk::makeSubresourceRange(imageSubresource), imageLayout, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT); + m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); m_cmd->track(buffer, DxvkAccess::Write); m_cmd->track(image, DxvkAccess::Read); - } +} void DxvkContext::clearImageViewFb( @@ -3951,6 +3967,9 @@ namespace dxvk { VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT); + if (cmdBuffer == DxvkCmdBuffer::ExecBuffer) + m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(cmdBuffer); @@ -5528,6 +5547,10 @@ namespace dxvk { ctrOffsets[i] = physSlice.offset; if (physSlice.handle) { + // Just in case someone is mad enough to write to a + // transform feedback buffer from a shader as well + m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); + accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.xfb.activeCounters[i], VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT, VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | @@ -7923,6 +7946,8 @@ namespace dxvk { void DxvkContext::flushBarriers() { m_execBarriers.flush(m_cmd); m_barrierTracker.clear(); + + m_flags.clr(DxvkContextFlag::ForceWriteAfterWriteSync); } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index a3aba78d2..bca3eac77 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1785,23 +1785,33 @@ namespace dxvk { 
VkAccessFlags access); template - bool canIgnoreWawHazards() { - constexpr auto controlFlag = BindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS - ? DxvkBarrierControl::IgnoreGraphicsWriteAfterWrite - : DxvkBarrierControl::IgnoreComputeWriteAfterWrite; + DxvkAccessFlags getAllowedStorageHazards() { + if (m_barrierControl.isClear() || m_flags.test(DxvkContextFlag::ForceWriteAfterWriteSync)) + return DxvkAccessFlags(); - if (!m_barrierControl.test(controlFlag)) - return false; - - if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { + if constexpr (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { + // If there are any pending accesses that are not directly related + // to shader dispatches, always insert a barrier if there is a hazard. VkPipelineStageFlags2 stageMask = VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT; - return !m_execBarriers.hasPendingStages(~stageMask); + + if (!m_execBarriers.hasPendingStages(~stageMask)) { + if (m_barrierControl.test(DxvkBarrierControl::ComputeAllowReadWriteOverlap)) + return DxvkAccessFlags(DxvkAccess::Write, DxvkAccess::Read); + else if (m_barrierControl.test(DxvkBarrierControl::ComputeAllowWriteOnlyOverlap)) + return DxvkAccessFlags(DxvkAccess::Write); + } + } else { + // For graphics, the only type of unrelated access we have to worry about + // is transform feedback writes, in which case inserting a barrier is fine. + if (m_barrierControl.test(DxvkBarrierControl::GraphicsAllowReadWriteOverlap)) + return DxvkAccessFlags(DxvkAccess::Write, DxvkAccess::Read); } - return true; + return DxvkAccessFlags(); } + void emitMemoryBarrier( VkPipelineStageFlags srcStages, VkAccessFlags srcAccess, @@ -2039,18 +2049,17 @@ namespace dxvk { if (hasPendingWrite) { // If there is a write-after-write hazard and synchronization // for those is not explicitly disabled, insert a barrier. 
- if (!canIgnoreWawHazards()) + DxvkAccessFlags allowedHazards = getAllowedStorageHazards(); + + if (!allowedHazards.test(DxvkAccess::Write)) return true; - // If write-after-write checking is disabled and we're on graphics, - // be aggressive about avoiding barriers and ignore any reads if we - // do find a write-after-write hazard. This essentially assumes that - // back-to-back read-modify-write operations are safe, but will still - // consider read-only or transform feedback operations as unsafe. - if (BindPoint == VK_PIPELINE_BIND_POINT_GRAPHICS) - return !(access & VK_ACCESS_SHADER_WRITE_BIT); + // Skip barrier if overlapping read-modify-write ops are allowed. + // This includes shader atomics, but also non-atomic load-stores. + if (allowedHazards.test(DxvkAccess::Read)) + return false; - // On compute, if we are reading the resource, add a barrier. + // Otherwise, check if there is a read-after-write hazard. if (access & vk::AccessReadMask) return true; } @@ -2059,6 +2068,8 @@ namespace dxvk { return pred(DxvkAccess::Read); } + void invalidateWriteAfterWriteTracking(); + void beginRenderPassDebugRegion(); void beginInternalDebugRegion( diff --git a/src/dxvk/dxvk_context_state.h b/src/dxvk/dxvk_context_state.h index ba39e14fa..f91357e1e 100644 --- a/src/dxvk/dxvk_context_state.h +++ b/src/dxvk/dxvk_context_state.h @@ -57,6 +57,8 @@ namespace dxvk { DirtyDrawBuffer, ///< Indirect argument buffer is dirty DirtyPushConstants, ///< Push constant data has changed + ForceWriteAfterWriteSync, ///< Ignores barrier control flags for write-after-write hazards + Count }; @@ -86,8 +88,11 @@ namespace dxvk { * synchronize implicitly. 
*/ enum class DxvkBarrierControl : uint32_t { - IgnoreComputeWriteAfterWrite = 0, - IgnoreGraphicsWriteAfterWrite = 1, + // Ignores write-after-write hazard + ComputeAllowWriteOnlyOverlap = 0, + ComputeAllowReadWriteOverlap = 1, + + GraphicsAllowReadWriteOverlap = 2, }; using DxvkBarrierControlFlags = Flags; From 636669e1a5bc1fd538f6d39a94d9e6d2b46d9cd7 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sun, 16 Feb 2025 15:50:00 +0100 Subject: [PATCH 18/44] [dxvk] Improve handling of nested debug regions --- src/dxvk/dxvk_context.cpp | 58 +++++++++++++++++++++------------------ src/dxvk/dxvk_context.h | 9 +++--- src/dxvk/dxvk_util.h | 17 ++++++++++-- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 8a22d764c..539e63f47 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -2869,27 +2869,20 @@ namespace dxvk { label << ")"; - beginInternalDebugRegion(vk::makeLabel(0xf0e6dc, label.str().c_str())); + pushDebugRegion(vk::makeLabel(0xf0e6dc, label.str().c_str()), + util::DxvkDebugLabelType::InternalRenderPass); } void DxvkContext::beginDebugLabel(const VkDebugUtilsLabelEXT& label) { - if (m_features.test(DxvkContextFeature::DebugUtils)) { - endInternalDebugRegion(); - - m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, label); - m_debugLabelStack.emplace_back(label); - } + if (m_features.test(DxvkContextFeature::DebugUtils)) + pushDebugRegion(label, util::DxvkDebugLabelType::External); } void DxvkContext::endDebugLabel() { - if (m_features.test(DxvkContextFeature::DebugUtils)) { - if (!m_debugLabelStack.empty()) { - m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); - m_debugLabelStack.pop_back(); - } - } + if (m_features.test(DxvkContextFeature::DebugUtils)) + popDebugRegion(util::DxvkDebugLabelType::External); } @@ -5208,7 +5201,9 @@ namespace dxvk { flushBarriers(); flushResolves(); - endInternalDebugRegion(); + + if 
(unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + popDebugRegion(util::DxvkDebugLabelType::InternalRenderPass); } else if (!suspend) { // We may end a previously suspended render pass if (m_flags.test(DxvkContextFlag::GpRenderPassSuspended)) { @@ -8170,23 +8165,34 @@ namespace dxvk { } - void DxvkContext::beginInternalDebugRegion(const VkDebugUtilsLabelEXT& label) { - if (m_features.test(DxvkContextFeature::DebugUtils)) { - // If the app provides us with debug regions, don't add any - // internal ones to avoid potential issues with scoping. - if (m_debugLabelStack.empty()) { - m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, label); - m_debugLabelInternalActive = true; - } - } + void DxvkContext::pushDebugRegion(const VkDebugUtilsLabelEXT& label, util::DxvkDebugLabelType type) { + m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, label); + m_debugLabelStack.emplace_back(label, type); } - void DxvkContext::endInternalDebugRegion() { - if (m_debugLabelInternalActive) { - m_debugLabelInternalActive = false; + void DxvkContext::popDebugRegion(util::DxvkDebugLabelType type) { + // Find last active region of the given type + size_t index = m_debugLabelStack.size(); + + while (index && m_debugLabelStack[index - 1u].type() != type) + index -= 1u; + + if (!index) + return; + + // End all debug regions inside the scope we want to end, as + // well as the debug region of the requested type itself + for (size_t i = index; i <= m_debugLabelStack.size(); i++) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); + + // Re-emit nested debug regions and erase the region we ended + for (size_t i = index; i < m_debugLabelStack.size(); i++) { + m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, m_debugLabelStack[i].get()); + m_debugLabelStack[i - 1u] = m_debugLabelStack[i]; } + + m_debugLabelStack.pop_back(); } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index bca3eac77..85ad7574c 100644 --- a/src/dxvk/dxvk_context.h +++ 
b/src/dxvk/dxvk_context.h @@ -1453,7 +1453,6 @@ namespace dxvk { std::vector m_imageLayoutTransitions; std::vector m_debugLabelStack; - bool m_debugLabelInternalActive = false; Rc m_latencyTracker; uint64_t m_latencyFrameId = 0u; @@ -2072,10 +2071,12 @@ namespace dxvk { void beginRenderPassDebugRegion(); - void beginInternalDebugRegion( - const VkDebugUtilsLabelEXT& label); + void pushDebugRegion( + const VkDebugUtilsLabelEXT& label, + util::DxvkDebugLabelType type); - void endInternalDebugRegion(); + void popDebugRegion( + util::DxvkDebugLabelType type); void beginActiveDebugRegions(); diff --git a/src/dxvk/dxvk_util.h b/src/dxvk/dxvk_util.h index 863647f24..ab166e3f0 100644 --- a/src/dxvk/dxvk_util.h +++ b/src/dxvk/dxvk_util.h @@ -4,6 +4,14 @@ namespace dxvk::util { + /** + * \brief Debug utils label type + */ + enum class DxvkDebugLabelType : uint32_t { + External, ///< App-provided scope + InternalRenderPass, ///< Internal render pass markers + }; + /** * \brief Debug label wrapper * @@ -16,12 +24,16 @@ namespace dxvk::util { DxvkDebugLabel() = default; - DxvkDebugLabel(const VkDebugUtilsLabelEXT& label) - : m_text(label.pLabelName ? label.pLabelName : "") { + DxvkDebugLabel(const VkDebugUtilsLabelEXT& label, DxvkDebugLabelType type) + : m_text(label.pLabelName ? 
label.pLabelName : ""), m_type(type) { for (uint32_t i = 0; i < m_color.size(); i++) m_color[i] = label.color[i]; } + DxvkDebugLabelType type() const { + return m_type; + } + VkDebugUtilsLabelEXT get() const { VkDebugUtilsLabelEXT label = { VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT }; label.pLabelName = m_text.c_str(); @@ -34,6 +46,7 @@ namespace dxvk::util { std::string m_text; std::array m_color = { }; + DxvkDebugLabelType m_type; }; From 19361c962c8c827021e2f721e0784f3d8651cdd5 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sun, 16 Feb 2025 18:45:52 +0100 Subject: [PATCH 19/44] [dxvk] Add debug region for barrier control --- src/dxvk/dxvk_context.cpp | 58 +++++++++++++++++++++++++++++++++++++-- src/dxvk/dxvk_context.h | 6 ++++ src/dxvk/dxvk_util.h | 1 + 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 539e63f47..8dbdc8fde 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -2634,13 +2634,17 @@ namespace dxvk { : DxvkBarrierControlFlags(DxvkBarrierControl::ComputeAllowReadWriteOverlap, DxvkBarrierControl::ComputeAllowWriteOnlyOverlap); - if (!((m_barrierControl ^ control) & mask).isClear()) + if (!((m_barrierControl ^ control) & mask).isClear()) { m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + popDebugRegion(util::DxvkDebugLabelType::InternalBarrierControl); + } + m_barrierControl = control; } - - + + void DxvkContext::updatePageTable( const DxvkSparseBindInfo& bindInfo, DxvkSparseBindFlags flags) { @@ -2874,6 +2878,30 @@ namespace dxvk { } + template + void DxvkContext::beginBarrierControlDebugRegion() { + if (hasDebugRegion(util::DxvkDebugLabelType::InternalBarrierControl)) + return; + + const char* label = nullptr; + + if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE) { + if (m_barrierControl.test(DxvkBarrierControl::ComputeAllowReadWriteOverlap)) + label = "Relaxed sync"; + else if 
(m_barrierControl.test(DxvkBarrierControl::ComputeAllowWriteOnlyOverlap)) + label = "Relaxed sync (write-only)"; + } else { + if (m_barrierControl.test(DxvkBarrierControl::GraphicsAllowReadWriteOverlap)) + label = "Relaxed sync"; + } + + if (label) { + pushDebugRegion(vk::makeLabel(0x9bded9, label), + util::DxvkDebugLabelType::InternalBarrierControl); + } + } + + void DxvkContext::beginDebugLabel(const VkDebugUtilsLabelEXT& label) { if (m_features.test(DxvkContextFeature::DebugUtils)) pushDebugRegion(label, util::DxvkDebugLabelType::External); @@ -5132,6 +5160,9 @@ namespace dxvk { void DxvkContext::startRenderPass() { if (!m_flags.test(DxvkContextFlag::GpRenderPassBound)) { + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + popDebugRegion(util::DxvkDebugLabelType::InternalBarrierControl); + this->applyRenderTargetLoadLayouts(); this->flushClears(true); @@ -5192,6 +5223,9 @@ namespace dxvk { m_queryManager.endQueries(m_cmd, VK_QUERY_TYPE_OCCLUSION); m_queryManager.endQueries(m_cmd, VK_QUERY_TYPE_PIPELINE_STATISTICS); + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + popDebugRegion(util::DxvkDebugLabelType::InternalBarrierControl); + this->renderPassUnbindFramebuffer(); if (suspend) @@ -6651,6 +6685,9 @@ namespace dxvk { m_descriptorState.dirtyStages(VK_SHADER_STAGE_COMPUTE_BIT); } + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + this->beginBarrierControlDebugRegion(); + if (m_descriptorState.hasDirtyComputeSets()) this->updateComputeShaderResources(); @@ -6698,6 +6735,13 @@ namespace dxvk { if (!m_flags.test(DxvkContextFlag::GpRenderPassBound)) this->startRenderPass(); + if (m_flags.test(DxvkContextFlag::GpRenderPassSideEffects)) { + // Make sure that the debug label for barrier control + // always starts within an active render pass + if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) + this->beginBarrierControlDebugRegion(); + } + if (m_flags.test(DxvkContextFlag::GpDirtyIndexBuffer) && Indexed) { if 
(unlikely(!this->updateIndexBufferBinding())) return false; @@ -8196,6 +8240,14 @@ namespace dxvk { } + bool DxvkContext::hasDebugRegion( + util::DxvkDebugLabelType type) { + auto e = std::find_if(m_debugLabelStack.crbegin(), m_debugLabelStack.crend(), + [type] (const util::DxvkDebugLabel& label) { return label.type() == type; }); + return e != m_debugLabelStack.crend(); + } + + void DxvkContext::beginActiveDebugRegions() { for (const auto& region : m_debugLabelStack) m_cmd->cmdBeginDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer, region.get()); diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 85ad7574c..725c9cc1a 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -2071,6 +2071,9 @@ namespace dxvk { void beginRenderPassDebugRegion(); + template + void beginBarrierControlDebugRegion(); + void pushDebugRegion( const VkDebugUtilsLabelEXT& label, util::DxvkDebugLabelType type); @@ -2078,6 +2081,9 @@ namespace dxvk { void popDebugRegion( util::DxvkDebugLabelType type); + bool hasDebugRegion( + util::DxvkDebugLabelType type); + void beginActiveDebugRegions(); void endActiveDebugRegions(); diff --git a/src/dxvk/dxvk_util.h b/src/dxvk/dxvk_util.h index ab166e3f0..49a63da1c 100644 --- a/src/dxvk/dxvk_util.h +++ b/src/dxvk/dxvk_util.h @@ -10,6 +10,7 @@ namespace dxvk::util { enum class DxvkDebugLabelType : uint32_t { External, ///< App-provided scope InternalRenderPass, ///< Internal render pass markers + InternalBarrierControl, ///< Barrier control markers }; /** From 317607e192ca9938e2a07c758955a0dce437d886 Mon Sep 17 00:00:00 2001 From: WinterSnowfall Date: Wed, 19 Feb 2025 20:14:32 +0200 Subject: [PATCH 20/44] [d3d8/9] Prevent device child ref underruns on release --- src/d3d8/d3d8_device_child.h | 20 ++++++++++++++++---- src/d3d9/d3d9_device_child.h | 18 +++++++++++++++++- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/src/d3d8/d3d8_device_child.h b/src/d3d8/d3d8_device_child.h index c45d965ad..598ce0b19 100644 --- 
a/src/d3d8/d3d8_device_child.h +++ b/src/d3d8/d3d8_device_child.h @@ -31,16 +31,28 @@ namespace dxvk { } ULONG STDMETHODCALLTYPE Release() { - // ignore Release calls on objects with 0 refCount - if(unlikely(!this->m_refCount)) - return this->m_refCount; + uint32_t oldRefCount, refCount; + + do { + oldRefCount = this->m_refCount.load(std::memory_order_acquire); + + // clamp value to 0 to prevent underruns + if (unlikely(!oldRefCount)) + return 0; + + refCount = oldRefCount - 1; + + } while (!this->m_refCount.compare_exchange_weak(oldRefCount, + refCount, + std::memory_order_release, + std::memory_order_acquire)); - uint32_t refCount = --this->m_refCount; if (unlikely(!refCount)) { auto* pDevice = GetDevice(); this->ReleasePrivate(); pDevice->Release(); } + return refCount; } diff --git a/src/d3d9/d3d9_device_child.h b/src/d3d9/d3d9_device_child.h index 433a269ac..0866c7d81 100644 --- a/src/d3d9/d3d9_device_child.h +++ b/src/d3d9/d3d9_device_child.h @@ -25,12 +25,28 @@ namespace dxvk { } ULONG STDMETHODCALLTYPE Release() { - uint32_t refCount = --this->m_refCount; + uint32_t oldRefCount, refCount; + + do { + oldRefCount = this->m_refCount.load(std::memory_order_acquire); + + // clamp value to 0 to prevent underruns + if (unlikely(!oldRefCount)) + return 0; + + refCount = oldRefCount - 1; + + } while (!this->m_refCount.compare_exchange_weak(oldRefCount, + refCount, + std::memory_order_release, + std::memory_order_acquire)); + if (unlikely(!refCount)) { auto* pDevice = GetDevice(); this->ReleasePrivate(); pDevice->Release(); } + return refCount; } From 3716d48c89209391f2fb7624fa39b7c851c08401 Mon Sep 17 00:00:00 2001 From: WinterSnowfall Date: Wed, 19 Feb 2025 21:39:10 +0200 Subject: [PATCH 21/44] [d3d8/9] Use numeric_limits globally --- src/d3d8/d3d8_options.cpp | 2 +- src/d3d9/d3d9_adapter.cpp | 2 +- src/d3d9/d3d9_common_texture.h | 2 +- src/d3d9/d3d9_device.cpp | 14 +++++++------- src/d3d9/d3d9_fixed_function.cpp | 2 +- src/d3d9/d3d9_state.cpp | 2 +- 
src/d3d9/d3d9_stateblock.cpp | 2 +- src/dxso/dxso_compiler.cpp | 12 ++++++------ src/dxso/dxso_tables.cpp | 2 +- src/dxso/dxso_tables.h | 2 +- 10 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/d3d8/d3d8_options.cpp b/src/d3d8/d3d8_options.cpp index 549fcd3cf..ea8dbc65d 100644 --- a/src/d3d8/d3d8_options.cpp +++ b/src/d3d8/d3d8_options.cpp @@ -9,7 +9,7 @@ namespace dxvk { static inline uint32_t parseDword(std::string_view str) { - uint32_t value = UINT32_MAX; + uint32_t value = std::numeric_limits::max(); std::from_chars(str.data(), str.data() + str.size(), value); return value; } diff --git a/src/d3d9/d3d9_adapter.cpp b/src/d3d9/d3d9_adapter.cpp index 713a83650..212cc737a 100644 --- a/src/d3d9/d3d9_adapter.cpp +++ b/src/d3d9/d3d9_adapter.cpp @@ -588,7 +588,7 @@ namespace dxvk { // Max Vertex Shader Const pCaps->MaxVertexShaderConst = MaxFloatConstantsVS; // Max PS1 Value - pCaps->PixelShader1xMaxValue = options.shaderModel > 0 ? FLT_MAX : 0.0f; + pCaps->PixelShader1xMaxValue = options.shaderModel > 0 ? 
std::numeric_limits::max() : 0.0f; // Dev Caps 2 pCaps->DevCaps2 = D3DDEVCAPS2_STREAMOFFSET /* | D3DDEVCAPS2_DMAPNPATCH */ diff --git a/src/d3d9/d3d9_common_texture.h b/src/d3d9/d3d9_common_texture.h index bba49d65a..378a6ee2d 100644 --- a/src/d3d9/d3d9_common_texture.h +++ b/src/d3d9/d3d9_common_texture.h @@ -75,7 +75,7 @@ namespace dxvk { public: - static constexpr UINT AllLayers = UINT32_MAX; + static constexpr UINT AllLayers = std::numeric_limits::max(); D3D9CommonTexture( D3D9DeviceEx* pDevice, diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index d2128512b..f43f3d74a 100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -1760,7 +1760,7 @@ namespace dxvk { m_state.depthStencil = ds; - UpdateActiveHazardsDS(UINT32_MAX); + UpdateActiveHazardsDS(std::numeric_limits::max()); return D3D_OK; } @@ -2157,7 +2157,7 @@ namespace dxvk { if (m_state.IsLightEnabled(Index) == !!Enable) return D3D_OK; - uint32_t searchIndex = UINT32_MAX; + uint32_t searchIndex = std::numeric_limits::max(); uint32_t setIndex = Index; if (!Enable) @@ -2384,7 +2384,7 @@ namespace dxvk { case D3DRS_ZWRITEENABLE: if (likely(!old != !Value)) - UpdateActiveHazardsDS(UINT32_MAX); + UpdateActiveHazardsDS(std::numeric_limits::max()); [[fallthrough]]; case D3DRS_STENCILENABLE: case D3DRS_ZENABLE: @@ -3795,8 +3795,8 @@ namespace dxvk { if (m_psShaderMasks.samplerMask != newShaderMasks.samplerMask || m_psShaderMasks.rtMask != newShaderMasks.rtMask) { m_psShaderMasks = newShaderMasks; - UpdateActiveHazardsRT(UINT32_MAX); - UpdateActiveHazardsDS(UINT32_MAX); + UpdateActiveHazardsRT(std::numeric_limits::max()); + UpdateActiveHazardsDS(std::numeric_limits::max()); } return D3D_OK; @@ -7815,7 +7815,7 @@ namespace dxvk { if (key.Data.Contents.UseLighting) { for (uint32_t i = 0; i < caps::MaxEnabledLights; i++) { - if (m_state.enabledLightIndices[i] != UINT32_MAX) + if (m_state.enabledLightIndices[i] != std::numeric_limits::max()) lightCount++; } } @@ -7912,7 +7912,7 @@ 
namespace dxvk { uint32_t lightIdx = 0; for (uint32_t i = 0; i < caps::MaxEnabledLights; i++) { auto idx = m_state.enabledLightIndices[i]; - if (idx == UINT32_MAX) + if (idx == std::numeric_limits::max()) continue; data->Lights[lightIdx++] = D3D9Light(m_state.lights[idx].value(), m_state.transforms[GetTransformIndex(D3DTS_VIEW)]); diff --git a/src/d3d9/d3d9_fixed_function.cpp b/src/d3d9/d3d9_fixed_function.cpp index a47f89cad..0d92b7431 100644 --- a/src/d3d9/d3d9_fixed_function.cpp +++ b/src/d3d9/d3d9_fixed_function.cpp @@ -1297,7 +1297,7 @@ namespace dxvk { uint32_t atten = m_module.opFFma (m_floatType, d, atten2, atten1); atten = m_module.opFFma (m_floatType, d, atten, atten0); atten = m_module.opFDiv (m_floatType, m_module.constf32(1.0f), atten); - atten = m_module.opNMin (m_floatType, atten, m_module.constf32(FLT_MAX)); + atten = m_module.opNMin (m_floatType, atten, m_module.constf32(std::numeric_limits::max())); atten = m_module.opSelect(m_floatType, m_module.opFOrdGreaterThan(bool_t, d, range), m_module.constf32(0.0f), atten); atten = m_module.opSelect(m_floatType, isDirectional, m_module.constf32(1.0f), atten); diff --git a/src/d3d9/d3d9_state.cpp b/src/d3d9/d3d9_state.cpp index af749cf76..6479d2589 100644 --- a/src/d3d9/d3d9_state.cpp +++ b/src/d3d9/d3d9_state.cpp @@ -10,7 +10,7 @@ namespace dxvk { streamFreq[i] = 1; for (uint32_t i = 0; i < enabledLightIndices.size(); i++) - enabledLightIndices[i] = UINT32_MAX; + enabledLightIndices[i] = std::numeric_limits::max(); } diff --git a/src/d3d9/d3d9_stateblock.cpp b/src/d3d9/d3d9_stateblock.cpp index 9998fa94d..8b62c1d58 100644 --- a/src/d3d9/d3d9_stateblock.cpp +++ b/src/d3d9/d3d9_stateblock.cpp @@ -204,7 +204,7 @@ namespace dxvk { if (m_state.IsLightEnabled(Index) == !!Enable) return D3D_OK; - uint32_t searchIndex = UINT32_MAX; + uint32_t searchIndex = std::numeric_limits::max(); uint32_t setIndex = Index; if (!Enable) diff --git a/src/dxso/dxso_compiler.cpp b/src/dxso/dxso_compiler.cpp index 
d7cdc672f..54c0f0543 100644 --- a/src/dxso/dxso_compiler.cpp +++ b/src/dxso/dxso_compiler.cpp @@ -1957,7 +1957,7 @@ namespace dxvk { if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { result.id = m_module.opNMin(typeId, result.id, - m_module.constfReplicant(FLT_MAX, result.type.ccount)); + m_module.constfReplicant(std::numeric_limits::max(), result.type.ccount)); } break; case DxsoOpcode::Rsq: @@ -1969,7 +1969,7 @@ namespace dxvk { if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { result.id = m_module.opNMin(typeId, result.id, - m_module.constfReplicant(FLT_MAX, result.type.ccount)); + m_module.constfReplicant(std::numeric_limits::max(), result.type.ccount)); } break; case DxsoOpcode::Dp3: { @@ -2029,7 +2029,7 @@ namespace dxvk { if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { result.id = m_module.opNMin(typeId, result.id, - m_module.constfReplicant(FLT_MAX, result.type.ccount)); + m_module.constfReplicant(std::numeric_limits::max(), result.type.ccount)); } break; } @@ -2040,7 +2040,7 @@ namespace dxvk { if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { result.id = m_module.opNMin(typeId, result.id, - m_module.constfReplicant(FLT_MAX, result.type.ccount)); + m_module.constfReplicant(std::numeric_limits::max(), result.type.ccount)); } break; case DxsoOpcode::Pow: { @@ -2102,7 +2102,7 @@ namespace dxvk { rcpLength.type = scalarType; rcpLength.id = m_module.opInverseSqrt(scalarTypeId, dot.id); if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { - rcpLength.id = m_module.opNMin(scalarTypeId, rcpLength.id, m_module.constf32(FLT_MAX)); + rcpLength.id = m_module.opNMin(scalarTypeId, rcpLength.id, m_module.constf32(std::numeric_limits::max())); } // r * rsq(r . 
r) @@ -2216,7 +2216,7 @@ namespace dxvk { result.id = m_module.opLog2(typeId, result.id); if (m_moduleInfo.options.d3d9FloatEmulation == D3D9FloatEmulation::Enabled) { result.id = m_module.opNMax(typeId, result.id, - m_module.constfReplicant(-FLT_MAX, result.type.ccount)); + m_module.constfReplicant(-std::numeric_limits::max(), result.type.ccount)); } break; case DxsoOpcode::Lrp: diff --git a/src/dxso/dxso_tables.cpp b/src/dxso/dxso_tables.cpp index 5b8ab91f0..df79a970a 100644 --- a/src/dxso/dxso_tables.cpp +++ b/src/dxso/dxso_tables.cpp @@ -86,7 +86,7 @@ namespace dxvk { case DxsoOpcode::SetP: return 3; case DxsoOpcode::TexLdl: return 3; case DxsoOpcode::BreakP: return 2; - default: Logger::warn("DxsoGetDefaultOpcodeLength: unknown opcode to get default length for."); return UINT32_MAX; + default: Logger::warn("DxsoGetDefaultOpcodeLength: unknown opcode to get default length for."); return std::numeric_limits::max(); } } diff --git a/src/dxso/dxso_tables.h b/src/dxso/dxso_tables.h index 73e3801d3..19b94f63b 100644 --- a/src/dxso/dxso_tables.h +++ b/src/dxso/dxso_tables.h @@ -4,7 +4,7 @@ namespace dxvk { - constexpr uint32_t InvalidOpcodeLength = UINT32_MAX; + constexpr uint32_t InvalidOpcodeLength = std::numeric_limits::max(); uint32_t DxsoGetDefaultOpcodeLength(DxsoOpcode opcode); From 22052106d868347cc849f1719e9a9fdb1bd20f44 Mon Sep 17 00:00:00 2001 From: WinterSnowfall Date: Wed, 19 Feb 2025 22:36:29 +0200 Subject: [PATCH 22/44] [d3d9] Relax logging level on validateGammaRamp --- src/d3d9/d3d9_swapchain.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/d3d9/d3d9_swapchain.cpp b/src/d3d9/d3d9_swapchain.cpp index 73218c516..84236b36c 100644 --- a/src/d3d9/d3d9_swapchain.cpp +++ b/src/d3d9/d3d9_swapchain.cpp @@ -650,17 +650,17 @@ namespace dxvk { static bool validateGammaRamp(const WORD (&ramp)[256]) { if (ramp[0] >= ramp[std::size(ramp) - 1]) { - Logger::err("validateGammaRamp: ramp inverted or flat"); + 
Logger::warn("validateGammaRamp: ramp inverted or flat"); return false; } for (size_t i = 1; i < std::size(ramp); i++) { if (ramp[i] < ramp[i - 1]) { - Logger::err("validateGammaRamp: ramp not monotonically increasing"); + Logger::warn("validateGammaRamp: ramp not monotonically increasing"); return false; } if (ramp[i] - ramp[i - 1] >= UINT16_MAX / 2) { - Logger::err("validateGammaRamp: huuuge jump"); + Logger::warn("validateGammaRamp: huuuge jump"); return false; } } From 8c7da070855efd599d5381ccb3813667442e538f Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 13:33:36 +0100 Subject: [PATCH 23/44] [util] Fix enum declaration --- src/util/com/com_private_data.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/util/com/com_private_data.h b/src/util/com/com_private_data.h index 0673f8e2d..8a21d4153 100644 --- a/src/util/com/com_private_data.h +++ b/src/util/com/com_private_data.h @@ -1,5 +1,7 @@ #pragma once +#include +#include #include #include "com_include.h" @@ -9,7 +11,7 @@ namespace dxvk { /** * \brief COM private data entry type */ - enum ComPrivateDataType { + enum class ComPrivateDataType : uint32_t { None, Data, Iface, From d94e3633dc7b0a9f44b7beaa0a66c5c8d6aa1366 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 13:34:03 +0100 Subject: [PATCH 24/44] [dxvk] Introduce concept of order-invariant atomic stores --- src/dxvk/dxvk_pipelayout.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/dxvk/dxvk_pipelayout.h b/src/dxvk/dxvk_pipelayout.h index 5fb05c6b2..6271be481 100644 --- a/src/dxvk/dxvk_pipelayout.h +++ b/src/dxvk/dxvk_pipelayout.h @@ -11,6 +11,24 @@ namespace dxvk { class DxvkDevice; + /** + * \brief Order-invariant atomic access operation + * + * Information used to optimize barriers when a resource + * is accessed exclusively via order-invariant stores.
+ */ + enum class DxvkAccessOp : uint32_t { + None = 0, + Or = 1, + And = 2, + Xor = 3, + Add = 4, + IMin = 5, + IMax = 6, + UMin = 7, + UMax = 8, + }; + /** * \brief Descriptor set indices */ @@ -37,6 +55,7 @@ namespace dxvk { VkShaderStageFlagBits stage; ///< Shader stage VkAccessFlags access; ///< Access mask for the resource VkBool32 uboSet; ///< Whether to include this in the UBO set + DxvkAccessOp accessOp; ///< Order-invariant store type, if any /** * \brief Computes descriptor set index for the given binding From e01a6eec3e8efd15dfd07dc24101724f951b9027 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 13:34:20 +0100 Subject: [PATCH 25/44] [dxbc] Track bindings with order-invariant atomic stores --- src/dxbc/dxbc_analysis.cpp | 38 ++++++++++++++++++++++++++++++++------ src/dxbc/dxbc_analysis.h | 8 +++++--- src/dxbc/dxbc_compiler.cpp | 14 +++++++++++--- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/src/dxbc/dxbc_analysis.cpp b/src/dxbc/dxbc_analysis.cpp index dd4a51324..a5bf4fc19 100644 --- a/src/dxbc/dxbc_analysis.cpp +++ b/src/dxbc/dxbc_analysis.cpp @@ -30,26 +30,48 @@ namespace dxvk { switch (ins.opClass) { case DxbcInstClass::Atomic: { const uint32_t operandId = ins.dstCount - 1; - + if (ins.dst[operandId].type == DxbcOperandType::UnorderedAccessView) { const uint32_t registerId = ins.dst[operandId].idx[0].offset; m_analysis->uavInfos[registerId].accessAtomicOp = true; m_analysis->uavInfos[registerId].accessFlags |= VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT; + + // Check whether the atomic operation is order-invariant + DxvkAccessOp store = DxvkAccessOp::None; + + switch (ins.op) { + case DxbcOpcode::AtomicAnd: store = DxvkAccessOp::And; break; + case DxbcOpcode::AtomicOr: store = DxvkAccessOp::Or; break; + case DxbcOpcode::AtomicXor: store = DxvkAccessOp::Xor; break; + case DxbcOpcode::AtomicIAdd: store = DxvkAccessOp::Add; break; + case DxbcOpcode::AtomicIMax: store = DxvkAccessOp::IMax; break; + case 
DxbcOpcode::AtomicIMin: store = DxvkAccessOp::IMin; break; + case DxbcOpcode::AtomicUMax: store = DxvkAccessOp::UMax; break; + case DxbcOpcode::AtomicUMin: store = DxvkAccessOp::UMin; break; + default: break; + } + + if (m_analysis->uavInfos[registerId].atomicStore == DxvkAccessOp::None) + m_analysis->uavInfos[registerId].atomicStore = store; + + // Maintain ordering if the UAV is accessed via other operations as well + if (store == DxvkAccessOp::None || m_analysis->uavInfos[registerId].atomicStore != store) + m_analysis->uavInfos[registerId].nonInvariantAccess = true; } } break; - + case DxbcInstClass::TextureSample: case DxbcInstClass::TextureGather: case DxbcInstClass::TextureQueryLod: case DxbcInstClass::VectorDeriv: { m_analysis->usesDerivatives = true; } break; - + case DxbcInstClass::ControlFlow: { if (ins.op == DxbcOpcode::Discard) m_analysis->usesKill = true; } break; - + case DxbcInstClass::BufferLoad: { uint32_t operandId = ins.op == DxbcOpcode::LdStructured ? 2 : 1; bool sparseFeedback = ins.dstCount == 2; @@ -58,16 +80,18 @@ namespace dxvk { const uint32_t registerId = ins.src[operandId].idx[0].offset; m_analysis->uavInfos[registerId].accessFlags |= VK_ACCESS_SHADER_READ_BIT; m_analysis->uavInfos[registerId].sparseFeedback |= sparseFeedback; + m_analysis->uavInfos[registerId].nonInvariantAccess = true; } else if (ins.src[operandId].type == DxbcOperandType::Resource) { const uint32_t registerId = ins.src[operandId].idx[0].offset; m_analysis->srvInfos[registerId].sparseFeedback |= sparseFeedback; } } break; - + case DxbcInstClass::BufferStore: { if (ins.dst[0].type == DxbcOperandType::UnorderedAccessView) { const uint32_t registerId = ins.dst[0].idx[0].offset; m_analysis->uavInfos[registerId].accessFlags |= VK_ACCESS_SHADER_WRITE_BIT; + m_analysis->uavInfos[registerId].nonInvariantAccess = true; } } break; @@ -75,13 +99,15 @@ namespace dxvk { const uint32_t registerId = ins.src[1].idx[0].offset; m_analysis->uavInfos[registerId].accessTypedLoad = true; 
m_analysis->uavInfos[registerId].accessFlags |= VK_ACCESS_SHADER_READ_BIT; + m_analysis->uavInfos[registerId].nonInvariantAccess = true; } break; case DxbcInstClass::TypedUavStore: { const uint32_t registerId = ins.dst[0].idx[0].offset; m_analysis->uavInfos[registerId].accessFlags |= VK_ACCESS_SHADER_WRITE_BIT; + m_analysis->uavInfos[registerId].nonInvariantAccess = true; } break; - + default: break; } diff --git a/src/dxbc/dxbc_analysis.h b/src/dxbc/dxbc_analysis.h index fcbc1ddad..fa589f4fd 100644 --- a/src/dxbc/dxbc_analysis.h +++ b/src/dxbc/dxbc_analysis.h @@ -17,9 +17,11 @@ namespace dxvk { * will be used to generate image types. */ struct DxbcUavInfo { - bool accessTypedLoad = false; - bool accessAtomicOp = false; - bool sparseFeedback = false; + bool accessTypedLoad = false; + bool accessAtomicOp = false; + bool sparseFeedback = false; + bool nonInvariantAccess = false; + DxvkAccessOp atomicStore = DxvkAccessOp::None; VkAccessFlags accessFlags = 0; }; diff --git a/src/dxbc/dxbc_compiler.cpp b/src/dxbc/dxbc_compiler.cpp index 6ab3e9163..f0f362334 100644 --- a/src/dxbc/dxbc_compiler.cpp +++ b/src/dxbc/dxbc_compiler.cpp @@ -1098,6 +1098,9 @@ namespace dxvk { : VK_DESCRIPTOR_TYPE_STORAGE_IMAGE; binding.access = m_analysis->uavInfos[registerId].accessFlags; + if (!m_analysis->uavInfos[registerId].nonInvariantAccess) + binding.accessOp = m_analysis->uavInfos[registerId].atomicStore; + if (!(binding.access & VK_ACCESS_SHADER_WRITE_BIT)) m_module.decorate(varId, spv::DecorationNonWritable); if (!(binding.access & VK_ACCESS_SHADER_READ_BIT)) @@ -1234,9 +1237,14 @@ namespace dxvk { : (isUav ? VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER : VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER); binding.viewType = VK_IMAGE_VIEW_TYPE_MAX_ENUM; binding.resourceBinding = bindingId; - binding.access = isUav - ? 
m_analysis->uavInfos[registerId].accessFlags - : VkAccessFlags(VK_ACCESS_SHADER_READ_BIT); + binding.access = VK_ACCESS_SHADER_READ_BIT; + + if (isUav) { + binding.access = m_analysis->uavInfos[registerId].accessFlags; + + if (!m_analysis->uavInfos[registerId].nonInvariantAccess) + binding.accessOp = m_analysis->uavInfos[registerId].atomicStore; + } if (useRawSsbo || isUav) { if (!(binding.access & VK_ACCESS_SHADER_WRITE_BIT)) From c47596075419e8d5d64cd313242fca588de7bbd0 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 22:14:36 +0100 Subject: [PATCH 26/44] [dxvk] Pass store op around for barrier tracking --- src/dxvk/dxvk_context.cpp | 361 +++++++++++++++++++++----------------- src/dxvk/dxvk_context.h | 48 +++-- 2 files changed, 231 insertions(+), 178 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 8dbdc8fde..b07bc2518 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -236,7 +236,7 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *image, subresources, image->info().layout, image->info().stages, 0, - layout, image->info().stages, image->info().access); + layout, image->info().stages, image->info().access, DxvkAccessOp::None); image->setLayout(layout); @@ -288,10 +288,9 @@ namespace dxvk { &value); } - accessBuffer(cmdBuffer, - *buffer, offset, length, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_WRITE_BIT); + accessBuffer(cmdBuffer, *buffer, offset, length, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, + DxvkAccessOp::None); m_cmd->track(buffer, DxvkAccess::Write); } @@ -361,7 +360,7 @@ namespace dxvk { accessBuffer(cmdBuffer, *bufferView, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_WRITE_BIT); + VK_ACCESS_2_SHADER_WRITE_BIT, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(cmdBuffer); @@ -478,15 +477,13 @@ namespace dxvk { m_cmd->cmdCopyBuffer(cmdBuffer, 
&copyInfo); - accessBuffer(cmdBuffer, - *srcBuffer, srcOffset, numBytes, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_READ_BIT); + accessBuffer(cmdBuffer, *srcBuffer, srcOffset, numBytes, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT, + DxvkAccessOp::None); - accessBuffer(cmdBuffer, - *dstBuffer, dstOffset, numBytes, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_WRITE_BIT); + accessBuffer(cmdBuffer, *dstBuffer, dstOffset, numBytes, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, + DxvkAccessOp::None); m_cmd->track(dstBuffer, DxvkAccess::Write); m_cmd->track(srcBuffer, DxvkAccess::Read); @@ -818,10 +815,12 @@ namespace dxvk { extent.depth); accessBuffer(DxvkCmdBuffer::ExecBuffer, *dstView, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT); + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT, + DxvkAccessOp::None); accessBuffer(DxvkCmdBuffer::ExecBuffer, *srcView, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT); + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT, + DxvkAccessOp::None); // Track all involved resources m_cmd->track(dstBuffer, DxvkAccess::Write); @@ -1123,7 +1122,7 @@ namespace dxvk { if (initialLayout == VK_IMAGE_LAYOUT_PREINITIALIZED) { accessImage(DxvkCmdBuffer::InitBuffer, *image, subresources, initialLayout, - VK_PIPELINE_STAGE_2_NONE, 0); + VK_PIPELINE_STAGE_2_NONE, 0, DxvkAccessOp::None); m_cmd->track(image, DxvkAccess::None); } else { @@ -1202,10 +1201,8 @@ namespace dxvk { } } - accessImage(DxvkCmdBuffer::InitBuffer, - *image, subresources, clearLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_WRITE_BIT); + accessImage(DxvkCmdBuffer::InitBuffer, *image, subresources, clearLayout, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); m_cmd->track(image, DxvkAccess::Write); } @@ -1253,9 +1250,8 @@ namespace dxvk { } // Perform initial layout
transition - accessImage(DxvkCmdBuffer::InitBuffer, - *image, image->getAvailableSubresources(), - VK_IMAGE_LAYOUT_UNDEFINED, VK_PIPELINE_STAGE_2_NONE, 0); + accessImage(DxvkCmdBuffer::InitBuffer, *image, image->getAvailableSubresources(), + VK_IMAGE_LAYOUT_UNDEFINED, VK_PIPELINE_STAGE_2_NONE, 0, DxvkAccessOp::None); m_cmd->track(image, DxvkAccess::Write); } @@ -1293,7 +1289,8 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, *resource, 0, resource->info().size, - srcStages, srcAccess, dstStages, dstAccess); + srcStages, srcAccess, dstStages, dstAccess, + DxvkAccessOp::None); m_cmd->track(resource, DxvkAccess::Write); } @@ -1315,7 +1312,8 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *resource, resource->getAvailableSubresources(), srcLayout, srcStages, srcAccess, - dstLayout, dstStages, dstAccess); + dstLayout, dstStages, dstAccess, + DxvkAccessOp::None); m_cmd->track(resource, DxvkAccess::Write); } @@ -1472,19 +1470,22 @@ namespace dxvk { VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_2_SHADER_READ_BIT); + VK_ACCESS_2_SHADER_READ_BIT, + DxvkAccessOp::None); } else { accessImage(DxvkCmdBuffer::ExecBuffer, *imageView->image(), mipGenerator.getAllSourceSubresources(), srcLayout, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT | VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_2_SHADER_READ_BIT); + VK_ACCESS_2_SHADER_READ_BIT, + DxvkAccessOp::None); accessImage(DxvkCmdBuffer::ExecBuffer, *imageView->image(), mipGenerator.getBottomSubresource(), dstLayout, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT); + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, + DxvkAccessOp::None); } if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) @@ -1584,7 +1585,7 @@ namespace dxvk { // If the image has any pending layout transitions, flush them accordingly. 
// There might be false positives here, but those do not affect correctness. - if (resourceHasAccess(*image, image->getAvailableSubresources(), DxvkAccess::Write)) { + if (resourceHasAccess(*image, image->getAvailableSubresources(), DxvkAccess::Write, DxvkAccessOp::None)) { spillRenderPass(true); flushBarriers(); @@ -1652,7 +1653,8 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *image, image->getAvailableSubresources(), oldLayout, image->info().stages, image->info().access, - newLayout, image->info().stages, image->info().access); + newLayout, image->info().stages, image->info().access, + DxvkAccessOp::None); m_cmd->track(image, DxvkAccess::Write); return true; @@ -1863,7 +1865,8 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, dstSubresources, srcLayout, dstImage->info().stages, dstImage->info().access, - dstLayout, dstImage->info().stages, dstImage->info().access); + dstLayout, dstImage->info().stages, dstImage->info().access, + DxvkAccessOp::None); m_cmd->track(dstImage, DxvkAccess::Write); } @@ -2029,7 +2032,7 @@ namespace dxvk { *imageView->image(), imageView->imageSubresources(), imageLayout, clearStages, clearAccess, storeLayout, imageView->image()->info().stages, - imageView->image()->info().access); + imageView->image()->info().access, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -2232,7 +2235,7 @@ namespace dxvk { // Record layout transition from attachment layout back to default // layout. This will be flushed after the render pass has ended. 
accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, - dstSubresource, newLayout, stages, access); + dstSubresource, newLayout, stages, access, DxvkAccessOp::None); if (!isDepthStencil) { uint32_t index = m_state.om.framebufferInfo.getColorAttachmentIndex(i); @@ -2343,7 +2346,7 @@ namespace dxvk { accessBuffer(cmdBuffer, *buffer, offset, size, VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_WRITE_BIT); + VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); m_cmd->track(buffer, DxvkAccess::Write); } @@ -2794,7 +2797,8 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, *r.first, 0, r.first->info().size, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, accessFlags); + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, + accessFlags, DxvkAccessOp::None); m_cmd->track(r.first, r.second.test(DxvkAccess::Write) ? DxvkAccess::Write : DxvkAccess::Read); } @@ -2804,7 +2808,7 @@ namespace dxvk { | (r.second.test(DxvkAccess::Write) * VK_ACCESS_SHADER_WRITE_BIT); accessImage(DxvkCmdBuffer::ExecBuffer, *r.first, r.first->getAvailableSubresources(), r.first->info().layout, - VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, accessFlags); + VK_PIPELINE_STAGE_2_ALL_COMMANDS_BIT, accessFlags, DxvkAccessOp::None); m_cmd->track(r.first, r.second.test(DxvkAccess::Write) ? 
DxvkAccess::Write : DxvkAccess::Read); } @@ -3089,12 +3093,12 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *dstView->image(), dstView->imageSubresources(), dstLayout, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT); + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, DxvkAccessOp::None); accessImage(DxvkCmdBuffer::ExecBuffer, *srcView->image(), srcView->imageSubresources(), srcLayout, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT); + VK_ACCESS_2_SHADER_READ_BIT, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -3144,14 +3148,12 @@ namespace dxvk { blitInfo.filter = filter; m_cmd->cmdBlitImage(&blitInfo); - - accessImage(DxvkCmdBuffer::ExecBuffer, - *dstView->image(), dstView->imageSubresources(), dstLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT); - - accessImage(DxvkCmdBuffer::ExecBuffer, - *srcView->image(), srcView->imageSubresources(), srcLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT); + + accessImage(DxvkCmdBuffer::ExecBuffer, *dstView->image(), dstView->imageSubresources(), dstLayout, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); + + accessImage(DxvkCmdBuffer::ExecBuffer, *srcView->image(), srcView->imageSubresources(), srcLayout, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT, DxvkAccessOp::None); m_cmd->track(dstView->image(), DxvkAccess::Write); m_cmd->track(srcView->image(), DxvkAccess::Read); @@ -3302,10 +3304,10 @@ namespace dxvk { bufferSlice, bufferRowAlignment, bufferSliceAlignment); accessImage(cmdBuffer, *image, dstSubresourceRange, dstImageLayoutTransfer, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT); + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); accessBuffer(cmdBuffer, *buffer, bufferOffset, dataSize, - 
VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT); + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT, DxvkAccessOp::None); m_cmd->track(image, DxvkAccess::Write); m_cmd->track(buffer, DxvkAccess::Read); @@ -3541,11 +3543,11 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *image, vk::makeSubresourceRange(imageSubresource), - imageLayout, stages, access); + imageLayout, stages, access, DxvkAccessOp::None); accessBuffer(DxvkCmdBuffer::ExecBuffer, *bufferView, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT); + VK_ACCESS_2_SHADER_READ_BIT, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -3593,12 +3595,11 @@ namespace dxvk { image, imageSubresource, imageOffset, imageExtent, srcImageLayoutTransfer, bufferSlice, bufferRowAlignment, bufferSliceAlignment); - accessImage(DxvkCmdBuffer::ExecBuffer, - *image, srcSubresourceRange, srcImageLayoutTransfer, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *image, srcSubresourceRange, srcImageLayoutTransfer, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT, DxvkAccessOp::None); accessBuffer(DxvkCmdBuffer::ExecBuffer, *buffer, bufferOffset, dataSize, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT); + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); m_cmd->track(buffer, DxvkAccess::Write); m_cmd->track(image, DxvkAccess::Read); @@ -3769,11 +3770,10 @@ namespace dxvk { workgroupCount.depth); accessBuffer(DxvkCmdBuffer::ExecBuffer, *bufferView, - VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT); + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_WRITE_BIT, DxvkAccessOp::None); - accessImage(DxvkCmdBuffer::ExecBuffer, *image, - vk::makeSubresourceRange(imageSubresource), imageLayout, - 
VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *image, vk::makeSubresourceRange(imageSubresource), imageLayout, + VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, VK_ACCESS_2_SHADER_READ_BIT, DxvkAccessOp::None); m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); @@ -3894,7 +3894,7 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *imageView->image(), imageView->imageSubresources(), - clearLayout, clearStages, clearAccess); + clearLayout, clearStages, clearAccess, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -3983,10 +3983,9 @@ namespace dxvk { m_cmd->cmdDispatch(cmdBuffer, workgroups.width, workgroups.height, workgroups.depth); - accessImage(cmdBuffer, - *imageView->image(), imageView->imageSubresources(), + accessImage(cmdBuffer, *imageView->image(), imageView->imageSubresources(), VK_IMAGE_LAYOUT_GENERAL, VK_PIPELINE_STAGE_2_COMPUTE_SHADER_BIT, - VK_ACCESS_2_SHADER_WRITE_BIT); + VK_ACCESS_2_SHADER_WRITE_BIT, DxvkAccessOp::None); if (cmdBuffer == DxvkCmdBuffer::ExecBuffer) m_flags.set(DxvkContextFlag::ForceWriteAfterWriteSync); @@ -4057,13 +4056,11 @@ namespace dxvk { m_cmd->cmdCopyImage(DxvkCmdBuffer::ExecBuffer, &copyInfo); } - accessImage(DxvkCmdBuffer::ExecBuffer, - *dstImage, dstSubresourceRange, dstImageLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, dstSubresourceRange, dstImageLayout, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); - accessImage(DxvkCmdBuffer::ExecBuffer, - *srcImage, srcSubresourceRange, srcImageLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *srcImage, srcSubresourceRange, srcImageLayout, + VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_READ_BIT, DxvkAccessOp::None);
m_cmd->track(dstImage, DxvkAccess::Write); m_cmd->track(srcImage, DxvkAccess::Read); @@ -4259,14 +4256,12 @@ namespace dxvk { m_cmd->cmdDraw(3, dstSubresource.layerCount, 0, 0); m_cmd->cmdEndRendering(); - accessImage(DxvkCmdBuffer::ExecBuffer, - *srcImage, srcSubresourceRange, srcLayout, - VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *srcImage, srcSubresourceRange, + srcLayout, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, DxvkAccessOp::None); - accessImage(DxvkCmdBuffer::ExecBuffer, - *dstImage, dstSubresourceRange, - dstLayout, dstStages, dstAccess); + accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, dstSubresourceRange, + dstLayout, dstStages, dstAccess, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -4357,7 +4352,8 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, *buffer, offset, SparseMemoryPageSize * pageCount, VK_PIPELINE_STAGE_2_TRANSFER_BIT, - ToBuffer ? VK_ACCESS_2_TRANSFER_WRITE_BIT : VK_ACCESS_2_TRANSFER_READ_BIT); + ToBuffer ? VK_ACCESS_2_TRANSFER_WRITE_BIT : VK_ACCESS_2_TRANSFER_READ_BIT, + DxvkAccessOp::None); } @@ -4406,7 +4402,8 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, *sparse, 0, sparse->info().size, VK_PIPELINE_STAGE_2_TRANSFER_BIT, - ToBuffer ? VK_ACCESS_2_TRANSFER_READ_BIT : VK_ACCESS_2_TRANSFER_WRITE_BIT); + ToBuffer ? VK_ACCESS_2_TRANSFER_READ_BIT : VK_ACCESS_2_TRANSFER_WRITE_BIT, + DxvkAccessOp::None); m_cmd->track(sparse, ToBuffer ? DxvkAccess::Read : DxvkAccess::Write); m_cmd->track(buffer, ToBuffer ? DxvkAccess::Write : DxvkAccess::Read); @@ -4484,7 +4481,7 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *sparse, sparseSubresources, transferLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, transferAccess); + VK_PIPELINE_STAGE_2_TRANSFER_BIT, transferAccess, DxvkAccessOp::None); m_cmd->track(sparse, ToBuffer ? 
DxvkAccess::Read : DxvkAccess::Write); m_cmd->track(buffer, ToBuffer ? DxvkAccess::Write : DxvkAccess::Read); @@ -4533,12 +4530,12 @@ namespace dxvk { accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, dstSubresourceRange, dstLayout, VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_WRITE_BIT); + VK_ACCESS_2_TRANSFER_WRITE_BIT, DxvkAccessOp::None); accessImage(DxvkCmdBuffer::ExecBuffer, *srcImage, srcSubresourceRange, srcLayout, VK_PIPELINE_STAGE_2_TRANSFER_BIT, - VK_ACCESS_2_TRANSFER_READ_BIT); + VK_ACCESS_2_TRANSFER_READ_BIT, DxvkAccessOp::None); m_cmd->track(dstImage, DxvkAccess::Write); m_cmd->track(srcImage, DxvkAccess::Read); @@ -4631,13 +4628,15 @@ namespace dxvk { *srcImage, srcSubresourceRange, srcLayout, VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT); + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT, + DxvkAccessOp::None); accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, dstSubresourceRange, dstLayout, VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, - VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT); + VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT, + DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -4844,14 +4843,12 @@ namespace dxvk { m_cmd->cmdDraw(3, region.dstSubresource.layerCount, 0, 0); m_cmd->cmdEndRendering(); - accessImage(DxvkCmdBuffer::ExecBuffer, - *srcImage, srcSubresourceRange, srcLayout, - VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, - VK_ACCESS_2_SHADER_READ_BIT); + accessImage(DxvkCmdBuffer::ExecBuffer, *srcImage, srcSubresourceRange, + srcLayout, VK_PIPELINE_STAGE_2_FRAGMENT_SHADER_BIT, + VK_ACCESS_2_SHADER_READ_BIT, DxvkAccessOp::None); - accessImage(DxvkCmdBuffer::ExecBuffer, - *dstImage, dstSubresourceRange, - dstLayout, dstStages, dstAccess); + accessImage(DxvkCmdBuffer::ExecBuffer, *dstImage, 
dstSubresourceRange, + dstLayout, dstStages, dstAccess, DxvkAccessOp::None); if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) m_cmd->cmdEndDebugUtilsLabel(DxvkCmdBuffer::ExecBuffer); @@ -5148,9 +5145,9 @@ namespace dxvk { m_initBarriers.addImageBarrier(barrier); } else { - accessImage(DxvkCmdBuffer::SdmaBuffer, - *image, image->getAvailableSubresources(), transferLayout, - VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT); + accessImage(DxvkCmdBuffer::SdmaBuffer, *image, image->getAvailableSubresources(), + transferLayout, VK_PIPELINE_STAGE_2_TRANSFER_BIT, VK_ACCESS_2_TRANSFER_WRITE_BIT, + DxvkAccessOp::None); } m_cmd->track(source, DxvkAccess::Read); @@ -5296,7 +5293,7 @@ namespace dxvk { ops.depthOps.loadLayout, depthStages, 0, depthAttachment.layout, - depthStages, depthAccess); + depthStages, depthAccess, DxvkAccessOp::None); } for (uint32_t i = 0; i < MaxNumRenderTargets; i++) { @@ -5316,7 +5313,7 @@ namespace dxvk { VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, 0, colorAttachment.layout, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, - colorAccess); + colorAccess, DxvkAccessOp::None); } } @@ -5346,7 +5343,8 @@ namespace dxvk { VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, srcAccess, ops.depthOps.storeLayout, depthAttachment.view->image()->info().stages, - depthAttachment.view->image()->info().access); + depthAttachment.view->image()->info().access, + DxvkAccessOp::None); } for (uint32_t i = 0; i < MaxNumRenderTargets; i++) { @@ -5362,7 +5360,8 @@ namespace dxvk { VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT, ops.colorOps[i].storeLayout, colorAttachment.view->image()->info().stages, - colorAttachment.view->image()->info().access); + colorAttachment.view->image()->info().access, + DxvkAccessOp::None); } } @@ -5583,7 +5582,8 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.xfb.activeCounters[i], VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT, VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | - 
VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); + VK_ACCESS_2_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + DxvkAccessOp::None); m_cmd->track(m_state.xfb.activeCounters[i].buffer(), DxvkAccess::Write); } @@ -5968,7 +5968,7 @@ namespace dxvk { descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.imageView->image()->hasGfxStores())) - accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access, DxvkAccessOp::None); m_cmd->track(res.imageView->image(), DxvkAccess::Read); } else { @@ -5992,7 +5992,7 @@ namespace dxvk { descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || res.imageView->image()->hasGfxStores()) - accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access, binding.accessOp); m_cmd->track(res.imageView->image(), (binding.access & vk::AccessWriteMask) ? 
DxvkAccess::Write : DxvkAccess::Read); @@ -6017,7 +6017,7 @@ namespace dxvk { descriptorInfo.image.imageLayout = res.imageView->image()->info().layout; if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.imageView->image()->hasGfxStores())) - accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access); + accessImage(DxvkCmdBuffer::ExecBuffer, *res.imageView, util::pipelineStages(binding.stage), binding.access, DxvkAccessOp::None); m_cmd->track(res.sampler); m_cmd->track(res.imageView->image(), DxvkAccess::Read); @@ -6035,7 +6035,7 @@ namespace dxvk { descriptorInfo.texelBuffer = res.bufferView->handle(); if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferView->buffer()->hasGfxStores())) - accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access); + accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access, DxvkAccessOp::None); m_cmd->track(res.bufferView->buffer(), DxvkAccess::Read); } else { @@ -6050,7 +6050,7 @@ namespace dxvk { descriptorInfo.texelBuffer = res.bufferView->handle(); if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || res.bufferView->buffer()->hasGfxStores()) - accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access); + accessBuffer(DxvkCmdBuffer::ExecBuffer, *res.bufferView, util::pipelineStages(binding.stage), binding.access, binding.accessOp); m_cmd->track(res.bufferView->buffer(), (binding.access & vk::AccessWriteMask) ? 
DxvkAccess::Write : DxvkAccess::Read); @@ -6066,7 +6066,7 @@ namespace dxvk { descriptorInfo = res.bufferSlice.getDescriptor(); if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferSlice.buffer()->hasGfxStores())) - accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access); + accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access, DxvkAccessOp::None); m_cmd->track(res.bufferSlice.buffer(), DxvkAccess::Read); } else { @@ -6083,7 +6083,7 @@ namespace dxvk { descriptorInfo = res.bufferSlice.getDescriptor(); if (BindPoint == VK_PIPELINE_BIND_POINT_COMPUTE || unlikely(res.bufferSlice.buffer()->hasGfxStores())) - accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access); + accessBuffer(DxvkCmdBuffer::ExecBuffer, res.bufferSlice, util::pipelineStages(binding.stage), binding.access, binding.accessOp); m_cmd->track(res.bufferSlice.buffer(), (binding.access & vk::AccessWriteMask) ? 
DxvkAccess::Write : DxvkAccess::Read); @@ -6222,7 +6222,8 @@ namespace dxvk { *attachment.view->image(), attachment.view->imageSubresources(), oldLayout, VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT); + VK_ACCESS_2_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_2_COLOR_ATTACHMENT_READ_BIT, DxvkAccessOp::None); m_cmd->track(attachment.view->image(), DxvkAccess::Write); } @@ -6233,14 +6234,17 @@ namespace dxvk { const DxvkAttachment& attachment, VkImageLayout oldLayout) { if (oldLayout != attachment.view->image()->info().layout) { + VkAccessFlags2 access = VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_READ_BIT; + + if (oldLayout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL) + access |= VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + accessImage(DxvkCmdBuffer::ExecBuffer, *attachment.view->image(), attachment.view->imageSubresources(), oldLayout, VK_PIPELINE_STAGE_2_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT, - oldLayout != VK_IMAGE_LAYOUT_DEPTH_STENCIL_READ_ONLY_OPTIMAL - ? 
VK_ACCESS_2_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT - : VK_ACCESS_2_NONE); + access, DxvkAccessOp::None); m_cmd->track(attachment.view->image(), DxvkAccess::Write); } @@ -6385,7 +6389,7 @@ namespace dxvk { if (unlikely(m_state.vi.indexBuffer.buffer()->hasGfxStores())) { accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.vi.indexBuffer, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT, DxvkAccessOp::None); } m_cmd->track(m_state.vi.indexBuffer.buffer(), DxvkAccess::Read); @@ -6427,7 +6431,7 @@ namespace dxvk { if (unlikely(m_state.vi.vertexBuffers[binding].buffer()->hasGfxStores())) { accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.vi.vertexBuffers[binding], - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT); + VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, DxvkAccessOp::None); } m_cmd->track(m_state.vi.vertexBuffers[binding].buffer(), DxvkAccess::Read); @@ -6489,7 +6493,7 @@ namespace dxvk { accessBuffer(DxvkCmdBuffer::ExecBuffer, m_state.xfb.buffers[i], VK_PIPELINE_STAGE_2_TRANSFORM_FEEDBACK_BIT_EXT, - VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); + VK_ACCESS_2_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, DxvkAccessOp::None); m_cmd->track(std::move(buffer), DxvkAccess::Write); } @@ -6814,7 +6818,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_STORAGE_TEXEL_BUFFER: { if (slot.bufferView) { if (!IsGraphics || slot.bufferView->buffer()->hasGfxStores()) - requiresBarrier |= checkBufferViewBarrier(slot.bufferView, binding.access); + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, binding.access, binding.accessOp); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.bufferView->buffer()->trackGfxStores(); } @@ -6822,18 +6826,18 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_UNIFORM_TEXEL_BUFFER: { if (slot.bufferView && (!IsGraphics || slot.bufferView->buffer()->hasGfxStores())) - requiresBarrier |= 
checkBufferViewBarrier(slot.bufferView, binding.access); + requiresBarrier |= checkBufferViewBarrier(slot.bufferView, binding.access, DxvkAccessOp::None); } break; case VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER: { if (slot.bufferSlice.length() && (!IsGraphics || slot.bufferSlice.buffer()->hasGfxStores())) - requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access); + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access, DxvkAccessOp::None); } break; case VK_DESCRIPTOR_TYPE_STORAGE_BUFFER: { if (slot.bufferSlice.length()) { if (!IsGraphics || slot.bufferSlice.buffer()->hasGfxStores()) - requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access); + requiresBarrier |= checkBufferBarrier(slot.bufferSlice, binding.access, binding.accessOp); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.bufferSlice.buffer()->trackGfxStores(); } @@ -6842,7 +6846,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_STORAGE_IMAGE: { if (slot.imageView) { if (!IsGraphics || slot.imageView->image()->hasGfxStores()) - requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access); + requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access, binding.accessOp); else if (binding.access & vk::AccessWriteMask) requiresBarrier |= !slot.imageView->image()->trackGfxStores(); } @@ -6851,7 +6855,7 @@ namespace dxvk { case VK_DESCRIPTOR_TYPE_SAMPLED_IMAGE: case VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER: { if (slot.imageView && (!IsGraphics || slot.imageView->image()->hasGfxStores())) - requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access); + requiresBarrier |= checkImageViewBarrier(slot.imageView, binding.access, DxvkAccessOp::None); } break; default: @@ -6904,13 +6908,14 @@ namespace dxvk { if (xfbBufferSlice.length()) { requiresBarrier |= !xfbBufferSlice.buffer()->trackGfxStores(); requiresBarrier |= checkBufferBarrier( - xfbBufferSlice, VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT); + xfbBufferSlice, 
VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT, DxvkAccessOp::None); if (xfbCounterSlice.length()) { requiresBarrier |= !xfbCounterSlice.buffer()->trackGfxStores(); requiresBarrier |= checkBufferBarrier(xfbCounterSlice, VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | - VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT); + VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT, + DxvkAccessOp::None); } } } @@ -6930,7 +6935,8 @@ namespace dxvk { for (uint32_t i = 0; i < slices.size(); i++) { if (slices[i]->length() && slices[i]->buffer()->hasGfxStores()) { - if (checkBufferBarrier(*slices[i], VK_ACCESS_INDIRECT_COMMAND_READ_BIT)) + if (checkBufferBarrier(*slices[i], + VK_ACCESS_INDIRECT_COMMAND_READ_BIT, DxvkAccessOp::None)) return true; } } @@ -6942,7 +6948,8 @@ namespace dxvk { const auto& indexBufferSlice = m_state.vi.indexBuffer; if (indexBufferSlice.length() && indexBufferSlice.buffer()->hasGfxStores()) { - if (checkBufferBarrier(indexBufferSlice, VK_ACCESS_INDEX_READ_BIT)) + if (checkBufferBarrier(indexBufferSlice, + VK_ACCESS_INDEX_READ_BIT, DxvkAccessOp::None)) return true; } } @@ -6956,7 +6963,8 @@ namespace dxvk { const auto& vertexBufferSlice = m_state.vi.vertexBuffers[binding]; if (vertexBufferSlice.length() && vertexBufferSlice.buffer()->hasGfxStores()) { - if (checkBufferBarrier(vertexBufferSlice, VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT)) + if (checkBufferBarrier(vertexBufferSlice, + VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT, DxvkAccessOp::None)) return true; } } @@ -6969,10 +6977,11 @@ namespace dxvk { template bool DxvkContext::checkBufferBarrier( const DxvkBufferSlice& bufferSlice, - VkAccessFlags access) { - return checkResourceBarrier([this, &bufferSlice] (DxvkAccess access) { + VkAccessFlags access, + DxvkAccessOp accessOp) { + return checkResourceBarrier([this, &bufferSlice, accessOp] (DxvkAccess access) { return resourceHasAccess(*bufferSlice.buffer(), - bufferSlice.offset(), bufferSlice.length(), access); + bufferSlice.offset(), bufferSlice.length(), 
access, accessOp); }, access); } @@ -6980,9 +6989,10 @@ namespace dxvk { template bool DxvkContext::checkBufferViewBarrier( const Rc& bufferView, - VkAccessFlags access) { - return checkResourceBarrier([this, &bufferView] (DxvkAccess access) { - return resourceHasAccess(*bufferView, access); + VkAccessFlags access, + DxvkAccessOp accessOp) { + return checkResourceBarrier([this, &bufferView, accessOp] (DxvkAccess access) { + return resourceHasAccess(*bufferView, access, accessOp); }, access); } @@ -6990,9 +7000,10 @@ namespace dxvk { template bool DxvkContext::checkImageViewBarrier( const Rc& imageView, - VkAccessFlags access) { - return checkResourceBarrier([this, &imageView] (DxvkAccess access) { - return resourceHasAccess(*imageView, access); + VkAccessFlags access, + DxvkAccessOp accessOp) { + return checkResourceBarrier([this, &imageView, accessOp] (DxvkAccess access) { + return resourceHasAccess(*imageView, access, accessOp); }, access); } @@ -7717,12 +7728,14 @@ namespace dxvk { const VkImageSubresourceRange& subresources, VkImageLayout srcLayout, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess) { + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp) { accessImage(cmdBuffer, image, subresources, srcLayout, srcStages, srcAccess, image.info().layout, image.info().stages, - image.info().access); + image.info().access, + accessOp); } @@ -7730,11 +7743,12 @@ namespace dxvk { DxvkCmdBuffer cmdBuffer, const DxvkImageView& imageView, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess) { + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp) { accessImage(cmdBuffer, *imageView.image(), imageView.imageSubresources(), imageView.image()->info().layout, - srcStages, srcAccess); + srcStages, srcAccess, accessOp); } @@ -7747,7 +7761,8 @@ namespace dxvk { VkAccessFlags2 srcAccess, VkImageLayout dstLayout, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess) { + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp) { auto& batch = 
getBarrierBatch(cmdBuffer); if (srcLayout == VK_IMAGE_LAYOUT_UNDEFINED || srcLayout == VK_IMAGE_LAYOUT_PREINITIALIZED) @@ -7808,11 +7823,13 @@ namespace dxvk { VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess) { + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp) { accessBuffer(cmdBuffer, buffer, offset, size, srcStages, srcAccess, buffer.info().stages, - buffer.info().access); + buffer.info().access, + accessOp); } @@ -7824,7 +7841,8 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess) { + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp) { if (unlikely(!size)) return; @@ -7856,12 +7874,14 @@ namespace dxvk { DxvkCmdBuffer cmdBuffer, const DxvkBufferSlice& bufferSlice, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess) { + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp) { accessBuffer(cmdBuffer, *bufferSlice.buffer(), bufferSlice.offset(), bufferSlice.length(), - srcStages, srcAccess); + srcStages, srcAccess, + accessOp); } @@ -7871,13 +7891,15 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess) { + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp) { accessBuffer(cmdBuffer, *bufferSlice.buffer(), bufferSlice.offset(), bufferSlice.length(), srcStages, srcAccess, - dstStages, dstAccess); + dstStages, dstAccess, + accessOp); } @@ -7885,12 +7907,13 @@ namespace dxvk { DxvkCmdBuffer cmdBuffer, DxvkBufferView& bufferView, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess) { + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp) { accessBuffer(cmdBuffer, *bufferView.buffer(), bufferView.info().offset, bufferView.info().size, - srcStages, srcAccess); + srcStages, srcAccess, accessOp); } @@ -7900,13 +7923,15 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - 
VkAccessFlags2 dstAccess) { + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp) { accessBuffer(cmdBuffer, *bufferView.buffer(), bufferView.info().offset, bufferView.info().size, srcStages, srcAccess, - dstStages, dstAccess); + dstStages, dstAccess, + accessOp); } @@ -7921,7 +7946,8 @@ namespace dxvk { *m_state.id.argBuffer.buffer(), m_state.id.argBuffer.offset() + offset, dataSize, VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, - VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR); + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, + DxvkAccessOp::None); } @@ -7931,7 +7957,8 @@ namespace dxvk { *m_state.id.cntBuffer.buffer(), m_state.id.cntBuffer.offset() + offset, sizeof(uint32_t), VK_PIPELINE_STAGE_2_DRAW_INDIRECT_BIT, - VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR); + VK_ACCESS_2_INDIRECT_COMMAND_READ_BIT_KHR, + DxvkAccessOp::None); } @@ -7940,10 +7967,13 @@ namespace dxvk { VkDeviceSize offset, VkDeviceSize size, DxvkAccess access) { - bool flush = resourceHasAccess(buffer, offset, size, DxvkAccess::Write); + bool flush = resourceHasAccess(buffer, offset, size, + DxvkAccess::Write, DxvkAccessOp::None); - if (access == DxvkAccess::Write && !flush) - flush = resourceHasAccess(buffer, offset, size, DxvkAccess::Read); + if (access == DxvkAccess::Write && !flush) { + flush = resourceHasAccess(buffer, offset, size, + DxvkAccess::Read, DxvkAccessOp::None); + } if (flush) flushBarriers(); @@ -7964,10 +7994,13 @@ namespace dxvk { DxvkImage& image, const VkImageSubresourceRange& subresources, DxvkAccess access) { - bool flush = resourceHasAccess(image, subresources, DxvkAccess::Write); + bool flush = resourceHasAccess(image, subresources, + DxvkAccess::Write, DxvkAccessOp::None); - if (access == DxvkAccess::Write && !flush) - flush = resourceHasAccess(image, subresources, DxvkAccess::Read); + if (access == DxvkAccess::Write && !flush) { + flush = resourceHasAccess(image, subresources, + DxvkAccess::Read, DxvkAccessOp::None); + } if (flush) flushBarriers(); @@ -7994,7 +8027,8 @@ namespace dxvk { 
DxvkBuffer& buffer, VkDeviceSize offset, VkDeviceSize size, - DxvkAccess access) { + DxvkAccess access, + DxvkAccessOp accessOp) { if (unlikely(!size)) return false; @@ -8009,17 +8043,19 @@ namespace dxvk { bool DxvkContext::resourceHasAccess( DxvkBufferView& bufferView, - DxvkAccess access) { + DxvkAccess access, + DxvkAccessOp accessOp) { return resourceHasAccess(*bufferView.buffer(), bufferView.info().offset, - bufferView.info().size, access); + bufferView.info().size, access, accessOp); } bool DxvkContext::resourceHasAccess( DxvkImage& image, const VkImageSubresourceRange& subresources, - DxvkAccess access) { + DxvkAccess access, + DxvkAccessOp accessOp) { uint32_t layerCount = image.info().numLayers; // Subresources are enumerated in such a way that array layers of @@ -8053,8 +8089,9 @@ namespace dxvk { bool DxvkContext::resourceHasAccess( DxvkImageView& imageView, - DxvkAccess access) { - return resourceHasAccess(*imageView.image(), imageView.imageSubresources(), access); + DxvkAccess access, + DxvkAccessOp accessOp) { + return resourceHasAccess(*imageView.image(), imageView.imageSubresources(), access, accessOp); } diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 725c9cc1a..d975545a6 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1771,17 +1771,20 @@ namespace dxvk { template bool checkBufferBarrier( const DxvkBufferSlice& bufferSlice, - VkAccessFlags access); + VkAccessFlags access, + DxvkAccessOp accessOp); template bool checkBufferViewBarrier( const Rc& bufferView, - VkAccessFlags access); + VkAccessFlags access, + DxvkAccessOp accessOp); template bool checkImageViewBarrier( const Rc& imageView, - VkAccessFlags access); + VkAccessFlags access, + DxvkAccessOp accessOp); template DxvkAccessFlags getAllowedStorageHazards() { @@ -1901,13 +1904,15 @@ namespace dxvk { const VkImageSubresourceRange& subresources, VkImageLayout srcLayout, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess); + VkAccessFlags2 
srcAccess, + DxvkAccessOp accessOp); void accessImage( DxvkCmdBuffer cmdBuffer, const DxvkImageView& imageView, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess); + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp); void accessImage( DxvkCmdBuffer cmdBuffer, @@ -1918,7 +1923,8 @@ namespace dxvk { VkAccessFlags2 srcAccess, VkImageLayout dstLayout, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess); + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, @@ -1926,7 +1932,8 @@ namespace dxvk { VkDeviceSize offset, VkDeviceSize size, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess); + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, @@ -1936,13 +1943,15 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess); + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, const DxvkBufferSlice& bufferSlice, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess); + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, @@ -1950,13 +1959,15 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess); + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, DxvkBufferView& bufferView, VkPipelineStageFlags2 srcStages, - VkAccessFlags2 srcAccess); + VkAccessFlags2 srcAccess, + DxvkAccessOp accessOp); void accessBuffer( DxvkCmdBuffer cmdBuffer, @@ -1964,7 +1975,8 @@ namespace dxvk { VkPipelineStageFlags2 srcStages, VkAccessFlags2 srcAccess, VkPipelineStageFlags2 dstStages, - VkAccessFlags2 dstAccess); + VkAccessFlags2 dstAccess, + DxvkAccessOp accessOp); void accessDrawBuffer( VkDeviceSize offset, @@ -2000,20 +2012,24 @@ namespace dxvk { DxvkBuffer& buffer, VkDeviceSize 
offset, VkDeviceSize size, - DxvkAccess access); + DxvkAccess access, + DxvkAccessOp accessOp); bool resourceHasAccess( DxvkBufferView& bufferView, - DxvkAccess access); + DxvkAccess access, + DxvkAccessOp accessOp); bool resourceHasAccess( DxvkImage& image, const VkImageSubresourceRange& subresources, - DxvkAccess access); + DxvkAccess access, + DxvkAccessOp accessOp); bool resourceHasAccess( DxvkImageView& imageView, - DxvkAccess access); + DxvkAccess access, + DxvkAccessOp accessOp); DxvkBarrierBatch& getBarrierBatch( DxvkCmdBuffer cmdBuffer); From dd1ca4ce59ba10ef32e04c99b7d354628dc173e0 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Sat, 15 Feb 2025 22:16:03 +0100 Subject: [PATCH 27/44] [dxvk] Track order-invariant access ops in barrier tracker This elides barriers between draws or dispatches if we can prove order-invariance through atomic operations. --- src/dxvk/dxvk_barrier.cpp | 43 +++++++++++++++++++++++++++++++------- src/dxvk/dxvk_barrier.h | 24 +++++++++++++++++---- src/dxvk/dxvk_context.cpp | 18 ++++++++-------- src/dxvk/dxvk_pipelayout.h | 3 +++ 4 files changed, 67 insertions(+), 21 deletions(-) diff --git a/src/dxvk/dxvk_barrier.cpp b/src/dxvk/dxvk_barrier.cpp index 287a0ff2f..9d9feb191 100644 --- a/src/dxvk/dxvk_barrier.cpp +++ b/src/dxvk/dxvk_barrier.cpp @@ -20,20 +20,37 @@ namespace dxvk { bool DxvkBarrierTracker::findRange( const DxvkAddressRange& range, - DxvkAccess accessType) const { + DxvkAccess accessType, + DxvkAccessOp accessOp) const { uint32_t rootIndex = computeRootIndex(range, accessType); - return findNode(range, rootIndex); + uint32_t nodeIndex = findNode(range, rootIndex); + + if (likely(!nodeIndex || accessOp == DxvkAccessOp::None)) + return nodeIndex; + + // If we are checking for a specific order-invariant store + // op, the op must have been the only op used to access the + // resource, and the tracked range must cover the requested + // range in its entirety so we can rule out that other parts + // of the resource have 
been accessed in a different way. + auto& node = m_nodes[nodeIndex]; + + return node.payload.accessOps != DxvkAccessOps(accessOp) + || !node.addressRange.contains(range); } void DxvkBarrierTracker::insertRange( const DxvkAddressRange& range, - DxvkAccess accessType) { - uint32_t rootIndex = computeRootIndex(range, accessType); + DxvkAccess accessType, + DxvkAccessOp accessOp) { + DxvkBarrierPayload payload = { }; + payload.accessOps.set(accessOp); // If we can just insert the node with no conflicts, // we don't have to do anything. - uint32_t nodeIndex = insertNode(range, rootIndex); + uint32_t rootIndex = computeRootIndex(range, accessType); + uint32_t nodeIndex = insertNode(range, rootIndex, payload); if (likely(!nodeIndex)) return; @@ -41,6 +58,7 @@ namespace dxvk { // If there's an existing node and it contains the entire // range we want to add already, also don't do anything. auto& node = m_nodes[nodeIndex]; + node.payload.accessOps.set(payload.accessOps); if (node.addressRange.contains(range)) return; @@ -82,12 +100,14 @@ namespace dxvk { mergedRange.rangeStart = std::min(mergedRange.rangeStart, node.addressRange.rangeStart); mergedRange.rangeEnd = std::max(mergedRange.rangeEnd, node.addressRange.rangeEnd); + payload.accessOps.set(node.payload.accessOps); + removeNode(nodeIndex, rootIndex); nodeIndex = findNode(range, rootIndex); } - insertNode(mergedRange, rootIndex); + insertNode(mergedRange, rootIndex, payload); } @@ -166,7 +186,8 @@ namespace dxvk { uint32_t DxvkBarrierTracker::insertNode( const DxvkAddressRange& range, - uint32_t rootIndex) { + uint32_t rootIndex, + DxvkBarrierPayload payload) { // Check if the given root is valid at all uint64_t rootBit = uint64_t(1u) << (rootIndex - 1u); @@ -178,6 +199,7 @@ namespace dxvk { auto& node = m_nodes[rootIndex]; node.header = 0; node.addressRange = range; + node.payload = payload; return 0; } else { // Traverse tree and abort if we find any range @@ -209,6 +231,7 @@ namespace dxvk { node.setRed(true); 
node.setParent(parentIndex); node.addressRange = range; + node.payload = payload; // Only do the fixup to maintain red-black properties if // we haven't marked the root node as red in a deletion. @@ -238,6 +261,7 @@ namespace dxvk { childIndex = m_nodes[childIndex].child(0); node.addressRange = m_nodes[childIndex].addressRange; + node.payload = m_nodes[childIndex].payload; removeNode(childIndex, rootIndex); } else { // Deletion is expected to be exceptionally rare, to the point of @@ -268,6 +292,7 @@ namespace dxvk { node.setRed(child.isRed()); node.addressRange = child.addressRange; + node.payload = child.payload; if (cl) m_nodes[cl].setParent(nodeIndex); if (cr) m_nodes[cr].setParent(nodeIndex); @@ -378,6 +403,7 @@ namespace dxvk { node.setChild(1, rr); std::swap(node.addressRange, m_nodes[r].addressRange); + std::swap(node.payload, m_nodes[r].payload); } @@ -406,6 +432,7 @@ namespace dxvk { node.setChild(1, l); std::swap(node.addressRange, m_nodes[l].addressRange); + std::swap(node.payload, m_nodes[l].payload); } @@ -498,4 +525,4 @@ namespace dxvk { flush(list); } -} \ No newline at end of file +} diff --git a/src/dxvk/dxvk_barrier.h b/src/dxvk/dxvk_barrier.h index 3b4cdf9c5..fb0d0726f 100644 --- a/src/dxvk/dxvk_barrier.h +++ b/src/dxvk/dxvk_barrier.h @@ -42,6 +42,14 @@ namespace dxvk { }; + /** + * \brief Barrier node payload + */ + struct DxvkBarrierPayload { + DxvkAccessOps accessOps = 0u; + }; + + /** * \brief Barrier tree node * @@ -62,6 +70,9 @@ namespace dxvk { // Address range of the node DxvkAddressRange addressRange = { }; + // Node payload + DxvkBarrierPayload payload = { }; + void setRed(bool red) { header &= ~uint64_t(1u); header |= uint64_t(red); @@ -117,21 +128,25 @@ namespace dxvk { * * \param [in] range Resource range * \param [in] accessType Access type + * \param [in] accessOp Access operation * \returns \c true if the range has a pending access */ bool findRange( const DxvkAddressRange& range, - DxvkAccess accessType) const; + DxvkAccess 
accessType, + DxvkAccessOp accessOp) const; /** * \brief Inserts address range for a given access type * * \param [in] range Resource range * \param [in] accessType Access type + * \param [in] accessOp Access operation */ void insertRange( const DxvkAddressRange& range, - DxvkAccess accessType); + DxvkAccess accessType, + DxvkAccessOp accessOp); /** * \brief Clears the entire structure @@ -166,7 +181,8 @@ namespace dxvk { uint32_t insertNode( const DxvkAddressRange& range, - uint32_t rootIndex); + uint32_t rootIndex, + DxvkBarrierPayload payload); void removeNode( uint32_t nodeIndex, @@ -285,4 +301,4 @@ namespace dxvk { }; -} \ No newline at end of file +} diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index b07bc2518..2cbc83a13 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -7796,9 +7796,9 @@ namespace dxvk { + (subresources.baseArrayLayer + subresources.layerCount - 1u); if (hasWrite) - m_barrierTracker.insertRange(range, DxvkAccess::Write); + m_barrierTracker.insertRange(range, DxvkAccess::Write, accessOp); if (hasRead) - m_barrierTracker.insertRange(range, DxvkAccess::Read); + m_barrierTracker.insertRange(range, DxvkAccess::Read, accessOp); } else { DxvkAddressRange range; range.resource = image.getResourceId(); @@ -7808,9 +7808,9 @@ namespace dxvk { range.rangeEnd = range.rangeStart + subresources.layerCount - 1u; if (hasWrite) - m_barrierTracker.insertRange(range, DxvkAccess::Write); + m_barrierTracker.insertRange(range, DxvkAccess::Write, accessOp); if (hasRead) - m_barrierTracker.insertRange(range, DxvkAccess::Read); + m_barrierTracker.insertRange(range, DxvkAccess::Read, accessOp); } } } @@ -7863,9 +7863,9 @@ namespace dxvk { range.rangeEnd = offset + size - 1; if (srcAccess & vk::AccessWriteMask) - m_barrierTracker.insertRange(range, DxvkAccess::Write); + m_barrierTracker.insertRange(range, DxvkAccess::Write, accessOp); if (srcAccess & vk::AccessReadMask) - m_barrierTracker.insertRange(range, 
DxvkAccess::Read); + m_barrierTracker.insertRange(range, DxvkAccess::Read, accessOp); } } @@ -8037,7 +8037,7 @@ namespace dxvk { range.rangeStart = offset; range.rangeEnd = offset + size - 1; - return m_barrierTracker.findRange(range, access); + return m_barrierTracker.findRange(range, access, accessOp); } @@ -8071,7 +8071,7 @@ namespace dxvk { // Probe all subresources first, only check individual mip levels // if there are overlaps and if we are checking a subset of array // layers of multiple mips. - bool dirty = m_barrierTracker.findRange(range, access); + bool dirty = m_barrierTracker.findRange(range, access, accessOp); if (!dirty || subresources.levelCount == 1u || subresources.layerCount == layerCount) return dirty; @@ -8080,7 +8080,7 @@ namespace dxvk { range.rangeStart = i * layerCount + subresources.baseArrayLayer; range.rangeEnd = range.rangeStart + subresources.layerCount - 1u; - dirty = m_barrierTracker.findRange(range, access); + dirty = m_barrierTracker.findRange(range, access, accessOp); } return dirty; diff --git a/src/dxvk/dxvk_pipelayout.h b/src/dxvk/dxvk_pipelayout.h index 6271be481..07d3f8e41 100644 --- a/src/dxvk/dxvk_pipelayout.h +++ b/src/dxvk/dxvk_pipelayout.h @@ -29,6 +29,9 @@ namespace dxvk { UMax = 8, }; + using DxvkAccessOps = Flags; + + /** * \brief Descriptor set indices */ From 6f7a4681740e7656cd734d39fd1e63cab63885ff Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Mon, 17 Feb 2025 04:35:23 +0100 Subject: [PATCH 28/44] [dxvk] Fix global render pass barrier Only need to deal with common write-after-read scenarios, we can ignore writes since those will add extra barriers anyway. Also move this work out of the somewhat hot pipeline bind function. 
--- src/dxvk/dxvk_context.cpp | 79 ++++++++++++++------------------------ src/dxvk/dxvk_context.h | 8 ++-- src/dxvk/dxvk_graphics.cpp | 3 +- 3 files changed, 33 insertions(+), 57 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 2cbc83a13..f35ea2fbf 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -21,32 +21,17 @@ namespace dxvk { m_state.om.framebufferInfo = makeFramebufferInfo(m_state.om.renderTargets); m_descriptorManager = new DxvkDescriptorManager(device.ptr()); - // Default destination barriers for graphics pipelines - m_globalRoGraphicsBarrier.stages = m_device->getShaderPipelineStages() - | VK_PIPELINE_STAGE_TRANSFER_BIT - | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT - | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT - | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; - m_globalRoGraphicsBarrier.access = 0; + // Global barrier for graphics pipelines. This is only used to + // avoid write-after-read hazards after a render pass, so the + // access mask here can be zero. 
+ m_renderPassBarrierDst.stages = m_device->getShaderPipelineStages() + | VK_PIPELINE_STAGE_TRANSFER_BIT + | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT + | VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT + | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT; if (m_device->features().extTransformFeedback.transformFeedback) - m_globalRoGraphicsBarrier.stages |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; - - m_globalRwGraphicsBarrier = m_globalRoGraphicsBarrier; - m_globalRwGraphicsBarrier.stages |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT - | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; - - m_globalRwGraphicsBarrier.access |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT - | VK_ACCESS_INDEX_READ_BIT - | VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT - | VK_ACCESS_UNIFORM_READ_BIT - | VK_ACCESS_SHADER_READ_BIT - | VK_ACCESS_COLOR_ATTACHMENT_READ_BIT - | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT - | VK_ACCESS_TRANSFER_READ_BIT; - - if (m_device->features().extTransformFeedback.transformFeedback) - m_globalRwGraphicsBarrier.access |= VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT; + m_renderPassBarrierDst.stages |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; // Store the lifetime tracking bit as a context feature so // that we don't have to scan device features at draw time @@ -5230,6 +5215,14 @@ namespace dxvk { else this->transitionRenderTargetLayouts(false); + if (m_renderPassBarrierSrc.stages) { + accessMemory(DxvkCmdBuffer::ExecBuffer, + m_renderPassBarrierSrc.stages, m_renderPassBarrierSrc.access, + m_renderPassBarrierDst.stages, m_renderPassBarrierDst.access); + + m_renderPassBarrierSrc = DxvkGlobalPipelineBarrier(); + } + flushBarriers(); flushResolves(); @@ -5733,7 +5726,7 @@ namespace dxvk { } - bool DxvkContext::updateGraphicsPipelineState(DxvkGlobalPipelineBarrier srcBarrier) { + bool DxvkContext::updateGraphicsPipelineState() { bool oldIndependentSets = m_flags.test(DxvkContextFlag::GpIndependentSets); // Check which dynamic states need to be active. 
States that @@ -5808,19 +5801,9 @@ namespace dxvk { // Emit barrier based on pipeline properties, in order to avoid // accidental write-after-read hazards after the render pass. - DxvkGlobalPipelineBarrier pipelineBarrier = m_state.gp.pipeline->getGlobalBarrier(m_state.gp.state); - srcBarrier.stages |= pipelineBarrier.stages; - srcBarrier.access |= pipelineBarrier.access; - - if (srcBarrier.stages) { - DxvkGlobalPipelineBarrier dstBarrier = (srcBarrier.access & vk::AccessWriteMask) - ? m_globalRwGraphicsBarrier - : m_globalRoGraphicsBarrier; - - accessMemory(DxvkCmdBuffer::ExecBuffer, - srcBarrier.stages, srcBarrier.access, - dstBarrier.stages, dstBarrier.access); - } + DxvkGlobalPipelineBarrier srcBarrier = m_state.gp.pipeline->getGlobalBarrier(m_state.gp.state); + m_renderPassBarrierSrc.stages |= srcBarrier.stages; + m_renderPassBarrierSrc.access |= srcBarrier.access; if (unlikely(m_features.test(DxvkContextFeature::DebugUtils))) { uint32_t color = getGraphicsPipelineDebugColor(); @@ -6392,6 +6375,9 @@ namespace dxvk { VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, VK_ACCESS_INDEX_READ_BIT, DxvkAccessOp::None); } + m_renderPassBarrierSrc.stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; + m_renderPassBarrierSrc.access |= VK_ACCESS_INDEX_READ_BIT; + m_cmd->track(m_state.vi.indexBuffer.buffer(), DxvkAccess::Read); return true; } @@ -6758,19 +6744,7 @@ namespace dxvk { this->updateSpecConstants(); if (m_flags.test(DxvkContextFlag::GpDirtyPipelineState)) { - DxvkGlobalPipelineBarrier barrier = { }; - - if (Indexed) { - barrier.stages |= VK_PIPELINE_STAGE_VERTEX_INPUT_BIT; - barrier.access |= VK_ACCESS_INDEX_READ_BIT; - } - - if (Indirect) { - barrier.stages |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT; - barrier.access |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT; - } - - if (unlikely(!this->updateGraphicsPipelineState(barrier))) + if (unlikely(!this->updateGraphicsPipelineState())) return false; } @@ -7031,6 +7005,9 @@ namespace dxvk { if (m_flags.test(DxvkContextFlag::DirtyDrawBuffer)) { 
m_flags.clr(DxvkContextFlag::DirtyDrawBuffer); + m_renderPassBarrierSrc.stages |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT; + m_renderPassBarrierSrc.access |= VK_ACCESS_INDIRECT_COMMAND_READ_BIT; + if (m_state.id.argBuffer.length()) m_cmd->track(m_state.id.argBuffer.buffer(), DxvkAccess::Read); diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index d975545a6..4b836d763 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1434,9 +1434,9 @@ namespace dxvk { DxvkBarrierControlFlags m_barrierControl; DxvkGpuQueryManager m_queryManager; - - DxvkGlobalPipelineBarrier m_globalRoGraphicsBarrier; - DxvkGlobalPipelineBarrier m_globalRwGraphicsBarrier; + + DxvkGlobalPipelineBarrier m_renderPassBarrierSrc = { }; + DxvkGlobalPipelineBarrier m_renderPassBarrierDst = { }; DxvkRenderTargetLayouts m_rtLayouts = { }; @@ -1690,7 +1690,7 @@ namespace dxvk { void unbindGraphicsPipeline(); bool updateGraphicsPipeline(); - bool updateGraphicsPipelineState(DxvkGlobalPipelineBarrier srcBarrier); + bool updateGraphicsPipelineState(); uint32_t getGraphicsPipelineDebugColor() const; diff --git a/src/dxvk/dxvk_graphics.cpp b/src/dxvk/dxvk_graphics.cpp index 7a8ed26a0..9ac5211de 100644 --- a/src/dxvk/dxvk_graphics.cpp +++ b/src/dxvk/dxvk_graphics.cpp @@ -955,8 +955,7 @@ namespace dxvk { if (m_shaders.gs->flags().test(DxvkShaderFlag::HasTransformFeedback)) { m_flags.set(DxvkGraphicsPipelineFlag::HasTransformFeedback); - m_barrier.stages |= VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT - | VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; + m_barrier.stages |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; m_barrier.access |= VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_READ_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT; From 1d8fb818fcf084c91615a9d42644327e88a7842e Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 19 Feb 2025 19:07:43 +0100 Subject: [PATCH 29/44] [dxvk] Check whether pipeline has potentially hazardous stores 
--- src/dxvk/dxvk_graphics.cpp | 6 +++++- src/dxvk/dxvk_graphics.h | 3 ++- src/dxvk/dxvk_pipelayout.cpp | 9 +++++++-- src/dxvk/dxvk_pipelayout.h | 11 +++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) diff --git a/src/dxvk/dxvk_graphics.cpp b/src/dxvk/dxvk_graphics.cpp index 9ac5211de..9c12ae1f5 100644 --- a/src/dxvk/dxvk_graphics.cpp +++ b/src/dxvk/dxvk_graphics.cpp @@ -965,9 +965,13 @@ namespace dxvk { m_flags.set(DxvkGraphicsPipelineFlag::HasRasterizerDiscard); } - if (m_barrier.access & VK_ACCESS_SHADER_WRITE_BIT) + if (m_barrier.access & VK_ACCESS_SHADER_WRITE_BIT) { m_flags.set(DxvkGraphicsPipelineFlag::HasStorageDescriptors); + if (layout->layout().getHazardousSetMask()) + m_flags.set(DxvkGraphicsPipelineFlag::UnrollMergedDraws); + } + if (m_shaders.fs != nullptr) { if (m_shaders.fs->flags().test(DxvkShaderFlag::HasSampleRateShading)) m_flags.set(DxvkGraphicsPipelineFlag::HasSampleRateShading); diff --git a/src/dxvk/dxvk_graphics.h b/src/dxvk/dxvk_graphics.h index 3503fd896..2dde3e162 100644 --- a/src/dxvk/dxvk_graphics.h +++ b/src/dxvk/dxvk_graphics.h @@ -31,6 +31,7 @@ namespace dxvk { HasStorageDescriptors, HasSampleRateShading, HasSampleMaskExport, + UnrollMergedDraws, }; using DxvkGraphicsPipelineFlags = Flags; @@ -660,4 +661,4 @@ namespace dxvk { }; -} \ No newline at end of file +} diff --git a/src/dxvk/dxvk_pipelayout.cpp b/src/dxvk/dxvk_pipelayout.cpp index 9132d025c..cf3f69553 100644 --- a/src/dxvk/dxvk_pipelayout.cpp +++ b/src/dxvk/dxvk_pipelayout.cpp @@ -205,7 +205,7 @@ namespace dxvk { DxvkBindingLayout::DxvkBindingLayout(VkShaderStageFlags stages) - : m_pushConst { 0, 0, 0 }, m_pushConstStages(0), m_stages(stages) { + : m_pushConst { 0, 0, 0 }, m_pushConstStages(0), m_stages(stages), m_hazards(0u) { } @@ -236,6 +236,9 @@ namespace dxvk { void DxvkBindingLayout::addBinding(const DxvkBindingInfo& binding) { uint32_t set = binding.computeSetIndex(); m_bindings[set].addBinding(binding); + + if ((binding.access & 
VK_ACCESS_2_SHADER_WRITE_BIT) && binding.accessOp == DxvkAccessOp::None) + m_hazards |= 1u << set; } @@ -260,6 +263,8 @@ namespace dxvk { addPushConstantRange(layout.m_pushConst); m_pushConstStages |= layout.m_pushConstStages; + + m_hazards |= layout.m_hazards; } @@ -400,4 +405,4 @@ namespace dxvk { return barrier; } -} \ No newline at end of file +} diff --git a/src/dxvk/dxvk_pipelayout.h b/src/dxvk/dxvk_pipelayout.h index 07d3f8e41..4f43b60ad 100644 --- a/src/dxvk/dxvk_pipelayout.h +++ b/src/dxvk/dxvk_pipelayout.h @@ -337,6 +337,16 @@ namespace dxvk { return m_stages; } + /** + * \brief Queries hazardous sets + * + * \returns Mask of sets with storage descriptors + * that are not accessed in an order-invariant way. + */ + uint32_t getHazardousSetMask() const { + return m_hazards; + } + /** * \brief Queries defined descriptor set layouts * @@ -394,6 +404,7 @@ namespace dxvk { VkPushConstantRange m_pushConst; VkShaderStageFlags m_pushConstStages; VkShaderStageFlags m_stages; + uint32_t m_hazards; }; From a135e01f896fdf46a39530b8f2073ea2b5028e31 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 19 Feb 2025 19:09:47 +0100 Subject: [PATCH 30/44] [dxvk] Unroll merged indirect draws as necessary --- src/d3d11/d3d11_context.cpp | 4 +- src/d3d11/d3d11_context_ext.cpp | 4 +- src/dxvk/dxvk_context.cpp | 76 ++++++++++++++++++++++----------- src/dxvk/dxvk_context.h | 19 +++++++-- 4 files changed, 72 insertions(+), 31 deletions(-) diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 97af10d3a..16529a217 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -1121,7 +1121,7 @@ namespace dxvk { } else { cmdData = EmitCsCmd( [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data) { - ctx->drawIndexedIndirect(data->offset, data->count, data->stride); + ctx->drawIndexedIndirect(data->offset, data->count, data->stride, true); }); cmdData->type = D3D11CmdType::DrawIndirectIndexed; @@ -1156,7 +1156,7 @@ namespace dxvk { } else 
{ cmdData = EmitCsCmd( [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data) { - ctx->drawIndirect(data->offset, data->count, data->stride); + ctx->drawIndirect(data->offset, data->count, data->stride, true); }); cmdData->type = D3D11CmdType::DrawIndirect; diff --git a/src/d3d11/d3d11_context_ext.cpp b/src/d3d11/d3d11_context_ext.cpp index 051610167..5254f480f 100644 --- a/src/d3d11/d3d11_context_ext.cpp +++ b/src/d3d11/d3d11_context_ext.cpp @@ -53,7 +53,7 @@ namespace dxvk { cOffset = ByteOffsetForArgs, cStride = ByteStrideForArgs ] (DxvkContext* ctx) { - ctx->drawIndirect(cOffset, cCount, cStride); + ctx->drawIndirect(cOffset, cCount, cStride, false); }); } @@ -72,7 +72,7 @@ namespace dxvk { cOffset = ByteOffsetForArgs, cStride = ByteStrideForArgs ] (DxvkContext* ctx) { - ctx->drawIndexedIndirect(cOffset, cCount, cStride); + ctx->drawIndexedIndirect(cOffset, cCount, cStride, false); }); } diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index f35ea2fbf..3ccf3835c 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -937,18 +937,9 @@ namespace dxvk { void DxvkContext::drawIndirect( VkDeviceSize offset, uint32_t count, - uint32_t stride) { - if (this->commitGraphicsState()) { - auto descriptor = m_state.id.argBuffer.getDescriptor(); - - m_cmd->cmdDrawIndirect( - descriptor.buffer.buffer, - descriptor.buffer.offset + offset, - count, stride); - - if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) - accessDrawBuffer(offset, count, stride, sizeof(VkDrawIndirectCommand)); - } + uint32_t stride, + bool unroll) { + drawIndirectGeneric(offset, count, stride, unroll); } @@ -995,18 +986,9 @@ namespace dxvk { void DxvkContext::drawIndexedIndirect( VkDeviceSize offset, uint32_t count, - uint32_t stride) { - if (this->commitGraphicsState()) { - auto descriptor = m_state.id.argBuffer.getDescriptor(); - - m_cmd->cmdDrawIndexedIndirect( - descriptor.buffer.buffer, - descriptor.buffer.offset + offset, - count, stride); - - if 
(unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) - accessDrawBuffer(offset, count, stride, sizeof(VkDrawIndexedIndirectCommand)); - } + uint32_t stride, + bool unroll) { + drawIndirectGeneric(offset, count, stride, unroll); } @@ -1739,6 +1721,52 @@ namespace dxvk { } + template + void DxvkContext::drawIndirectGeneric( + VkDeviceSize offset, + uint32_t count, + uint32_t stride, + bool unroll) { + if (this->commitGraphicsState()) { + auto descriptor = m_state.id.argBuffer.getDescriptor(); + + if (unroll) { + // Need to do this check after initially setting up the pipeline + unroll = m_state.gp.flags.test(DxvkGraphicsPipelineFlag::UnrollMergedDraws) + && !m_barrierControl.test(DxvkBarrierControl::GraphicsAllowReadWriteOverlap); + } + + // If draws are merged but the pipeline has order-dependent stores, submit + // one draw at a time as well as barriers in between. Otherwise, keep the + // draws merged. + uint32_t step = unroll ? 1u : count; + + for (uint32_t i = 0; i < count; i += step) { + if (unlikely(i)) { + // Insert barrier after the first iteration + this->commitGraphicsState(); + } + + if (Indexed) { + m_cmd->cmdDrawIndexedIndirect(descriptor.buffer.buffer, + descriptor.buffer.offset + offset, step, stride); + } else { + m_cmd->cmdDrawIndirect(descriptor.buffer.buffer, + descriptor.buffer.offset + offset, step, stride); + } + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) { + accessDrawBuffer(offset, step, stride, Indexed + ? 
sizeof(VkDrawIndexedIndirectCommand) + : sizeof(VkDrawIndirectCommand)); + } + + offset += step * stride; + } + } + } + + void DxvkContext::resolveImage( const Rc& dstImage, const Rc& srcImage, diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index 4b836d763..b27af8405 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -763,11 +763,14 @@ namespace dxvk { * \param [in] offset Draw buffer offset * \param [in] count Number of draws * \param [in] stride Stride between dispatch calls + * \param [in] unroll Whether to unroll multiple draws if + * there are any potential data dependencies between them. */ void drawIndirect( VkDeviceSize offset, uint32_t count, - uint32_t stride); + uint32_t stride, + bool unroll); /** * \brief Indirect draw call @@ -809,12 +812,15 @@ namespace dxvk { * \param [in] offset Draw buffer offset * \param [in] count Number of draws * \param [in] stride Stride between dispatch calls + * \param [in] unroll Whether to unroll multiple draws if + * there are any potential data dependencies between them. */ void drawIndexedIndirect( VkDeviceSize offset, uint32_t count, - uint32_t stride); - + uint32_t stride, + bool unroll); + /** * \brief Indirect indexed draw call * @@ -1589,6 +1595,13 @@ namespace dxvk { const Rc& buffer, VkDeviceSize offset); + template + void drawIndirectGeneric( + VkDeviceSize offset, + uint32_t count, + uint32_t stride, + bool unroll); + void resolveImageHw( const Rc& dstImage, const Rc& srcImage, From 0691a7fc46abaea42ca587f292bb260b09913677 Mon Sep 17 00:00:00 2001 From: Philip Rebohle Date: Wed, 19 Feb 2025 19:30:25 +0100 Subject: [PATCH 31/44] [dxvk] De-duplicate drawIndirectCount implementations No functional change, just some code cleanup. 
--- src/dxvk/dxvk_context.cpp | 78 +++++++++++++++++++++------------------ src/dxvk/dxvk_context.h | 7 ++++ 2 files changed, 49 insertions(+), 36 deletions(-) diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index 3ccf3835c..e14720e2f 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -948,23 +948,7 @@ namespace dxvk { VkDeviceSize countOffset, uint32_t maxCount, uint32_t stride) { - if (this->commitGraphicsState()) { - auto argDescriptor = m_state.id.argBuffer.getDescriptor(); - auto cntDescriptor = m_state.id.cntBuffer.getDescriptor(); - - m_cmd->cmdDrawIndirectCount( - argDescriptor.buffer.buffer, - argDescriptor.buffer.offset + offset, - cntDescriptor.buffer.buffer, - cntDescriptor.buffer.offset + countOffset, - maxCount, stride); - - if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) - accessDrawBuffer(offset, maxCount, stride, sizeof(VkDrawIndirectCommand)); - - if (unlikely(m_state.id.cntBuffer.buffer()->hasGfxStores())) - accessDrawCountBuffer(countOffset); - } + drawIndirectCountGeneric(offset, countOffset, maxCount, stride); } @@ -997,26 +981,10 @@ namespace dxvk { VkDeviceSize countOffset, uint32_t maxCount, uint32_t stride) { - if (this->commitGraphicsState()) { - auto argDescriptor = m_state.id.argBuffer.getDescriptor(); - auto cntDescriptor = m_state.id.cntBuffer.getDescriptor(); - - m_cmd->cmdDrawIndexedIndirectCount( - argDescriptor.buffer.buffer, - argDescriptor.buffer.offset + offset, - cntDescriptor.buffer.buffer, - cntDescriptor.buffer.offset + countOffset, - maxCount, stride); - - if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) - accessDrawBuffer(offset, maxCount, stride, sizeof(VkDrawIndexedIndirectCommand)); - - if (unlikely(m_state.id.cntBuffer.buffer()->hasGfxStores())) - accessDrawCountBuffer(countOffset); - } + drawIndirectCountGeneric(offset, countOffset, maxCount, stride); } - - + + void DxvkContext::drawIndirectXfb( VkDeviceSize counterOffset, uint32_t counterDivisor, @@ 
-1767,6 +1735,44 @@ namespace dxvk { } + template + void DxvkContext::drawIndirectCountGeneric( + VkDeviceSize offset, + VkDeviceSize countOffset, + uint32_t maxCount, + uint32_t stride) { + if (this->commitGraphicsState()) { + auto argDescriptor = m_state.id.argBuffer.getDescriptor(); + auto cntDescriptor = m_state.id.cntBuffer.getDescriptor(); + + if (Indexed) { + m_cmd->cmdDrawIndexedIndirectCount( + argDescriptor.buffer.buffer, + argDescriptor.buffer.offset + offset, + cntDescriptor.buffer.buffer, + cntDescriptor.buffer.offset + countOffset, + maxCount, stride); + } else { + m_cmd->cmdDrawIndirectCount( + argDescriptor.buffer.buffer, + argDescriptor.buffer.offset + offset, + cntDescriptor.buffer.buffer, + cntDescriptor.buffer.offset + countOffset, + maxCount, stride); + } + + if (unlikely(m_state.id.argBuffer.buffer()->hasGfxStores())) { + accessDrawBuffer(offset, maxCount, stride, Indexed + ? sizeof(VkDrawIndexedIndirectCommand) + : sizeof(VkDrawIndirectCommand)); + } + + if (unlikely(m_state.id.cntBuffer.buffer()->hasGfxStores())) + accessDrawCountBuffer(countOffset); + } + } + + void DxvkContext::resolveImage( const Rc& dstImage, const Rc& srcImage, diff --git a/src/dxvk/dxvk_context.h b/src/dxvk/dxvk_context.h index b27af8405..921405472 100644 --- a/src/dxvk/dxvk_context.h +++ b/src/dxvk/dxvk_context.h @@ -1602,6 +1602,13 @@ namespace dxvk { uint32_t stride, bool unroll); + template + void drawIndirectCountGeneric( + VkDeviceSize offset, + VkDeviceSize countOffset, + uint32_t maxCount, + uint32_t stride); + void resolveImageHw( const Rc& dstImage, const Rc& srcImage, From f7d56886c53cb1eff36b929dbc568e7da1cb5074 Mon Sep 17 00:00:00 2001 From: Robin Kertels Date: Fri, 21 Feb 2025 12:09:03 +0100 Subject: [PATCH 32/44] [d3d9] Fix sampler slot correction not respecting dmap texture --- src/d3d9/d3d9_device.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index f43f3d74a..a984cee00 
100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -6415,12 +6415,15 @@ namespace dxvk { void D3D9DeviceEx::UpdateTextureTypeMismatchesForTexture(uint32_t stateSampler) { uint32_t shaderTextureIndex; const D3D9CommonShader* shader; - if (unlikely(stateSampler > caps::MaxTexturesPS + 1)) { + if (likely(stateSampler <= caps::MaxTexturesPS)) { + shader = GetCommonShader(m_state.pixelShader); + shaderTextureIndex = stateSampler; + } else if (unlikely(stateSampler >= caps::MaxTexturesPS + 1)) { shader = GetCommonShader(m_state.vertexShader); shaderTextureIndex = stateSampler - caps::MaxTexturesPS - 1; } else { - shader = GetCommonShader(m_state.pixelShader); - shaderTextureIndex = stateSampler; + // Do not type check the fixed function displacement map texture. + return; } if (unlikely(shader == nullptr || shader->GetInfo().majorVersion() < 2 || m_d3d9Options.forceSamplerTypeSpecConstants)) { From 94b48c16335ee55219a0292f994f4d1039172b24 Mon Sep 17 00:00:00 2001 From: Robin Kertels Date: Fri, 21 Feb 2025 12:49:42 +0100 Subject: [PATCH 33/44] [d3d9] Slightly clean up sampler slot handling --- src/d3d9/d3d9_device.cpp | 6 +-- src/d3d9/d3d9_device.h | 58 ++++++++++++++++++++++++++ src/d3d9/d3d9_shader.cpp | 2 +- src/d3d9/d3d9_stateblock.cpp | 4 +- src/d3d9/d3d9_util.h | 80 +++++++++++++++++++++++++++++++++++- src/dxso/dxso_compiler.cpp | 4 +- 6 files changed, 144 insertions(+), 10 deletions(-) diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index a984cee00..f704723db 100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -3387,7 +3387,7 @@ namespace dxvk { BindShader(GetCommonShader(shader)); m_vsShaderMasks = newShader->GetShaderMask(); - UpdateTextureTypeMismatchesForShader(newShader, m_vsShaderMasks.samplerMask, caps::MaxTexturesPS + 1); + UpdateTextureTypeMismatchesForShader(newShader, m_vsShaderMasks.samplerMask, FirstVSSamplerSlot); } else { m_vsShaderMasks = D3D9ShaderMasks(); @@ -6415,10 +6415,10 @@ namespace 
dxvk { void D3D9DeviceEx::UpdateTextureTypeMismatchesForTexture(uint32_t stateSampler) { uint32_t shaderTextureIndex; const D3D9CommonShader* shader; - if (likely(stateSampler <= caps::MaxTexturesPS)) { + if (likely(IsPSSampler(stateSampler))) { shader = GetCommonShader(m_state.pixelShader); shaderTextureIndex = stateSampler; - } else if (unlikely(stateSampler >= caps::MaxTexturesPS + 1)) { + } else if (unlikely(IsVSSampler(stateSampler))) { shader = GetCommonShader(m_state.vertexShader); shaderTextureIndex = stateSampler - caps::MaxTexturesPS - 1; } else { diff --git a/src/d3d9/d3d9_device.h b/src/d3d9/d3d9_device.h index cec35f3d4..3a2a4184d 100644 --- a/src/d3d9/d3d9_device.h +++ b/src/d3d9/d3d9_device.h @@ -653,15 +653,41 @@ namespace dxvk { const D3DDISPLAYMODEEX* pFullscreenDisplayMode, IDirect3DSwapChain9** ppSwapChain); + /** + * @brief Sets the given sampler state + * + * @param StateSampler Sampler index (according to our internal way of storing samplers) + * @param Type Sampler state type to change + * @param Value State value + */ HRESULT SetStateSamplerState( DWORD StateSampler, D3DSAMPLERSTATETYPE Type, DWORD Value); + /** + * @brief Sets the given sampler texture + * + * @param StateSampler Sampler index (according to our internal way of storing samplers) + * @param pTexture Texture to use + */ HRESULT SetStateTexture(DWORD StateSampler, IDirect3DBaseTexture9* pTexture); + /** + * @brief Sets the transform for the given sampler + * + * @param idx Sampler index (according to our internal way of storing samplers) + * @param pMatrix Transform matrix + */ HRESULT SetStateTransform(uint32_t idx, const D3DMATRIX* pMatrix); + /** + * @brief Sets the fixed function texture processing state + * + * @param Stage Sampler index (according to our internal way of storing samplers) + * @param Type Fixed function texture stage type + * @param Value Value for the state + */ HRESULT SetStateTextureStageState( DWORD Stage, D3D9TextureStageStateTypes Type, @@ -818,8 
+844,40 @@ namespace dxvk { void UpdateActiveFetch4(uint32_t stateSampler); + /** + * @brief Sets the mismatching texture type bits for all samplers if necessary. + * + * This function will check all samplers the shader uses and set the mismatching texture type bit for the given sampler if it does not + * match the texture type expected by the respective shader. + * + * It will *not* unset the bit if the texture type does match. + * + * @param stateSampler Sampler index (according to our internal way of storing samplers) + */ + + /** + * @brief Sets the mismatching texture type bits for all samplers if necessary. + * + * This function will check all samplers the shader uses and set the mismatching texture type bit for the given sampler if it does not + * match the texture type expected by the shader. + * + * @param shader The shader + * @param shaderSamplerMask Mask of all samplers that the shader uses (according to our internal way of storing samplers) + * @param shaderSamplerOffset First index of the shader's samplers according to our internal way of storing samplers. + * Used to transform the sampler indices that are relative to the entire pipeline to ones relative to the shader. + */ void UpdateTextureTypeMismatchesForShader(const D3D9CommonShader* shader, uint32_t shaderSamplerMask, uint32_t shaderSamplerOffset); + /** + * @brief Sets the mismatching texture type bit for the given sampler. + * + * This function will set the mismatching texture type bit for the given sampler if it does not + * match the texture type expected by the respective shader. + * + * It will *not* unset the bit if the texture type does match. 
+ * + * @param stateSampler Sampler index (according to our internal way of storing samplers) + */ void UpdateTextureTypeMismatchesForTexture(uint32_t stateSampler); void UploadManagedTexture(D3D9CommonTexture* pResource); diff --git a/src/d3d9/d3d9_shader.cpp b/src/d3d9/d3d9_shader.cpp index 6ab5df3d7..f8807e0cb 100644 --- a/src/d3d9/d3d9_shader.cpp +++ b/src/d3d9/d3d9_shader.cpp @@ -63,7 +63,7 @@ namespace dxvk { // do an or per-draw in the device. // We shift by 17 because 16 ps samplers + 1 dmap (tess) if (ShaderStage == VK_SHADER_STAGE_VERTEX_BIT) - m_usedSamplers <<= caps::MaxTexturesPS + 1; + m_usedSamplers <<= FirstVSSamplerSlot; m_usedRTs = pModule->usedRTs(); diff --git a/src/d3d9/d3d9_stateblock.cpp b/src/d3d9/d3d9_stateblock.cpp index 8b62c1d58..ef8f76bb6 100644 --- a/src/d3d9/d3d9_stateblock.cpp +++ b/src/d3d9/d3d9_stateblock.cpp @@ -436,7 +436,7 @@ namespace dxvk { void D3D9StateBlock::CapturePixelSamplerStates() { m_captures.flags.set(D3D9CapturedStateFlag::SamplerStates); - for (uint32_t i = 0; i < caps::MaxTexturesPS + 1; i++) { + for (uint32_t i = 0; i < FirstVSSamplerSlot; i++) { m_captures.samplers.set(i, true); m_captures.samplerStates[i].set(D3DSAMP_ADDRESSU, true); @@ -519,7 +519,7 @@ namespace dxvk { void D3D9StateBlock::CaptureVertexSamplerStates() { m_captures.flags.set(D3D9CapturedStateFlag::SamplerStates); - for (uint32_t i = caps::MaxTexturesPS + 1; i < SamplerCount; i++) { + for (uint32_t i = FirstVSSamplerSlot; i < SamplerCount; i++) { m_captures.samplers.set(i, true); m_captures.samplerStates[i].set(D3DSAMP_DMAPOFFSET, true); } diff --git a/src/d3d9/d3d9_util.h b/src/d3d9/d3d9_util.h index 76cd31a7c..4e9202282 100644 --- a/src/d3d9/d3d9_util.h +++ b/src/d3d9/d3d9_util.h @@ -44,6 +44,11 @@ namespace dxvk { } } + /** + * @brief Returns whether or not the sampler index is valid + * + * @param Sampler Sampler index (according to the API) + */ inline bool InvalidSampler(DWORD Sampler) { if (Sampler >= caps::MaxTexturesPS && Sampler < 
D3DDMAPSAMPLER) return true; @@ -54,6 +59,19 @@ namespace dxvk { return false; } + /** + * @brief The first sampler that belongs to the vertex shader according to our internal way of storing samplers + */ + constexpr uint32_t FirstVSSamplerSlot = caps::MaxTexturesPS + 1; + + /** + * @brief Remaps a sampler index by the API to an internal one + * + * Remaps the sampler index according to the way the API counts them to how we count and store them internally. + * + * @param Sampler Sampler index (according to API) + * @return DWORD Sampler index (according to our internal way of storing samplers) + */ inline DWORD RemapSamplerState(DWORD Sampler) { if (Sampler >= D3DDMAPSAMPLER) Sampler = caps::MaxTexturesPS + (Sampler - D3DDMAPSAMPLER); @@ -61,13 +79,62 @@ namespace dxvk { return Sampler; } + /** + * @brief Remaps the sampler from an index applying to the entire pipeline to one relative to the shader stage and returns the shader type + * + * The displacement map sampler will be treated as a 17th pixel shader sampler. + * + * @param Sampler Sampler index (according to our internal way of storing samplers) + * @return std::pair Shader stage that it belongs to and the relative sampler index + */ inline std::pair RemapStateSamplerShader(DWORD Sampler) { - if (Sampler >= caps::MaxTexturesPS + 1) - return std::make_pair(DxsoProgramTypes::VertexShader, Sampler - caps::MaxTexturesPS - 1); + if (Sampler >= FirstVSSamplerSlot) + return std::make_pair(DxsoProgramTypes::VertexShader, Sampler - FirstVSSamplerSlot); return std::make_pair(DxsoProgramTypes::PixelShader, Sampler); } + /** + * @brief Returns whether the sampler belongs to the vertex shader. + * + * The displacement map sampler is part of a fixed function feature, + * so it does not belong to the vertex shader. + * Use IsDMAPSampler to check for that. 
+ * + * @param Sampler Sampler index (according to our internal way of storing samplers) + */ + inline bool IsVSSampler(uint32_t Sampler) { + return Sampler >= FirstVSSamplerSlot; + } + + /** + * @brief Returns whether the sampler belongs to the pixel shader. + * + * The displacement map sampler is part of a fixed function feature, + * so (unlike in RemapStateSamplerShader) it does not belong to the pixel shader. + * Use IsDMAPSampler to check for that. + * + * @param Sampler Sampler index (according to our internal way of storing samplers) + */ + inline bool IsPSSampler(uint32_t Sampler) { + return Sampler <= caps::MaxTexturesPS; + } + + /** + * @brief Returns whether the sampler is the displacement map sampler + * + * @param Sampler Sampler index (according to our internal way of storing samplers) + */ + inline bool IsDMAPSampler(uint32_t Sampler) { + return Sampler > caps::MaxTexturesPS; + } + + /** + * @brief Remaps the sampler from an index (counted according to the API) to one relative to the shader stage and returns the shader type + * + * @param Sampler Sampler index (according to the API) + * @return std::pair Shader stage that it belongs to and the relative sampler index + */ inline std::pair RemapSamplerShader(DWORD Sampler) { Sampler = RemapSamplerState(Sampler); @@ -243,6 +310,9 @@ namespace dxvk { uint32_t(offsets[1].y) > extent.height; } + /** + * @brief Mirrors D3DTEXTURESTAGESTATETYPE but starts at 0 + */ enum D3D9TextureStageStateTypes : uint32_t { DXVK_TSS_COLOROP = 0, @@ -272,6 +342,12 @@ namespace dxvk { constexpr uint32_t DXVK_TSS_TCI_CAMERASPACEREFLECTIONVECTOR = 0x00030000; constexpr uint32_t DXVK_TSS_TCI_SPHEREMAP = 0x00040000; + /** + * @brief Remaps a texture stage type by the API to an internal one + * + * @param Type Texture stage type according to the API + * @return D3D9TextureStageStateTypes Texture stage type according to our internal way of storing them + */ inline D3D9TextureStageStateTypes 
RemapTextureStageStateType(D3DTEXTURESTAGESTATETYPE Type) { return D3D9TextureStageStateTypes(Type - 1); } diff --git a/src/dxso/dxso_compiler.cpp b/src/dxso/dxso_compiler.cpp index 54c0f0543..1d7e1bf1d 100644 --- a/src/dxso/dxso_compiler.cpp +++ b/src/dxso/dxso_compiler.cpp @@ -2965,7 +2965,7 @@ void DxsoCompiler::emitControlFlowGenericLoop( uint32_t lOffset = m_module.opAccessChain(m_module.defPointerType(float_t, spv::StorageClassUniform), m_ps.sharedState, 1, &index); lOffset = m_module.opLoad(float_t, lOffset); - + uint32_t zIndex = 2; uint32_t scale = m_module.opCompositeExtract(float_t, result.id, 1, &zIndex); scale = m_module.opFMul(float_t, scale, lScale); @@ -2980,7 +2980,7 @@ void DxsoCompiler::emitControlFlowGenericLoop( auto SampleType = [&](DxsoSamplerType samplerType) { uint32_t bitOffset = m_programInfo.type() == DxsoProgramTypes::VertexShader - ? samplerIdx + caps::MaxTexturesPS + 1 + ? samplerIdx + FirstVSSamplerSlot : samplerIdx; uint32_t isNull = m_spec.get(m_module, m_specUbo, SpecSamplerNull, bitOffset, 1); From 0f3cbd6f667c76c30594144e16b0b9f493134f14 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sat, 22 Feb 2025 18:21:02 +0100 Subject: [PATCH 34/44] [dxvk] Improve low-latency frame pacing for cpu-limited frames --- .../framepacer/dxvk_framepacer_mode_low_latency.h | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index 06fdaf0dd..ec1d4f542 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -87,7 +87,13 @@ namespace dxvk { m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count(); int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync; - int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset; + int32_t gpuDelay = targetGpuSync 
- props.cpuUntilGpuSync; + + int32_t cpuReadyPrediction = duration_cast( + m->start + microseconds(props.csFinished) - now).count(); + int32_t cpuDelay = cpuReadyPrediction - props.csStart; + + int32_t delay = std::max(gpuDelay, cpuDelay) + m_lowLatencyOffset; m_lastStart = sleepFor( now, delay ); @@ -143,6 +149,8 @@ namespace dxvk { props.gpuSync = gpuRun[numLoop-1]; props.cpuUntilGpuSync = offset + duration_cast( m->gpuSubmit[numLoop-1] - m->start ).count(); props.optimizedGpuTime = optimizedGpuTime; + props.csStart = m->csStart; + props.csFinished = m->csFinished; props.isOutlier = isOutlier(frameId); m_propsFinished.store( frameId ); @@ -171,6 +179,8 @@ namespace dxvk { int32_t optimizedGpuTime; // gpu executing packed submits in one go int32_t gpuSync; // us after gpuStart int32_t cpuUntilGpuSync; + int32_t csStart; + int32_t csFinished; bool isOutlier; }; From a13b821f95ce234c47b80c1c616891a3a6438711 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sun, 23 Feb 2025 09:54:07 +0100 Subject: [PATCH 35/44] [dxvk] Allow for fps limit < 50 fps in low-latency mode --- dxvk.conf | 4 ++-- src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h | 3 ++- src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h | 3 ++- src/util/util_fps_limiter.cpp | 3 ++- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/dxvk.conf b/dxvk.conf index 0b8deba8c..d8ae19950 100644 --- a/dxvk.conf +++ b/dxvk.conf @@ -39,7 +39,7 @@ # # Supported values: "max-frame-latency", "low-latency", "min-latency" -# dxvk.framePacing = "" +# dxvk.framePace = "" # Allows fine-tuning the low-latency frame pacing mode. @@ -149,7 +149,7 @@ # The implementation will either use VK_NV_low_latency2 if supported # by the driver, or a custom algorithm. # - False: Disable Reflex support as well as built-in latency reduction. -# This build defaults to False to enable dxvk.framePacing. You need to +# This build defaults to False to enable dxvk.framePace. 
You need to # enable Reflex manually (Auto) until we support switching back and # forth between Reflex and the low-latency frame pacing - for example # via the ingame options - and more critically we want to enable diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index ec1d4f542..28f5f170b 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -164,7 +164,8 @@ namespace dxvk { int32_t frametime = std::chrono::duration_cast( t - m_lastStart ).count(); int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); delay = std::max( delay, frametimeDiff ); - delay = std::max( 0, std::min( delay, 20000 ) ); + int32_t maxDelay = std::max( m_fpsLimitFrametime.load(), 20000 ); + delay = std::max( 0, std::min( delay, maxDelay ) ); Sleep::TimePoint nextStart = t + microseconds(delay); Sleep::sleepUntil( t, nextStart ); diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h index 763a5368c..03e94d0de 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h @@ -28,7 +28,8 @@ namespace dxvk { now - m_lastStart ).count(); int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); int32_t delay = std::max( 0, frametimeDiff ); - delay = std::min( delay, 20000 ); + int32_t maxDelay = std::max( m_fpsLimitFrametime.load(), 20000 ); + delay = std::min( delay, maxDelay ); Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay); Sleep::sleepUntil( now, nextStart ); diff --git a/src/util/util_fps_limiter.cpp b/src/util/util_fps_limiter.cpp index 95fb79e7e..f8866ff16 100644 --- a/src/util/util_fps_limiter.cpp +++ b/src/util/util_fps_limiter.cpp @@ -57,6 +57,8 @@ namespace dxvk { return; } + m_isActive.store(false); + std::unique_lock lock(m_mutex); 
auto interval = m_targetInterval; auto latency = m_maxLatency; @@ -79,7 +81,6 @@ namespace dxvk { // that can be written by setTargetFrameRate lock.unlock(); - m_isActive.store(false); if (t1 < m_nextFrame) { m_isActive.store(true); Sleep::sleepUntil(t1, m_nextFrame); From c802bdf42e993b467f8fa18b2db52da5841164d1 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sun, 23 Feb 2025 10:00:50 +0100 Subject: [PATCH 36/44] [dxvk] Remove dxvk.lowLatencyAllowCpuFramesOverlap config variable --- src/dxvk/dxvk_options.cpp | 2 -- src/dxvk/framepacer/dxvk_framepacer.h | 1 - src/dxvk/framepacer/dxvk_framepacer_mode.h | 4 ---- .../dxvk_framepacer_mode_low_latency.cpp | 19 ------------------- .../dxvk_framepacer_mode_low_latency.h | 11 ++--------- 5 files changed, 2 insertions(+), 35 deletions(-) diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp index 85fc3ec3e..a31778e18 100644 --- a/src/dxvk/dxvk_options.cpp +++ b/src/dxvk/dxvk_options.cpp @@ -20,8 +20,6 @@ namespace dxvk { allowFse = config.getOption ("dxvk.allowFse", false); framePace = config.getOption("dxvk.framePace", ""); lowLatencyOffset = config.getOption ("dxvk.lowLatencyOffset", 0); - lowLatencyAllowCpuFramesOverlap - = config.getOption ("dxvk.lowLatencyAllowCpuFramesOverlap", true); deviceFilter = config.getOption("dxvk.deviceFilter", ""); tilerMode = config.getOption("dxvk.tilerMode", Tristate::Auto); } diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index 264dcff57..0028e1353 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -53,7 +53,6 @@ namespace dxvk { auto now = high_resolution_clock::now(); LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); m->csFinished = std::chrono::duration_cast(now - m->start).count(); - m_mode->signalCsFinished( frameId ); } void notifySubmit() override { diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode.h 
b/src/dxvk/framepacer/dxvk_framepacer_mode.h index 109a240a2..2109bd57b 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h @@ -44,9 +44,6 @@ namespace dxvk { void signalGpuStart( uint64_t frameId ) { if (m_mode) m_fenceGpuStart.signal(frameId); } - void signalCsFinished( uint64_t frameId ) { - if (m_mode) m_fenceCsFinished.signal(frameId); } - void setTargetFrameRate( double frameRate ) { if (!m_fpsLimitEnvOverride && frameRate > 1.0) m_fpsLimitFrametime.store( 1'000'000/frameRate ); @@ -68,7 +65,6 @@ namespace dxvk { sync::Fence m_fenceGpuStart = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; - sync::Fence m_fenceCsFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) }; }; diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp index 4e39145b4..4af77f051 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp @@ -10,15 +10,6 @@ namespace dxvk { } - bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) { - int32_t o; - if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &o)) - return false; - allowOverlap = (bool) o; - return true; - } - - int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) { int32_t offset = options.lowLatencyOffset; int32_t o; @@ -30,14 +21,4 @@ namespace dxvk { return offset; } - - bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) { - bool allowOverlap = options.lowLatencyAllowCpuFramesOverlap; - bool o; - if (getLowLatencyAllowCpuFramesOverlapFromEnv(o)) - allowOverlap = o; - return allowOverlap; - } - - } diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index 28f5f170b..2d17df1c7 100644 
--- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -44,20 +44,16 @@ namespace dxvk { LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options) : FramePacerMode(mode, storage), - m_lowLatencyOffset(getLowLatencyOffset(options)), - m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) { + m_lowLatencyOffset(getLowLatencyOffset(options)) { Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) ); - Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) ); } ~LowLatencyMode() {} void startFrame( uint64_t frameId ) override { - using std::chrono::duration_cast; - if (!m_allowCpuFramesOverlap) - m_fenceCsFinished.wait( frameId-1 ); + using std::chrono::duration_cast; m_fenceGpuStart.wait( frameId-1 ); @@ -249,10 +245,7 @@ namespace dxvk { int32_t getLowLatencyOffset( const DxvkOptions& options ); - bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ); - const int32_t m_lowLatencyOffset; - const bool m_allowCpuFramesOverlap; Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; std::array m_props; From 869cf25f7f2b23467c7bb31f22112684142f8e6f Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sun, 23 Feb 2025 15:09:59 +0100 Subject: [PATCH 37/44] [dxvk] Optimize GPU start measurement for low-latency frame pacing In practice, this change affects oversubscribed threading situations where waking up the "dxvk-queue" thread potentially can cause delays in the 100s of microseconds. For a lot of situations this change isn't affecting measurements in a meaningful way. Possibly affects AMD where vkQueueSubmit execution time is non-zero. 
--- src/dxvk/framepacer/dxvk_framepacer.cpp | 4 +++- src/dxvk/framepacer/dxvk_framepacer.h | 13 ++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp index 63803f1ba..c520d854d 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -54,8 +54,10 @@ namespace dxvk { } // be consistent that every frame has a gpuReady event from the previous frame - LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1); + uint64_t firstFrameId = DXGI_MAX_SWAP_CHAIN_BUFFERS+1; + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(firstFrameId); m->gpuReady.push_back(high_resolution_clock::now()); + m_gpuStarts[ firstFrameId % m_gpuStarts.size() ] = gpuReadyBit; } diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index 0028e1353..f3e047195 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -34,13 +34,13 @@ namespace dxvk { // potentially wait some more if the cpu gets too much ahead m_mode->startFrame(frameId); m_latencyMarkersStorage.registerFrameStart(frameId); - m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0); } void notifyGpuPresentEnd( uint64_t frameId ) override { // the frame has been displayed to the screen m_latencyMarkersStorage.registerFrameEnd(frameId); m_mode->endFrame(frameId); + m_gpuStarts[ (frameId-1) % m_gpuStarts.size() ].store(0); } void notifyCsRenderBegin( uint64_t frameId ) override { @@ -95,12 +95,6 @@ namespace dxvk { } } - void notifyGpuExecutionBegin( uint64_t frameId ) override { - assert( frameId == m_lastFinishedFrameId+1 ); - LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); - gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now()); - } - void notifyGpuExecutionEnd( uint64_t frameId ) override { auto now = high_resolution_clock::now(); 
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); @@ -120,7 +114,7 @@ namespace dxvk { next->gpuReady.clear(); next->gpuReady.push_back(now); - gpuExecutionCheckGpuStart(frameId, m, now); + gpuExecutionCheckGpuStart(frameId+1, next, now); m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId); m_mode->finishRender(frameId); @@ -149,6 +143,7 @@ namespace dxvk { void notifyCpuPresentBegin( uint64_t frameId) override { } void notifyCpuPresentEnd( uint64_t frameId ) override { } void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { } + void notifyGpuExecutionBegin( uint64_t frameId ) override { } void discardTimings() override { } DxvkLatencyStats getStatistics( uint64_t frameId ) override { return DxvkLatencyStats(); } @@ -181,7 +176,7 @@ namespace dxvk { uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; - std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { }; + std::array< std::atomic< uint16_t >, 8 > m_gpuStarts = { }; static constexpr uint16_t queueSubmitBit = 1; static constexpr uint16_t gpuReadyBit = 2; From 3720d3e0e8a0aaafed9ebeb5381dca1ee95a1361 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sun, 23 Feb 2025 15:25:34 +0100 Subject: [PATCH 38/44] [dxvk] Removed redundant frameId tracking in frame pacer --- src/dxvk/dxvk_latency.h | 3 ++- src/dxvk/dxvk_queue.cpp | 2 +- src/dxvk/framepacer/dxvk_framepacer.h | 14 +++----------- 3 files changed, 6 insertions(+), 13 deletions(-) diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h index f4e74a7ce..f886e065c 100644 --- a/src/dxvk/dxvk_latency.h +++ b/src/dxvk/dxvk_latency.h @@ -128,7 +128,8 @@ namespace dxvk { virtual void notifyCpuPresentEnd( uint64_t frameId) = 0; - virtual void notifySubmit() { } + virtual void notifySubmit( + uint64_t frameId) { } virtual void notifyPresent( uint64_t frameId) { } diff --git 
a/src/dxvk/dxvk_queue.cpp b/src/dxvk/dxvk_queue.cpp index 0c74428a0..0ddf05c8f 100644 --- a/src/dxvk/dxvk_queue.cpp +++ b/src/dxvk/dxvk_queue.cpp @@ -48,7 +48,7 @@ namespace dxvk { DxvkLatencyInfo latencyInfo, DxvkSubmitStatus* status) { if (latencyInfo.tracker) - latencyInfo.tracker->notifySubmit(); + latencyInfo.tracker->notifySubmit(latencyInfo.frameId); std::unique_lock lock(m_mutex); m_finishCond.wait(lock, [this] { diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index f3e047195..1a8cf63f3 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -55,8 +55,8 @@ namespace dxvk { m->csFinished = std::chrono::duration_cast(now - m->start).count(); } - void notifySubmit() override { - LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1); + void notifySubmit( uint64_t frameId ) override { + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); m->gpuSubmit.push_back(high_resolution_clock::now()); } @@ -64,7 +64,6 @@ namespace dxvk { // dx to vk translation is finished if (frameId != 0) { auto now = high_resolution_clock::now(); - m_lastSubmitFrameId = frameId; LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); m->gpuSubmit.push_back(now); @@ -76,7 +75,6 @@ namespace dxvk { } void notifyQueueSubmit( uint64_t frameId ) override { - assert( frameId == m_lastQueueSubmitFrameId + 1 ); auto now = high_resolution_clock::now(); LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); m->gpuQueueSubmit.push_back(now); @@ -86,7 +84,6 @@ namespace dxvk { void notifyQueuePresentBegin( uint64_t frameId ) override { if (frameId != 0) { auto now = high_resolution_clock::now(); - m_lastQueueSubmitFrameId = frameId; LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); 
m->gpuQueueSubmit.push_back(now); @@ -97,14 +94,13 @@ namespace dxvk { void notifyGpuExecutionEnd( uint64_t frameId ) override { auto now = high_resolution_clock::now(); - LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); m->gpuReady.push_back(now); } virtual void notifyGpuPresentBegin( uint64_t frameId ) override { // we get frameId == 0 for repeated presents (SyncInterval) if (frameId != 0) { - m_lastFinishedFrameId = frameId; auto now = high_resolution_clock::now(); LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); @@ -172,10 +168,6 @@ namespace dxvk { std::unique_ptr m_mode; - uint64_t m_lastSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; - uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; - uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; - std::array< std::atomic< uint16_t >, 8 > m_gpuStarts = { }; static constexpr uint16_t queueSubmitBit = 1; static constexpr uint16_t gpuReadyBit = 2; From bee72c27c86baa2fdd231a4b1350cb0c2d73bbf8 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Sun, 23 Feb 2025 23:07:52 +0100 Subject: [PATCH 39/44] [dxvk] Improve robustness of frame pacer initialization In d3d9 there were situations where the first frameId was 22, although in d3d11 it always started at 17. This did cause issues especially when waiting for fences which didn't get signaled for these frameIds. 
--- src/d3d9/d3d9_swapchain.cpp | 2 +- src/dxvk/dxvk_device.cpp | 7 ++++--- src/dxvk/dxvk_device.h | 3 ++- src/dxvk/framepacer/dxvk_framepacer.cpp | 18 +++++++++++++----- src/dxvk/framepacer/dxvk_framepacer.h | 2 +- 5 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/d3d9/d3d9_swapchain.cpp b/src/d3d9/d3d9_swapchain.cpp index 4ab616ed7..b858df2af 100644 --- a/src/d3d9/d3d9_swapchain.cpp +++ b/src/d3d9/d3d9_swapchain.cpp @@ -987,7 +987,7 @@ namespace dxvk { entry->second.presenter = CreatePresenter(m_window, entry->second.frameLatencySignal); if (m_presentParams.hDeviceWindow == m_window && m_latencyTracking) - m_latencyTracker = m_device->createLatencyTracker(entry->second.presenter); + m_latencyTracker = m_device->createLatencyTracker(entry->second.presenter, entry->second.frameId+1); } m_wctx = &entry->second; diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp index cdb59e6c7..b05acf034 100644 --- a/src/dxvk/dxvk_device.cpp +++ b/src/dxvk/dxvk_device.cpp @@ -309,15 +309,16 @@ namespace dxvk { Rc DxvkDevice::createLatencyTracker( - const Rc& presenter) { + const Rc& presenter, + uint64_t firstFrameId ) { if (m_options.latencySleep == Tristate::False) - return new FramePacer(m_options); + return new FramePacer(m_options, firstFrameId); if (m_options.latencySleep == Tristate::Auto) { if (m_features.nvLowLatency2) return new DxvkReflexLatencyTrackerNv(presenter); else - return new FramePacer(m_options); + return new FramePacer(m_options, firstFrameId); } return new DxvkBuiltInLatencyTracker(presenter, diff --git a/src/dxvk/dxvk_device.h b/src/dxvk/dxvk_device.h index 2ec517c2e..859b30297 100644 --- a/src/dxvk/dxvk_device.h +++ b/src/dxvk/dxvk_device.h @@ -489,7 +489,8 @@ namespace dxvk { * \param [in] presenter Presenter instance */ Rc createLatencyTracker( - const Rc& presenter); + const Rc& presenter, + uint64_t firstFrameId = 17); /** * \brief Presents a swap chain image diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp 
b/src/dxvk/framepacer/dxvk_framepacer.cpp index c520d854d..a71b2d31d 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -8,7 +8,7 @@ namespace dxvk { - FramePacer::FramePacer( const DxvkOptions& options ) { + FramePacer::FramePacer( const DxvkOptions& options, uint64_t firstFrameId ) { // we'll default to LOW_LATENCY in the draft-PR for now, for demonstration purposes, // highlighting the generally much better input lag and medium-term time consistency. // although MAX_FRAME_LATENCY has advantages in many games and is likely the better default, @@ -53,11 +53,19 @@ namespace dxvk { gpuStart.store(0); } - // be consistent that every frame has a gpuReady event from the previous frame - uint64_t firstFrameId = DXGI_MAX_SWAP_CHAIN_BUFFERS+1; - LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(firstFrameId); - m->gpuReady.push_back(high_resolution_clock::now()); + // be consistent that every frame has a gpuReady event from finishing the previous frame + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers( firstFrameId ); + m->gpuReady.push_back( high_resolution_clock::now() ); m_gpuStarts[ firstFrameId % m_gpuStarts.size() ] = gpuReadyBit; + + LatencyMarkersTimeline& timeline = m_latencyMarkersStorage.m_timeline; + timeline.cpuFinished.store ( firstFrameId-1 ); + timeline.gpuStart.store ( firstFrameId-1 ); + timeline.gpuFinished.store ( firstFrameId-1 ); + timeline.frameFinished.store ( firstFrameId-1 ); + + m_mode->signalGpuStart ( firstFrameId-1 ); + m_mode->signalRenderFinished ( firstFrameId-1 ); } diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index 1a8cf63f3..38d740d1a 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -23,7 +23,7 @@ namespace dxvk { using microseconds = std::chrono::microseconds; public: - FramePacer( const DxvkOptions& options ); + FramePacer( const DxvkOptions& options, uint64_t firstFrameId ); 
~FramePacer(); void sleepAndBeginFrame( From 0d018451fd5027140e8a9b86cf9cc0329c8a158e Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Fri, 28 Feb 2025 08:05:46 +0100 Subject: [PATCH 40/44] [dxvk] Optimize flush heuristic for low-latency Possibly can be optimized more, but just changing these numbers already had a huge effect, especially for games having a small number of submissions to begin with. --- src/dxvk/framepacer/dxvk_framepacer.cpp | 5 +++++ src/dxvk/framepacer/dxvk_framepacer.h | 2 +- src/util/util_flush.cpp | 16 ++++++++-------- src/util/util_flush.h | 5 +++++ 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp index a71b2d31d..944476817 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -2,6 +2,7 @@ #include "dxvk_framepacer_mode_low_latency.h" #include "dxvk_framepacer_mode_min_latency.h" #include "dxvk_options.h" +#include "../../util/util_flush.h" #include "../../util/util_env.h" #include "../../util/log/log.h" @@ -40,11 +41,15 @@ namespace dxvk { case FramePacerMode::LOW_LATENCY: Logger::info( "Frame pace: low-latency" ); + GpuFlushTracker::m_minPendingSubmissions = 1; + GpuFlushTracker::m_minChunkCount = 1; m_mode = std::make_unique(mode, &m_latencyMarkersStorage, options); break; case FramePacerMode::MIN_LATENCY: Logger::info( "Frame pace: min-latency" ); + GpuFlushTracker::m_minPendingSubmissions = 1; + GpuFlushTracker::m_minChunkCount = 1; m_mode = std::make_unique(mode, &m_latencyMarkersStorage); break; } diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index 38d740d1a..2ae4b76e8 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -136,7 +136,7 @@ namespace dxvk { // not implemented methods - void notifyCpuPresentBegin( uint64_t frameId) override { } + void 
notifyCpuPresentBegin( uint64_t frameId ) override { } void notifyCpuPresentEnd( uint64_t frameId ) override { } void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { } void notifyGpuExecutionBegin( uint64_t frameId ) override { } diff --git a/src/util/util_flush.cpp b/src/util/util_flush.cpp index 17d266901..4a2cd854b 100644 --- a/src/util/util_flush.cpp +++ b/src/util/util_flush.cpp @@ -2,6 +2,10 @@ namespace dxvk { + std::atomic GpuFlushTracker::m_minPendingSubmissions = { 2 }; + std::atomic GpuFlushTracker::m_minChunkCount = { 3 }; + std::atomic GpuFlushTracker::m_maxChunkCount = { 20 }; + GpuFlushTracker::GpuFlushTracker(GpuFlushType maxType) : m_maxType(maxType) { @@ -11,10 +15,6 @@ namespace dxvk { GpuFlushType flushType, uint64_t chunkId, uint32_t lastCompleteSubmissionId) { - constexpr uint32_t minPendingSubmissions = 2; - - constexpr uint32_t minChunkCount = 3u; - constexpr uint32_t maxChunkCount = 20u; // Do not flush if there is nothing to flush uint32_t chunkCount = uint32_t(chunkId - m_lastFlushChunkId); @@ -42,14 +42,14 @@ namespace dxvk { case GpuFlushType::ImplicitStrongHint: { // Flush aggressively with a strong hint to reduce readback latency. - return chunkCount >= minChunkCount; + return chunkCount >= m_minChunkCount; } case GpuFlushType::ImplicitMediumHint: case GpuFlushType::ImplicitWeakHint: { // Aim for a higher number of chunks per submission with // a weak hint in order to avoid submitting too often. - if (chunkCount < 2 * minChunkCount) + if (chunkCount < 2 * m_minChunkCount) return false; // Actual heuristic is shared with synchronization commands @@ -60,13 +60,13 @@ namespace dxvk { // required if the application is spinning on a query or resource. 
uint32_t pendingSubmissions = uint32_t(m_lastFlushSubmissionId - lastCompleteSubmissionId); - if (pendingSubmissions < minPendingSubmissions) + if (pendingSubmissions < m_minPendingSubmissions) return true; // Use the number of pending submissions to decide whether to flush. Other // than ignoring the minimum chunk count condition, we should treat this // the same as weak hints to avoid unnecessary synchronization. - uint32_t threshold = std::min(maxChunkCount, pendingSubmissions * minChunkCount); + uint32_t threshold = std::min(m_maxChunkCount.load(), pendingSubmissions * m_minChunkCount.load()); return chunkCount >= threshold; } } diff --git a/src/util/util_flush.h b/src/util/util_flush.h index 5d593649d..8673f404f 100644 --- a/src/util/util_flush.h +++ b/src/util/util_flush.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace dxvk { @@ -64,6 +65,10 @@ namespace dxvk { uint64_t chunkId, uint64_t submissionId); + static std::atomic m_minPendingSubmissions; + static std::atomic m_minChunkCount; + static std::atomic m_maxChunkCount; + private: GpuFlushType m_maxType = GpuFlushType::ImplicitWeakHint; From 91c6793b559becd442b86ca4d5be07b67745cf5b Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Fri, 28 Feb 2025 08:37:11 +0100 Subject: [PATCH 41/44] [dxvk] Disallow flush for beginLatencyTracking's emitCs Not sure if this does anything, but better be safe to correctly track when the first succeeding Cs will get executed. 
--- src/d3d11/d3d11_context.h | 5 +++-- src/d3d11/d3d11_swapchain.cpp | 2 +- src/d3d9/d3d9_device.cpp | 2 +- src/d3d9/d3d9_device.h | 5 +++-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/d3d11/d3d11_context.h b/src/d3d11/d3d11_context.h index a2d6c3659..76b4d7811 100644 --- a/src/d3d11/d3d11_context.h +++ b/src/d3d11/d3d11_context.h @@ -1084,7 +1084,7 @@ namespace dxvk { UINT SampleMask); template - void EmitCs(Cmd&& command) { + void EmitCs(Cmd&& command, bool disableFlush=false ) { m_cmdData = nullptr; if (unlikely(!m_csChunk->push(command))) { @@ -1092,7 +1092,8 @@ namespace dxvk { m_csChunk = AllocCsChunk(); if constexpr (AllowFlush) - GetTypedContext()->ConsiderFlush(GpuFlushType::ImplicitWeakHint); + if (!disableFlush) + GetTypedContext()->ConsiderFlush(GpuFlushType::ImplicitWeakHint); m_csChunk->push(command); } diff --git a/src/d3d11/d3d11_swapchain.cpp b/src/d3d11/d3d11_swapchain.cpp index e2100eb8d..3eb636309 100644 --- a/src/d3d11/d3d11_swapchain.cpp +++ b/src/d3d11/d3d11_swapchain.cpp @@ -472,7 +472,7 @@ namespace dxvk { cFrameId = m_frameId ] (DxvkContext* ctx) { ctx->beginLatencyTracking(cLatency, cFrameId + 1u); - }); + }, true); } } diff --git a/src/d3d9/d3d9_device.cpp b/src/d3d9/d3d9_device.cpp index f704723db..8d4b86826 100644 --- a/src/d3d9/d3d9_device.cpp +++ b/src/d3d9/d3d9_device.cpp @@ -6129,7 +6129,7 @@ namespace dxvk { ] (DxvkContext* ctx) { if (cTracker && cTracker->needsAutoMarkers()) ctx->beginLatencyTracking(cTracker, cFrameId); - }); + }, true); } diff --git a/src/d3d9/d3d9_device.h b/src/d3d9/d3d9_device.h index 3a2a4184d..88a60a4b7 100644 --- a/src/d3d9/d3d9_device.h +++ b/src/d3d9/d3d9_device.h @@ -1180,13 +1180,14 @@ namespace dxvk { private: template - void EmitCs(Cmd&& command) { + void EmitCs(Cmd&& command, bool disableFlush=false) { if (unlikely(!m_csChunk->push(command))) { EmitCsChunk(std::move(m_csChunk)); m_csChunk = AllocCsChunk(); if constexpr (AllowFlush) - 
ConsiderFlush(GpuFlushType::ImplicitWeakHint); + if (!disableFlush) + ConsiderFlush(GpuFlushType::ImplicitWeakHint); m_csChunk->push(command); } From 988dcdc08c241d874fb1f91ec3c0767f1017d09d Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Fri, 28 Feb 2025 08:39:40 +0100 Subject: [PATCH 42/44] [dxvk] Improve low-latency frame pacing when gpuStart gets signaled earlier than expected --- .../dxvk_framepacer_mode_low_latency.h | 32 ++++--------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index 2d17df1c7..9193f438e 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -79,8 +79,10 @@ namespace dxvk { // and calculate backwards when we want to start this frame const SyncProps props = getSyncPrediction(); - int32_t gpuReadyPrediction = duration_cast( - m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count(); + int32_t lastFrameStart = duration_cast( m->start - now ).count(); + int32_t gpuReadyPrediction = lastFrameStart + + std::max( props.cpuUntilGpuStart, m->gpuStart ) + + props.optimizedGpuTime; int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync; int32_t gpuDelay = targetGpuSync - props.cpuUntilGpuSync; @@ -144,6 +146,7 @@ namespace dxvk { SyncProps& props = m_props[frameId % m_props.size()]; props.gpuSync = gpuRun[numLoop-1]; props.cpuUntilGpuSync = offset + duration_cast( m->gpuSubmit[numLoop-1] - m->start ).count(); + props.cpuUntilGpuStart = props.cpuUntilGpuSync - props.gpuSync; props.optimizedGpuTime = optimizedGpuTime; props.csStart = m->csStart; props.csFinished = m->csFinished; @@ -176,6 +179,7 @@ namespace dxvk { int32_t optimizedGpuTime; // gpu executing packed submits in one go int32_t gpuSync; // us after gpuStart int32_t cpuUntilGpuSync; + int32_t 
cpuUntilGpuStart; int32_t csStart; int32_t csFinished; bool isOutlier; @@ -203,30 +207,6 @@ namespace dxvk { }; - int32_t getGpuStartToFinishPrediction() { - uint64_t id = m_propsFinished; - if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) - return 0; - - for (size_t i=0; i<7; ++i) { - const SyncProps& props = m_props[ (id-i) % m_props.size() ]; - if (!props.isOutlier) { - const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i); - if (m->gpuReady.empty() || m->gpuSubmit.empty()) - return m->gpuFinished - m->gpuStart; - - time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] ); - return std::chrono::duration_cast( t - m->start ).count() - + props.optimizedGpuTime - - m->gpuStart; - } - } - - const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id); - return m->gpuFinished - m->gpuStart; - }; - - bool isOutlier( uint64_t frameId ) { constexpr size_t numLoop = 7; int32_t totalCpuTime = 0; From fd68a08572d2be961294c323e6487720eb191ab2 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Fri, 28 Feb 2025 08:48:36 +0100 Subject: [PATCH 43/44] [dxvk] Improve handling of cpu-frametime related stutters for low-latency frame pacing Stutters less this way because we increase the sensitivity to mark frames as outliers, so that they don't get used for predicting the next frame. The actual "optimal" threshold is still to be fine-tuned, but this one worked really well. 
--- .../dxvk_framepacer_mode_low_latency.h | 23 ++++++++++--------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index 9193f438e..24d8ab60e 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -116,9 +116,7 @@ namespace dxvk { // where gpuSubmit[i] <= gpuRun[i] for all i std::vector& gpuRun = m_tempGpuRun; - std::vector& gpuRunDurations = m_tempGpuRunDurations; gpuRun.clear(); - gpuRunDurations.clear(); int32_t optimizedGpuTime = 0; gpuRun.push_back(optimizedGpuTime); @@ -127,7 +125,6 @@ namespace dxvk { int32_t duration = duration_cast( m->gpuReady[i+1] - _gpuRun ).count(); optimizedGpuTime += duration; gpuRun.push_back(optimizedGpuTime); - gpuRunDurations.push_back(duration); } int32_t alignment = duration_cast( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count() @@ -177,7 +174,7 @@ namespace dxvk { struct SyncProps { int32_t optimizedGpuTime; // gpu executing packed submits in one go - int32_t gpuSync; // us after gpuStart + int32_t gpuSync; // gpuStart to this sync point, in microseconds int32_t cpuUntilGpuSync; int32_t cpuUntilGpuStart; int32_t csStart; @@ -187,9 +184,14 @@ namespace dxvk { SyncProps getSyncPrediction() { - // in the future we might use more samples to get a prediction - // however, simple averaging gives a slightly artificial mouse input - // more advanced methods will be investigated + // In the future we might use more samples to get a prediction. + // Possibly this will be optional, as until now, basing it on + // just the previous frame gave us the best mouse input feel. + // Simple averaging or median filtering is surely not the way + // to go, but more advanced methods will be investigated. 
+ // The best place to filter should be on the Present() timeline, + // so not sure if we really will do any filtering here other + // than outlier removal, which will dampen stuttering effects. SyncProps res = {}; uint64_t id = m_propsFinished; if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) @@ -210,14 +212,14 @@ namespace dxvk { bool isOutlier( uint64_t frameId ) { constexpr size_t numLoop = 7; int32_t totalCpuTime = 0; - for (size_t i=0; i<numLoop; ++i) { + for (size_t i=1; i<numLoop; ++i) { const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i); totalCpuTime += m->cpuFinished; } - int32_t avgCpuTime = totalCpuTime / numLoop; + int32_t avgCpuTime = totalCpuTime / (numLoop-1); const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId); - if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) ) + if (m->cpuFinished > 1.3*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) ) return true; return false; @@ -232,7 +234,6 @@ namespace dxvk { std::atomic m_propsFinished = { 0 }; std::vector m_tempGpuRun; - std::vector m_tempGpuRunDurations; }; From 775b3dfe715859bf0031ad9531938ebaf5670312 Mon Sep 17 00:00:00 2001 From: netborg <137700136+netborg-afps@users.noreply.github.com> Date: Fri, 28 Feb 2025 11:23:02 +0100 Subject: [PATCH 44/44] Revert "[dxvk] Remove dxvk.lowLatencyAllowCpuFramesOverlap config variable" This reverts commit c802bdf42e993b467f8fa18b2db52da5841164d1 and makes small adjustments. Until we have a proper synchronization in place between emitting Cs triggered by the app thread, and fetching them from the queue, to measure the CsThread-caused delay, this config option is still useful for running some rare CsThread-limited games.
--- src/dxvk/dxvk_options.cpp | 2 ++ src/dxvk/framepacer/dxvk_framepacer.cpp | 1 + src/dxvk/framepacer/dxvk_framepacer.h | 1 + src/dxvk/framepacer/dxvk_framepacer_mode.h | 4 ++++ .../dxvk_framepacer_mode_low_latency.cpp | 19 +++++++++++++++++++ .../dxvk_framepacer_mode_low_latency.h | 11 +++++++++-- 6 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp index a31778e18..85fc3ec3e 100644 --- a/src/dxvk/dxvk_options.cpp +++ b/src/dxvk/dxvk_options.cpp @@ -20,6 +20,8 @@ namespace dxvk { allowFse = config.getOption ("dxvk.allowFse", false); framePace = config.getOption("dxvk.framePace", ""); lowLatencyOffset = config.getOption ("dxvk.lowLatencyOffset", 0); + lowLatencyAllowCpuFramesOverlap + = config.getOption ("dxvk.lowLatencyAllowCpuFramesOverlap", true); deviceFilter = config.getOption("dxvk.deviceFilter", ""); tilerMode = config.getOption("dxvk.tilerMode", Tristate::Auto); } diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp index 944476817..3ff8791a9 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -71,6 +71,7 @@ namespace dxvk { m_mode->signalGpuStart ( firstFrameId-1 ); m_mode->signalRenderFinished ( firstFrameId-1 ); + m_mode->signalCsFinished ( firstFrameId ); } diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h index 2ae4b76e8..0d5befc02 100644 --- a/src/dxvk/framepacer/dxvk_framepacer.h +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -53,6 +53,7 @@ namespace dxvk { auto now = high_resolution_clock::now(); LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); m->csFinished = std::chrono::duration_cast(now - m->start).count(); + m_mode->signalCsFinished( frameId ); } void notifySubmit( uint64_t frameId ) override { diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode.h b/src/dxvk/framepacer/dxvk_framepacer_mode.h index 2109bd57b..98404ee71 100644 --- 
a/src/dxvk/framepacer/dxvk_framepacer_mode.h +++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h @@ -44,6 +44,9 @@ namespace dxvk { void signalGpuStart( uint64_t frameId ) { if (m_mode) m_fenceGpuStart.signal(frameId); } + void signalCsFinished( uint64_t frameId ) { + if (m_mode) m_fenceCsFinished.signal(frameId); } + void setTargetFrameRate( double frameRate ) { if (!m_fpsLimitEnvOverride && frameRate > 1.0) m_fpsLimitFrametime.store( 1'000'000/frameRate ); @@ -65,6 +68,7 @@ namespace dxvk { sync::Fence m_fenceGpuStart = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; + sync::Fence m_fenceCsFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; }; diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp index 4af77f051..4e39145b4 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp @@ -10,6 +10,15 @@ namespace dxvk { } + bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) { + int32_t o; + if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &o)) + return false; + allowOverlap = (bool) o; + return true; + } + + int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) { int32_t offset = options.lowLatencyOffset; int32_t o; @@ -21,4 +30,14 @@ namespace dxvk { return offset; } + + bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) { + bool allowOverlap = options.lowLatencyAllowCpuFramesOverlap; + bool o; + if (getLowLatencyAllowCpuFramesOverlapFromEnv(o)) + allowOverlap = o; + return allowOverlap; + } + + } diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h index 24d8ab60e..59d10909e 100644 --- a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h +++ 
b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -44,17 +44,21 @@ namespace dxvk { LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options) : FramePacerMode(mode, storage), - m_lowLatencyOffset(getLowLatencyOffset(options)) { + m_lowLatencyOffset(getLowLatencyOffset(options)), + m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) { Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) ); + Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) ); } ~LowLatencyMode() {} void startFrame( uint64_t frameId ) override { - using std::chrono::duration_cast; + if (!m_allowCpuFramesOverlap) + m_fenceCsFinished.wait( frameId-1 ); + m_fenceGpuStart.wait( frameId-1 ); time_point now = high_resolution_clock::now(); @@ -227,7 +231,10 @@ namespace dxvk { int32_t getLowLatencyOffset( const DxvkOptions& options ); + bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ); + const int32_t m_lowLatencyOffset; + const bool m_allowCpuFramesOverlap; Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; std::array m_props;