diff --git a/dxvk.conf b/dxvk.conf index 8e8eb0166..482af8cf8 100644 --- a/dxvk.conf +++ b/dxvk.conf @@ -18,6 +18,51 @@ # dxgi.enableHDR = True +# Frame pacing mode managing CPU-GPU synchronization. +# Defaults to "low-latency" in the draft-PR for demonstration purposes. +# +# "max-frame-latency" provides stable latency in the GPU-limit as long as +# GPU render times are stable. Latency generally is higher but offers great +# visual smoothness. +# +# "low-latency" provides lower latency in the GPU-limit and can be fine-tuned +# via dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap. +# +# "min-latency" possibly provides the lowest latency (low-latency can be +# quicker in some situations), and offers less fps in the GPU-limit +# due to stalling the GPU between frames. Generally not recommended, +# but helpful to get insights to fine-tune the low-latency mode and +# possibly is useful for running games in the CPU-limit. +# +# "low/min-latency" also supports its own fps-limiting enabled via common +# variables. +# +# Supported values: "max-frame-latency", "low-latency", "min-latency" + +# dxvk.framePace = "" + + +# Allows fine-tuning the low-latency frame pacing mode. +# Positive values make a frame begin later which might improve responsiveness, +# although only very slightly, but may be relevant for edge cases. +# Negative values make a frame begin earlier which might improve fps. +# Values are given in microseconds. Defaults to 0. +# +# Supported values: -10000 to 10000 + +# dxvk.lowLatencyOffset = 0 + + +# Determines whether a frame is allowed to begin before finishing processing +# the cpu-part of the previous one, when low-latency frame pacing is used. +# Snappiness may be improved when disallowing overlap. On the other hand, this +# might also decrease fps in certain cases. Defaults to True. +# +# Supported values: True, False + +# dxvk.lowLatencyAllowCpuFramesOverlap = True + + # Expose support for dcomp swap chains with a dummy window. 
# # This is not a valid implementation of DirectComposition swapchains, @@ -104,8 +149,13 @@ # The implementation will either use VK_NV_low_latency2 if supported # by the driver, or a custom algorithm. # - False: Disable Reflex support as well as built-in latency reduction. +# This build defaults to False to enable dxvk.framePace. You need to +# enable Reflex manually (Auto) until we support switching back and +# forth between Reflex and the low-latency frame pacing - for example +# via the ingame options - and more critically we want to enable +# low-latency frame pacing if the game doesn't support Reflex. -# dxvk.latencySleep = Auto +# dxvk.latencySleep = False # Tolerance for the latency sleep heuristic, in microseconds. Higher values diff --git a/src/d3d11/d3d11_swapchain.cpp b/src/d3d11/d3d11_swapchain.cpp index 0b533c93e..264f3e639 100644 --- a/src/d3d11/d3d11_swapchain.cpp +++ b/src/d3d11/d3d11_swapchain.cpp @@ -3,6 +3,7 @@ #include "d3d11_swapchain.h" #include "../dxvk/dxvk_latency_builtin.h" +#include "../dxvk/framepacer/dxvk_framepacer.h" #include "../util/util_win32_compat.h" @@ -294,6 +295,9 @@ namespace dxvk { if (m_latencyHud) m_latencyHud->accumulateStats(latencyStats); + if (m_renderLatencyHud) + m_renderLatencyHud->updateLatencyTracker(m_latency); + return hr; } @@ -354,6 +358,10 @@ namespace dxvk { if (m_presenter != nullptr) m_presenter->setFrameRateLimit(m_targetFrameRate, GetActualFrameLatency()); + + FramePacer* framePacer = dynamic_cast(m_latency.ptr()); + if (framePacer != nullptr) + framePacer->setTargetFrameRate(FrameRate); } @@ -599,8 +607,14 @@ namespace dxvk { if (hud) { hud->addItem("api", 1, GetApiName()); - if (m_latency) + if (m_latency) { m_latencyHud = hud->addItem("latency", 4); + FramePacer* framePacer = dynamic_cast(m_latency.ptr()); + if (framePacer) { + int32_t fpsItemPos = hud->getItemPos(); + m_renderLatencyHud = hud->addItem("renderlatency", fpsItemPos+1); + } + } } m_blitter = new DxvkSwapchainBlitter(m_device, 
std::move(hud)); diff --git a/src/d3d11/d3d11_swapchain.h b/src/d3d11/d3d11_swapchain.h index 99f09450c..6a77c7351 100644 --- a/src/d3d11/d3d11_swapchain.h +++ b/src/d3d11/d3d11_swapchain.h @@ -125,7 +125,8 @@ namespace dxvk { dxvk::mutex m_frameStatisticsLock; DXGI_VK_FRAME_STATISTICS m_frameStatistics = { }; - Rc m_latencyHud; + Rc m_latencyHud; + Rc m_renderLatencyHud; Rc GetBackBufferView(); diff --git a/src/d3d9/d3d9_swapchain.cpp b/src/d3d9/d3d9_swapchain.cpp index 8f97079be..81074d7b6 100644 --- a/src/d3d9/d3d9_swapchain.cpp +++ b/src/d3d9/d3d9_swapchain.cpp @@ -5,6 +5,8 @@ #include "d3d9_hud.h" #include "d3d9_window.h" +#include "../dxvk/framepacer/dxvk_framepacer.h" + namespace dxvk { static uint16_t MapGammaControlPoint(float x) { @@ -923,6 +925,9 @@ namespace dxvk { if (m_latencyHud) m_latencyHud->accumulateStats(latencyStats); + if (m_renderLatencyHud) + m_renderLatencyHud->updateLatencyTracker(m_latencyTracker); + // Rotate swap chain buffers so that the back // buffer at index 0 becomes the front buffer. 
for (uint32_t i = 1; i < m_backBuffers.size(); i++) @@ -1060,8 +1065,14 @@ namespace dxvk { if (hud) { m_apiHud = hud->addItem("api", 1, GetApiName()); - if (m_latencyTracking) + if (m_latencyTracking) { m_latencyHud = hud->addItem("latency", 4); + FramePacer* framePacer = dynamic_cast(m_latencyTracker.ptr()); + if (framePacer) { + int32_t fpsItemPos = hud->getItemPos(); + m_renderLatencyHud = hud->addItem("renderlatency", fpsItemPos+1); + } + } hud->addItem("samplers", -1, m_parent); hud->addItem("ffshaders", -1, m_parent); @@ -1112,6 +1123,9 @@ namespace dxvk { } m_wctx->presenter->setFrameRateLimit(frameRate, GetActualFrameLatency()); + FramePacer* framePacer = dynamic_cast(m_latencyTracker.ptr()); + if (framePacer != nullptr) + framePacer->setTargetFrameRate(frameRate); m_targetFrameRate = frameRate; } diff --git a/src/d3d9/d3d9_swapchain.h b/src/d3d9/d3d9_swapchain.h index 6ea0d96cb..d06c388a9 100644 --- a/src/d3d9/d3d9_swapchain.h +++ b/src/d3d9/d3d9_swapchain.h @@ -183,8 +183,9 @@ namespace dxvk { bool m_latencyTracking = false; Rc m_latencyTracker = nullptr; - Rc m_apiHud; - Rc m_latencyHud; + Rc m_apiHud; + Rc m_latencyHud; + Rc m_renderLatencyHud; std::optional m_hdrMetadata; bool m_unlockAdditionalFormats = false; diff --git a/src/dxvk/dxvk_context.cpp b/src/dxvk/dxvk_context.cpp index c05b5c281..03e9b0781 100644 --- a/src/dxvk/dxvk_context.cpp +++ b/src/dxvk/dxvk_context.cpp @@ -110,7 +110,7 @@ namespace dxvk { void DxvkContext::beginLatencyTracking( const Rc& tracker, uint64_t frameId) { - if (tracker && (!m_latencyTracker || m_latencyTracker == tracker)) { + if (tracker && m_latencyTracker != tracker) { tracker->notifyCsRenderBegin(frameId); m_latencyTracker = tracker; diff --git a/src/dxvk/dxvk_device.cpp b/src/dxvk/dxvk_device.cpp index f16603adf..a889285fa 100644 --- a/src/dxvk/dxvk_device.cpp +++ b/src/dxvk/dxvk_device.cpp @@ -2,6 +2,7 @@ #include "dxvk_instance.h" #include "dxvk_latency_builtin.h" #include "dxvk_latency_reflex.h" +#include 
"framepacer/dxvk_framepacer.h" namespace dxvk { @@ -310,13 +311,13 @@ namespace dxvk { Rc DxvkDevice::createLatencyTracker( const Rc& presenter) { if (m_options.latencySleep == Tristate::False) - return nullptr; + return new FramePacer(m_options); if (m_options.latencySleep == Tristate::Auto) { if (m_features.nvLowLatency2) return new DxvkReflexLatencyTrackerNv(presenter); else - return nullptr; + return new FramePacer(m_options); } return new DxvkBuiltInLatencyTracker(presenter, diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h index c9ac93c5d..f4e74a7ce 100644 --- a/src/dxvk/dxvk_latency.h +++ b/src/dxvk/dxvk_latency.h @@ -128,6 +128,10 @@ namespace dxvk { virtual void notifyCpuPresentEnd( uint64_t frameId) = 0; + virtual void notifySubmit() { } + virtual void notifyPresent( + uint64_t frameId) { } + /** * \brief Called when a command list is submitted to the GPU * @@ -174,6 +178,9 @@ namespace dxvk { virtual void notifyGpuExecutionEnd( uint64_t frameId) = 0; + virtual void notifyGpuPresentBegin( + uint64_t frameId) { } + /** * \brief Called when presentation of a given frame finishes on the GPU * diff --git a/src/dxvk/dxvk_options.cpp b/src/dxvk/dxvk_options.cpp index d2d455c33..85fc3ec3e 100644 --- a/src/dxvk/dxvk_options.cpp +++ b/src/dxvk/dxvk_options.cpp @@ -12,12 +12,16 @@ namespace dxvk { useRawSsbo = config.getOption("dxvk.useRawSsbo", Tristate::Auto); hud = config.getOption("dxvk.hud", ""); tearFree = config.getOption("dxvk.tearFree", Tristate::Auto); - latencySleep = config.getOption("dxvk.latencySleep", Tristate::Auto); + latencySleep = config.getOption("dxvk.latencySleep", Tristate::False); latencyTolerance = config.getOption ("dxvk.latencyTolerance", 1000); disableNvLowLatency2 = config.getOption("dxvk.disableNvLowLatency2", Tristate::Auto); hideIntegratedGraphics = config.getOption ("dxvk.hideIntegratedGraphics", false); zeroMappedMemory = config.getOption ("dxvk.zeroMappedMemory", false); allowFse = config.getOption ("dxvk.allowFse", 
false); + framePace = config.getOption("dxvk.framePace", ""); + lowLatencyOffset = config.getOption ("dxvk.lowLatencyOffset", 0); + lowLatencyAllowCpuFramesOverlap + = config.getOption ("dxvk.lowLatencyAllowCpuFramesOverlap", true); deviceFilter = config.getOption("dxvk.deviceFilter", ""); tilerMode = config.getOption("dxvk.tilerMode", Tristate::Auto); } diff --git a/src/dxvk/dxvk_options.h b/src/dxvk/dxvk_options.h index 5351ac68b..fd2977143 100644 --- a/src/dxvk/dxvk_options.h +++ b/src/dxvk/dxvk_options.h @@ -38,7 +38,9 @@ namespace dxvk { Tristate tearFree = Tristate::Auto; /// Enables latency sleep - Tristate latencySleep = Tristate::Auto; + /// Defaults to false in this build to activate the FramePacer, + /// especially for the case when the game doesn't support Reflex + Tristate latencySleep = Tristate::False; /// Latency tolerance, in microseconds int32_t latencyTolerance = 0u; @@ -61,6 +63,18 @@ namespace dxvk { /// Whether to enable tiler optimizations Tristate tilerMode = Tristate::Auto; + /// Frame pacing + std::string framePace; + + /// A value in microseconds to fine-tune the low-latency frame pacing. + /// Positive values make a frame begin later which might improve responsiveness. + /// Negative values make a frame begin earlier which might improve fps. + int32_t lowLatencyOffset = 0; + + /// Determines whether a frame is allowed to begin before finishing processing + /// the cpu-part of the previous one, when low-latency frame pacing is used. 
+ bool lowLatencyAllowCpuFramesOverlap = true; + // Device name std::string deviceFilter; }; diff --git a/src/dxvk/dxvk_presenter.cpp b/src/dxvk/dxvk_presenter.cpp index 0e3c87762..3297d14a0 100644 --- a/src/dxvk/dxvk_presenter.cpp +++ b/src/dxvk/dxvk_presenter.cpp @@ -259,18 +259,11 @@ namespace dxvk { return; if (m_device->features().khrPresentWait.presentWait) { - bool canSignal = false; - - { std::unique_lock lock(m_frameMutex); - - m_lastSignaled = frameId; - canSignal = m_lastCompleted >= frameId; - } - - if (canSignal) - m_signal->signal(frameId); + std::lock_guard lock(m_frameMutex); + m_lastSignaled = frameId; + m_frameCond.notify_one(); } else { - m_fpsLimiter.delay(); + m_fpsLimiter.delay(tracker); m_signal->signal(frameId); if (tracker) @@ -1210,26 +1203,25 @@ namespace dxvk { void Presenter::runFrameThread() { env::setThreadName("dxvk-frame"); - while (true) { - PresenterFrame frame = { }; + std::unique_lock lock(m_frameMutex); + while (true) { // Wait for all GPU work for this frame to complete in order to maintain // ordering guarantees of the frame signal w.r.t. objects being released - { std::unique_lock lock(m_frameMutex); + m_frameCond.wait(lock, [this] { + return !m_frameQueue.empty() && m_frameQueue.front().frameId <= m_lastSignaled; + }); - m_frameCond.wait(lock, [this] { - return !m_frameQueue.empty(); - }); + // Use a frame ID of 0 as an exit condition + PresenterFrame frame = m_frameQueue.front(); - // Use a frame ID of 0 as an exit condition - frame = m_frameQueue.front(); - - if (!frame.frameId) { - m_frameQueue.pop(); - return; - } + if (!frame.frameId) { + m_frameQueue.pop(); + return; } + lock.unlock(); + // If the present operation has succeeded, actually wait for it to complete. // Don't bother with it on MAILBOX / IMMEDIATE modes since doing so would // restrict us to the display refresh rate on some platforms (XWayland). 
@@ -1243,32 +1235,24 @@ namespace dxvk { // Signal latency tracker right away to get more accurate // measurements if the frame rate limiter is enabled. - if (frame.tracker) { + if (frame.tracker) frame.tracker->notifyGpuPresentEnd(frame.frameId); - frame.tracker = nullptr; - } - // Apply FPS limiter here to align it as closely with scanout as we can, + // Apply FPS limiter here to align it as closely with scanout as we can, // and delay signaling the frame latency event to emulate behaviour of a // low refresh rate display as closely as we can. - m_fpsLimiter.delay(); - - // Wake up any thread that may be waiting for the queue to become empty - bool canSignal = false; - - { std::unique_lock lock(m_frameMutex); - - m_frameQueue.pop(); - m_frameDrain.notify_one(); - - m_lastCompleted = frame.frameId; - canSignal = m_lastSignaled >= frame.frameId; - } + m_fpsLimiter.delay(frame.tracker); + frame.tracker = nullptr; // Always signal even on error, since failures here // are transparent to the front-end. 
- if (canSignal) - m_signal->signal(frame.frameId); + m_signal->signal(frame.frameId); + + // Wake up any thread that may be waiting for the queue to become empty + lock.lock(); + + m_frameQueue.pop(); + m_frameDrain.notify_one(); } } diff --git a/src/dxvk/dxvk_presenter.h b/src/dxvk/dxvk_presenter.h index 8e403b244..afbe465c3 100644 --- a/src/dxvk/dxvk_presenter.h +++ b/src/dxvk/dxvk_presenter.h @@ -315,7 +315,6 @@ namespace dxvk { std::queue m_frameQueue; uint64_t m_lastSignaled = 0u; - uint64_t m_lastCompleted = 0u; alignas(CACHE_LINE_SIZE) FpsLimiter m_fpsLimiter; diff --git a/src/dxvk/dxvk_queue.cpp b/src/dxvk/dxvk_queue.cpp index 6d2d153b6..0c74428a0 100644 --- a/src/dxvk/dxvk_queue.cpp +++ b/src/dxvk/dxvk_queue.cpp @@ -1,5 +1,6 @@ #include "dxvk_device.h" #include "dxvk_queue.h" +#include "framepacer/dxvk_framepacer.h" namespace dxvk { @@ -46,6 +47,8 @@ namespace dxvk { DxvkSubmitInfo submitInfo, DxvkLatencyInfo latencyInfo, DxvkSubmitStatus* status) { + if (latencyInfo.tracker) + latencyInfo.tracker->notifySubmit(); std::unique_lock lock(m_mutex); m_finishCond.wait(lock, [this] { @@ -66,6 +69,8 @@ namespace dxvk { DxvkPresentInfo presentInfo, DxvkLatencyInfo latencyInfo, DxvkSubmitStatus* status) { + if (latencyInfo.tracker) + latencyInfo.tracker->notifyPresent(presentInfo.frameId); std::unique_lock lock(m_mutex); DxvkSubmitEntry entry = { }; @@ -274,7 +279,9 @@ namespace dxvk { } else if (entry.present.presenter != nullptr) { // Signal the frame and then immediately destroy the reference. // This is necessary since the front-end may want to explicitly - // destroy the presenter object. + // destroy the presenter object. 
+ if (entry.latency.tracker) + entry.latency.tracker->notifyGpuPresentBegin(entry.present.frameId); entry.present.presenter->signalFrame(entry.present.frameId, entry.latency.tracker); entry.present.presenter = nullptr; } diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp new file mode 100644 index 000000000..63803f1ba --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -0,0 +1,64 @@ +#include "dxvk_framepacer.h" +#include "dxvk_framepacer_mode_low_latency.h" +#include "dxvk_framepacer_mode_min_latency.h" +#include "dxvk_options.h" +#include "../../util/util_env.h" +#include "../../util/log/log.h" + +namespace dxvk { + + + FramePacer::FramePacer( const DxvkOptions& options ) { + // we'll default to LOW_LATENCY in the draft-PR for now, for demonstration purposes, + // highlighting the generally much better input lag and medium-term time consistency. + // although MAX_FRAME_LATENCY has advantages in many games and is likely the better default, + // for its higher fps throughput and less susceptibility to short-term time inconsistencies. + // which mode being smoother depends on the game. 
+ FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY; + + std::string configStr = env::getEnvVar("DXVK_FRAME_PACE"); + + if (configStr.find("max-frame-latency") != std::string::npos) { + mode = FramePacerMode::MAX_FRAME_LATENCY; + } else if (configStr.find("low-latency") != std::string::npos) { + mode = FramePacerMode::LOW_LATENCY; + } else if (configStr.find("min-latency") != std::string::npos) { + mode = FramePacerMode::MIN_LATENCY; + } else if (options.framePace.find("max-frame-latency") != std::string::npos) { + mode = FramePacerMode::MAX_FRAME_LATENCY; + } else if (options.framePace.find("low-latency") != std::string::npos) { + mode = FramePacerMode::LOW_LATENCY; + } else if (options.framePace.find("min-latency") != std::string::npos) { + mode = FramePacerMode::MIN_LATENCY; + } + + switch (mode) { + case FramePacerMode::MAX_FRAME_LATENCY: + Logger::info( "Frame pace: max-frame-latency" ); + m_mode = std::make_unique(FramePacerMode::MAX_FRAME_LATENCY, &m_latencyMarkersStorage); + break; + + case FramePacerMode::LOW_LATENCY: + Logger::info( "Frame pace: low-latency" ); + m_mode = std::make_unique(mode, &m_latencyMarkersStorage, options); + break; + + case FramePacerMode::MIN_LATENCY: + Logger::info( "Frame pace: min-latency" ); + m_mode = std::make_unique(mode, &m_latencyMarkersStorage); + break; + } + + for (auto& gpuStart: m_gpuStarts) { + gpuStart.store(0); + } + + // be consistent that every frame has a gpuReady event from the previous frame + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1); + m->gpuReady.push_back(high_resolution_clock::now()); + } + + + FramePacer::~FramePacer() {} + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h new file mode 100644 index 000000000..264dcff57 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -0,0 +1,191 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" +#include "dxvk_latency_markers.h" +#include "../dxvk_latency.h" 
+#include "../../util/util_time.h" +#include + + +namespace dxvk { + + struct DxvkOptions; + + /* \brief Frame pacer interface managing the CPU - GPU synchronization. + * + * GPUs render frames asynchronously to the game's and dxvk's CPU-side work + * in order to improve fps-throughput. Aligning the cpu work to chosen time- + * points allows to tune certain characteristics of the video presentation, + * like smoothness and latency. + */ + + class FramePacer : public DxvkLatencyTracker { + using microseconds = std::chrono::microseconds; + public: + + FramePacer( const DxvkOptions& options ); + ~FramePacer(); + + void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) override { + // wait for finished rendering of a previous frame, typically the one before last + m_mode->waitRenderFinished(frameId); + // potentially wait some more if the cpu gets too much ahead + m_mode->startFrame(frameId); + m_latencyMarkersStorage.registerFrameStart(frameId); + m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0); + } + + void notifyGpuPresentEnd( uint64_t frameId ) override { + // the frame has been displayed to the screen + m_latencyMarkersStorage.registerFrameEnd(frameId); + m_mode->endFrame(frameId); + } + + void notifyCsRenderBegin( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->csStart = std::chrono::duration_cast(now - m->start).count(); + } + + void notifyCsRenderEnd( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->csFinished = std::chrono::duration_cast(now - m->start).count(); + m_mode->signalCsFinished( frameId ); + } + + void notifySubmit() override { + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1); + m->gpuSubmit.push_back(high_resolution_clock::now()); + } + + void notifyPresent( uint64_t frameId ) override { + // dx to vk 
translation is finished + if (frameId != 0) { + auto now = high_resolution_clock::now(); + m_lastSubmitFrameId = frameId; + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuSubmit.push_back(now); + m->cpuFinished = std::chrono::duration_cast(now - m->start).count(); + next->gpuSubmit.clear(); + + m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId); + } + } + + void notifyQueueSubmit( uint64_t frameId ) override { + assert( frameId == m_lastQueueSubmitFrameId + 1 ); + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->gpuQueueSubmit.push_back(now); + queueSubmitCheckGpuStart(frameId, m, now); + } + + void notifyQueuePresentBegin( uint64_t frameId ) override { + if (frameId != 0) { + auto now = high_resolution_clock::now(); + m_lastQueueSubmitFrameId = frameId; + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuQueueSubmit.push_back(now); + next->gpuQueueSubmit.clear(); + queueSubmitCheckGpuStart(frameId, m, now); + } + } + + void notifyGpuExecutionBegin( uint64_t frameId ) override { + assert( frameId == m_lastFinishedFrameId+1 ); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); + gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now()); + } + + void notifyGpuExecutionEnd( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); + m->gpuReady.push_back(now); + } + + virtual void notifyGpuPresentBegin( uint64_t frameId ) override { + // we get frameId == 0 for repeated presents (SyncInterval) + if (frameId != 0) { + m_lastFinishedFrameId = frameId; + auto now = high_resolution_clock::now(); + + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + 
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuReady.push_back(now); + m->gpuFinished = std::chrono::duration_cast(now - m->start).count(); + next->gpuReady.clear(); + next->gpuReady.push_back(now); + + gpuExecutionCheckGpuStart(frameId, m, now); + + m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId); + m_mode->finishRender(frameId); + m_mode->signalRenderFinished(frameId); + } + } + + FramePacerMode::Mode getMode() const { + return m_mode->m_mode; + } + + void setTargetFrameRate( double frameRate ) { + m_mode->setTargetFrameRate(frameRate); + } + + bool needsAutoMarkers() override { + return true; + } + + LatencyMarkersStorage m_latencyMarkersStorage; + + + // not implemented methods + + + void notifyCpuPresentBegin( uint64_t frameId) override { } + void notifyCpuPresentEnd( uint64_t frameId ) override { } + void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { } + void discardTimings() override { } + DxvkLatencyStats getStatistics( uint64_t frameId ) override + { return DxvkLatencyStats(); } + + private: + + void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + m->gpuStart = std::chrono::duration_cast(t - m->start).count(); + m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId); + m_mode->signalGpuStart(frameId); + } + + void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ]; + uint16_t val = gpuStart.fetch_or(queueSubmitBit); + if (val == gpuReadyBit) + signalGpuStart( frameId, m, t ); + } + + void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ]; + uint16_t val = gpuStart.fetch_or(gpuReadyBit); + if (val == queueSubmitBit) + signalGpuStart( frameId, m, t ); + } + + std::unique_ptr m_mode; + + 
uint64_t m_lastSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + + std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { }; + static constexpr uint16_t queueSubmitBit = 1; + static constexpr uint16_t gpuReadyBit = 2; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode.h b/src/dxvk/framepacer/dxvk_framepacer_mode.h new file mode 100644 index 000000000..109a240a2 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h @@ -0,0 +1,117 @@ +#pragma once + +#include "dxvk_latency_markers.h" +#include "../../util/sync/sync_signal.h" +#include "../../util/util_env.h" +#include + +namespace dxvk { + + /* + * \brief Abstract frame pacer mode in order to support different strategies of synchronization. + */ + + class FramePacerMode { + + public: + + enum Mode { + MAX_FRAME_LATENCY = 0, // value 0, so the "if (m_mode)" guards below are false (fences untouched) in this mode + LOW_LATENCY, + MIN_LATENCY + }; + + FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 ) + : m_mode( mode ), + m_waitLatency( maxFrameLatency+1 ), + m_latencyMarkersStorage( markerStorage ) { + setFpsLimitFrametimeFromEnv(); + } + + virtual ~FramePacerMode() { } + + virtual void startFrame( uint64_t frameId ) { } + virtual void endFrame( uint64_t frameId ) { } + + virtual void finishRender( uint64_t frameId ) { } + + void waitRenderFinished( uint64_t frameId ) { + if (m_mode) m_fenceGpuFinished.wait(frameId-m_waitLatency); } + + void signalRenderFinished( uint64_t frameId ) { + if (m_mode) m_fenceGpuFinished.signal(frameId); } + + void signalGpuStart( uint64_t frameId ) { + if (m_mode) m_fenceGpuStart.signal(frameId); } + + void signalCsFinished( uint64_t frameId ) { + if (m_mode) m_fenceCsFinished.signal(frameId); } + + void setTargetFrameRate( double frameRate ) { + if (!m_fpsLimitEnvOverride && frameRate > 1.0) + m_fpsLimitFrametime.store( 1'000'000/frameRate ); + } + + const Mode m_mode; 
+ + static bool getDoubleFromEnv( const char* name, double* result ); + static bool getIntFromEnv( const char* name, int* result ); + + protected: + + void setFpsLimitFrametimeFromEnv(); + + const uint32_t m_waitLatency; + LatencyMarkersStorage* m_latencyMarkersStorage; + std::atomic m_fpsLimitFrametime = { 0 }; + bool m_fpsLimitEnvOverride = { false }; + + sync::Fence m_fenceGpuStart = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; + sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; + sync::Fence m_fenceCsFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) }; + + }; + + + + inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) { + std::string env = env::getEnvVar(name); + if (env.empty()) + return false; + + try { + *result = std::stod(env); + return true; + } catch (const std::logic_error&) { // covers std::invalid_argument and std::out_of_range + return false; + } + } + + + inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) { + std::string env = env::getEnvVar(name); + if (env.empty()) + return false; + + try { + *result = std::stoi(env); + return true; + } catch (const std::logic_error&) { // covers std::invalid_argument and std::out_of_range + return false; + } + } + + + inline void FramePacerMode::setFpsLimitFrametimeFromEnv() { + double fpsLimit; + if (!getDoubleFromEnv("DXVK_FRAME_RATE", &fpsLimit)) + return; + + m_fpsLimitEnvOverride = true; + if (!(fpsLimit >= 1.0)) // negated form also rejects NaN + return; + + m_fpsLimitFrametime = 1'000'000/fpsLimit; + } + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp new file mode 100644 index 000000000..4e39145b4 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp @@ -0,0 +1,43 @@ +#include "dxvk_framepacer_mode_low_latency.h" + +namespace dxvk { + + + bool getLowLatencyOffsetFromEnv( int32_t& offset ) { + if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset)) + return false; + return true; + } + + + bool 
getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) { + int32_t o; + if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &o)) + return false; + allowOverlap = (bool) o; + return true; + } + + + int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) { + int32_t offset = options.lowLatencyOffset; + int32_t o; + if (getLowLatencyOffsetFromEnv(o)) + offset = o; + + offset = std::max( -10000, offset ); + offset = std::min( 10000, offset ); + return offset; + } + + + bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) { + bool allowOverlap = options.lowLatencyAllowCpuFramesOverlap; + bool o; + if (getLowLatencyAllowCpuFramesOverlapFromEnv(o)) + allowOverlap = o; + return allowOverlap; + } + + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h new file mode 100644 index 000000000..06fdaf0dd --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -0,0 +1,255 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" +#include "../dxvk_options.h" +#include "../../util/log/log.h" +#include "../../util/util_string.h" +#include + +namespace dxvk { + + /* + * This low-latency mode aims to reduce latency with minimal impact in fps. + * Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well. + * + * Greatly reduces input lag variations when switching between CPU- and GPU-limit, and + * compared to the max-frame-latency approach, it has a much more stable input lag when + * GPU running times change dramatically, which can happen for example when rotating within a scene. + * + * The current implementation rather generates fluctuations alternating frame-by-frame + * depending on the game's and dxvk's CPU-time variations. This might be visible as a loss + * in smoothness, which is an area this implementation can be further improved. 
Unsuitable + * smoothing however might degrade input-lag feel, so it's not implemented for now, but + * more advanced smoothing techniques will be investigated in the future. + * In some situations however, this low-latency pacing actually improves smoothing though, + * it will depend on the game. + * + * An interesting observation while playtesting was that not only the input lag was affected, + * but the video generated did progress more cleanly in time as well with regards to + * medium-term time consistency, in other words, the video playback speed remained more steady. + * + * Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter + * which is typically used to prevent the game's fps exceeding the monitor's refresh rate, + * and which is tightly integrated into the pacing logic. + * + * Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap + * variables (or their respective environment variables) + * Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved. 
+ */ + + class LowLatencyMode : public FramePacerMode { + using microseconds = std::chrono::microseconds; + using time_point = high_resolution_clock::time_point; + public: + + LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options) + : FramePacerMode(mode, storage), + m_lowLatencyOffset(getLowLatencyOffset(options)), + m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) { + Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) ); + Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) ); + } + + ~LowLatencyMode() {} + + + void startFrame( uint64_t frameId ) override { + using std::chrono::duration_cast; + + if (!m_allowCpuFramesOverlap) + m_fenceCsFinished.wait( frameId-1 ); + + m_fenceGpuStart.wait( frameId-1 ); + + time_point now = high_resolution_clock::now(); + uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load(); + if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull) + return; + + if (finishedId == frameId-1) { + // we are the only in-flight frame, nothing to do other than to apply fps-limiter if needed + m_lastStart = sleepFor( now, 0 ); + return; + } + + if (finishedId != frameId-2) { + Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=", + frameId-2, ", got: ", finishedId) ); + } + + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1); + + // estimate the target gpu sync point for this frame + // and calculate backwards when we want to start this frame + + const SyncProps props = getSyncPrediction(); + int32_t gpuReadyPrediction = duration_cast( + m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count(); + + int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync; + int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset; + + m_lastStart = sleepFor( now, delay ); + + } + + + void finishRender( uint64_t 
frameId ) override { + + using std::chrono::duration_cast; + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId); + + int32_t numLoop = (int32_t)(m->gpuReady.size())-1; + if (numLoop <= 1) { + m_props[frameId % m_props.size()] = SyncProps(); + m_props[frameId % m_props.size()].isOutlier = true; + m_propsFinished.store( frameId ); + return; + } + + // estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first + // such that the gpu doesn't go into idle for this frame, and then aligning cpu submits + // where gpuSubmit[i] <= gpuRun[i] for all i + + std::vector& gpuRun = m_tempGpuRun; + std::vector& gpuRunDurations = m_tempGpuRunDurations; + gpuRun.clear(); + gpuRunDurations.clear(); + int32_t optimizedGpuTime = 0; + gpuRun.push_back(optimizedGpuTime); + + for (int i=0; igpuReady[i], m->gpuQueueSubmit[i] ); + int32_t duration = duration_cast( m->gpuReady[i+1] - _gpuRun ).count(); + optimizedGpuTime += duration; + gpuRun.push_back(optimizedGpuTime); + gpuRunDurations.push_back(duration); + } + + int32_t alignment = duration_cast( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count() + - gpuRun[numLoop-1]; + + int32_t offset = 0; + for (int i=numLoop-2; i>=0; --i) { + int32_t curSubmit = duration_cast( m->gpuSubmit[i] - m->gpuSubmit[0] ).count(); + int32_t diff = curSubmit - gpuRun[i] - alignment; + diff = std::max( 0, diff ); + offset += diff; + alignment += diff; + } + + + SyncProps& props = m_props[frameId % m_props.size()]; + props.gpuSync = gpuRun[numLoop-1]; + props.cpuUntilGpuSync = offset + duration_cast( m->gpuSubmit[numLoop-1] - m->start ).count(); + props.optimizedGpuTime = optimizedGpuTime; + props.isOutlier = isOutlier(frameId); + + m_propsFinished.store( frameId ); + + } + + + Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) { + + // account for the fps limit and ensure we won't sleep too long, just in case + int32_t frametime = std::chrono::duration_cast( t - m_lastStart ).count(); + int32_t 
frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); + delay = std::max( delay, frametimeDiff ); + delay = std::max( 0, std::min( delay, 20000 ) ); + + Sleep::TimePoint nextStart = t + microseconds(delay); + Sleep::sleepUntil( t, nextStart ); + return nextStart; + + } + + + private: + + struct SyncProps { + int32_t optimizedGpuTime; // gpu executing packed submits in one go + int32_t gpuSync; // us after gpuStart + int32_t cpuUntilGpuSync; + bool isOutlier; + }; + + + SyncProps getSyncPrediction() { + // in the future we might use more samples to get a prediction + // however, simple averaging gives a slightly artificial mouse input + // more advanced methods will be investigated + SyncProps res = {}; + uint64_t id = m_propsFinished; + if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) + return res; + + for (size_t i=0; i<7; ++i) { + const SyncProps& props = m_props[ (id-i) % m_props.size() ]; + if (!props.isOutlier) { + id = id-i; + break; + } + } + + return m_props[ id % m_props.size() ]; + }; + + + int32_t getGpuStartToFinishPrediction() { + uint64_t id = m_propsFinished; + if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) + return 0; + + for (size_t i=0; i<7; ++i) { + const SyncProps& props = m_props[ (id-i) % m_props.size() ]; + if (!props.isOutlier) { + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i); + if (m->gpuReady.empty() || m->gpuSubmit.empty()) + return m->gpuFinished - m->gpuStart; + + time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] ); + return std::chrono::duration_cast( t - m->start ).count() + + props.optimizedGpuTime + - m->gpuStart; + } + } + + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id); + return m->gpuFinished - m->gpuStart; + }; + + + bool isOutlier( uint64_t frameId ) { + constexpr size_t numLoop = 7; + int32_t totalCpuTime = 0; + for (size_t i=0; igetConstMarkers(frameId-i); + totalCpuTime += m->cpuFinished; + } + + int32_t avgCpuTime = totalCpuTime / numLoop; + const LatencyMarkers* 
m = m_latencyMarkersStorage->getConstMarkers(frameId); + if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) ) + return true; + + return false; + } + + + int32_t getLowLatencyOffset( const DxvkOptions& options ); + bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ); + + const int32_t m_lowLatencyOffset; + const bool m_allowCpuFramesOverlap; + + Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; + std::array m_props; + std::atomic m_propsFinished = { 0 }; + + std::vector m_tempGpuRun; + std::vector m_tempGpuRunDurations; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h new file mode 100644 index 000000000..763a5368c --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h @@ -0,0 +1,45 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" + +namespace dxvk { + + /* + * Minimal latency is achieved here by waiting for the previous + * frame to complete, which results in very much reduced fps. + * Generally not recommended, but helpful to get insights to fine-tune + * the low-latency mode, and possibly is useful for running games + * in the cpu limit. 
+ */ + + class MinLatencyMode : public FramePacerMode { + + public: + + MinLatencyMode(Mode mode, LatencyMarkersStorage* storage) + : FramePacerMode(mode, storage, 0) {} // NOTE(review): no wait on the previous frame is visible in this class; presumably the 0 passed to FramePacerMode provides it - confirm + + ~MinLatencyMode() {} + + void startFrame( uint64_t frameId ) override { // applies only the fps limit: sleeps for the time still owed to it, at most 20000 us + + Sleep::TimePoint now = high_resolution_clock::now(); + int32_t frametime = std::chrono::duration_cast<std::chrono::microseconds>( + now - m_lastStart ).count(); + int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); + int32_t delay = std::max( 0, frametimeDiff ); + delay = std::min( delay, 20000 ); + + Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay); + Sleep::sleepUntil( now, nextStart ); + m_lastStart = nextStart; + + } + + private: + + Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_latency_markers.h b/src/dxvk/framepacer/dxvk_latency_markers.h new file mode 100644 index 000000000..7658f0737 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_latency_markers.h @@ -0,0 +1,148 @@ +#pragma once + +#include <array> // NOTE(review): the five header names were lost in extraction; restored from the names this header uses - confirm +#include <atomic> +#include <chrono> +#include <vector> +#include <dxgi.h> +#include "../../util/util_sleep.h" +#include "../../util/log/log.h" +#include "../../util/util_string.h" + + +namespace dxvk { + + class FramePacer; + class LatencyMarkersStorage; + + + struct LatencyMarkers { // per-frame timestamps; registerFrameEnd stores presentFinished as microseconds elapsed since start + + using time_point = high_resolution_clock::time_point; + + time_point start; + time_point end; + + int32_t csStart; + int32_t csFinished; + int32_t cpuFinished; + int32_t gpuStart; + int32_t gpuFinished; + int32_t presentFinished; + + std::vector<time_point> gpuReady; + std::vector<time_point> gpuSubmit; + std::vector<time_point> gpuQueueSubmit; + + }; + + + /* + * stores which information is accessible for which frame + */ + struct LatencyMarkersTimeline { + + std::atomic<uint64_t> cpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> gpuStart = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> gpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + + }; + + + class
LatencyMarkersReader { // forward iterator over the markers of the most recently finished frames + + public: + + LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries ); + bool getNext( const LatencyMarkers*& result ); + + private: + + const LatencyMarkersStorage* m_storage; + uint64_t m_index; + + }; + + + class LatencyMarkersStorage { // ring of m_numMarkers LatencyMarkers slots addressed by frameId % m_numMarkers, plus the timeline counters + friend class LatencyMarkersReader; + friend class FramePacer; + public: + + LatencyMarkersStorage() { } + ~LatencyMarkersStorage() { } + + LatencyMarkersReader getReader( uint32_t numEntries ) const { + return LatencyMarkersReader(this, numEntries); + } + + void registerFrameStart( uint64_t frameId ) { // stamps the frame's start time; warns if frameId is not newer than the last finished frame + if (frameId <= m_timeline.frameFinished.load()) { + Logger::warn( str::format("internal error during registerFrameStart: expected frameId=", + m_timeline.frameFinished.load()+1, ", got: ", frameId) ); + } + auto now = high_resolution_clock::now(); + + LatencyMarkers* markers = getMarkers(frameId); + markers->start = now; + } + + void registerFrameEnd( uint64_t frameId ) { // stamps end and presentFinished, then publishes frameId through frameFinished + if (frameId <= m_timeline.frameFinished.load()) { + Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=", + m_timeline.frameFinished.load()+1, ", got: ", frameId) ); + } + auto now = high_resolution_clock::now(); + + LatencyMarkers* markers = getMarkers(frameId); + markers->presentFinished = std::chrono::duration_cast<std::chrono::microseconds>( + now - markers->start).count(); + markers->end = now; + + m_timeline.frameFinished.store(frameId); + } + + const LatencyMarkersTimeline* getTimeline() const { + return &m_timeline; + } + + const LatencyMarkers* getConstMarkers( uint64_t frameId ) const { + return &m_markers[frameId % m_numMarkers]; + } + + + private: + + LatencyMarkers* getMarkers( uint64_t frameId ) { + return &m_markers[frameId % m_numMarkers]; + } + + // simple modulo hash mapping is used for frameIds. They are expected to monotonically increase by one.
+ // select the size large enough, so we never come into a situation where the reader cannot keep up with the producer + static constexpr uint16_t m_numMarkers = 128; + std::array<LatencyMarkers, m_numMarkers> m_markers = { }; + LatencyMarkersTimeline m_timeline; + + }; + + + + inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries ) + : m_storage(storage) { + const uint64_t finished = m_storage->m_timeline.frameFinished.load(); // one snapshot, so the test and the start index use the same value + m_index = (finished > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2) // index 0 makes getNext() report that there are no entries + ? finished - numEntries : 0; // NOTE(review): getNext() runs through frameFinished inclusive, so up to numEntries+1 frames are returned - confirm that is intended + } + + + inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) { // yields the next frame's markers; false once past the last finished frame + if (m_index == 0 || m_index > m_storage->m_timeline.frameFinished.load()) + return false; + + result = &m_storage->m_markers[m_index % m_storage->m_numMarkers]; + m_index++; + return true; + } + +} diff --git a/src/dxvk/hud/dxvk_hud.h b/src/dxvk/hud/dxvk_hud.h index 58c383f07..388cbf22a 100644 --- a/src/dxvk/hud/dxvk_hud.h +++ b/src/dxvk/hud/dxvk_hud.h @@ -59,6 +59,11 @@ namespace dxvk::hud { Rc<T> addItem(const char* name, int32_t at, Args...
args) { return m_hudItems.add<T>(name, at, std::forward<Args>(args)...); } + + template<typename T> + int32_t getItemPos() { + return m_hudItems.getItemPos<T>(); + } /** * \brief Creates the HUD diff --git a/src/dxvk/hud/dxvk_hud_item.cpp b/src/dxvk/hud/dxvk_hud_item.cpp index 9c2a2f873..5edf2c258 100644 --- a/src/dxvk/hud/dxvk_hud_item.cpp +++ b/src/dxvk/hud/dxvk_hud_item.cpp @@ -1,4 +1,5 @@ #include "dxvk_hud_item.h" +#include "../framepacer/dxvk_framepacer.h" #include #include @@ -213,6 +214,63 @@ namespace dxvk::hud { } + HudRenderLatencyItem::HudRenderLatencyItem() { } + HudRenderLatencyItem::~HudRenderLatencyItem() { } + + void HudRenderLatencyItem::update(dxvk::high_resolution_clock::time_point time) { // refreshes the text at most once per UpdateInterval from the mean presentFinished of the frames the reader yields + // we cannot measure latency when fps-limiting is performed in Presenter::runFrameThread() + // because it's interfering with getting the right timestamp from vkWaitForPresent() + // if we truly wanted to measure it, we would need one additional thread + if (FpsLimiter::m_isActive) { + m_latency = "N/A"; + return; + } + + const Rc<DxvkLatencyTracker> tracker = m_tracker; + const FramePacer* framePacer = dynamic_cast<const FramePacer*>( tracker.ptr() ); + if (!framePacer) + return; + + auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(time - m_lastUpdate); + + if (elapsed.count() >= UpdateInterval) { + m_lastUpdate = time; + + LatencyMarkersReader reader = framePacer->m_latencyMarkersStorage.getReader(100); + const LatencyMarkers* markers; + uint32_t count = 0; + uint64_t totalLatency = 0; + while (reader.getNext(markers)) { + totalLatency += markers->presentFinished; + ++count; + } + + if (!count) + return; + + uint64_t latency = totalLatency / count; + m_latency = str::format(latency / 1000, ".", (latency/100) % 10, " ms"); + } + } + + + HudPos HudRenderLatencyItem::render( + const DxvkContextObjects& ctx, + const HudPipelineKey& key, + const HudOptions& options, + HudRenderer& renderer, + HudPos position) { + + position.y += 12; + renderer.drawText(16, position, 0xff4040ffu, "Render latency:"); + renderer.drawText(16, {
position.x + 195, position.y }, + 0xffffffffu, m_latency); + + position.y += 8; + return position; + } + + HudFrameTimeItem::HudFrameTimeItem(const Rc& device, HudRenderer* renderer) : m_device (device), m_gfxSetLayout (createDescriptorSetLayout()), diff --git a/src/dxvk/hud/dxvk_hud_item.h b/src/dxvk/hud/dxvk_hud_item.h index 9e6274d17..b0eb44d16 100644 --- a/src/dxvk/hud/dxvk_hud_item.h +++ b/src/dxvk/hud/dxvk_hud_item.h @@ -131,6 +131,15 @@ namespace dxvk::hud { return value; } + template<typename T> + int32_t getItemPos() { // index of the first item whose dynamic type is T, or -1 if there is none + for (int i=0; i<(int)m_items.size(); ++i) { + if (dynamic_cast<T*>(m_items[i].ptr())) + return i; + } + return -1; + } + private: bool m_enableFull = false; @@ -244,6 +253,42 @@ namespace dxvk::hud { }; + /** + * \brief HUD item to display render latency + */ + class HudRenderLatencyItem : public HudItem { + constexpr static int64_t UpdateInterval = 500'000; + public: + + HudRenderLatencyItem(); + + ~HudRenderLatencyItem(); + + void updateLatencyTracker( const Rc<DxvkLatencyTracker>& tracker ) { + m_tracker = tracker; + } + + void update(dxvk::high_resolution_clock::time_point time); + + HudPos render( + const DxvkContextObjects& ctx, + const HudPipelineKey& key, + const HudOptions& options, + HudRenderer& renderer, + HudPos position); + + private: + + Rc<DxvkLatencyTracker> m_tracker; + + dxvk::high_resolution_clock::time_point m_lastUpdate + = dxvk::high_resolution_clock::now(); + + std::string m_latency; + + }; + + /** * \brief HUD item to display the frame rate */ diff --git a/src/dxvk/meson.build b/src/dxvk/meson.build index 9b2b07356..e5d990543 100644 --- a/src/dxvk/meson.build +++ b/src/dxvk/meson.build @@ -120,6 +120,9 @@ dxvk_src = [ 'hud/dxvk_hud_font.cpp', 'hud/dxvk_hud_item.cpp', 'hud/dxvk_hud_renderer.cpp', + + 'framepacer/dxvk_framepacer.cpp', + 'framepacer/dxvk_framepacer_mode_low_latency.cpp', ] if platform == 'windows' diff --git a/src/util/util_fps_limiter.cpp b/src/util/util_fps_limiter.cpp index 621e9a453..95fb79e7e 100644 --- a/src/util/util_fps_limiter.cpp +++
b/src/util/util_fps_limiter.cpp @@ -5,12 +5,15 @@ #include "util_fps_limiter.h" #include "util_sleep.h" #include "util_string.h" +#include "../dxvk/framepacer/dxvk_framepacer.h" #include "./log/log.h" using namespace std::chrono_literals; namespace dxvk { + + std::atomic<bool> FpsLimiter::m_isActive = { false }; FpsLimiter::FpsLimiter() { auto override = getEnvironmentOverride(); @@ -48,7 +51,12 @@ namespace dxvk { } - void FpsLimiter::delay() { + void FpsLimiter::delay(const Rc<DxvkLatencyTracker>& tracker) { + FramePacer* framePacer = dynamic_cast<FramePacer*>(tracker.ptr()); + if (framePacer && framePacer->getMode()) { + m_isActive.store(false); // this limiter does not sleep on this path, so do not leave a stale "active" flag behind for the HUD + return; + } std::unique_lock lock(m_mutex); auto interval = m_targetInterval; auto latency = m_maxLatency; @@ -71,8 +79,11 @@ namespace dxvk { // that can be written by setTargetFrameRate lock.unlock(); - if (t1 < m_nextFrame) + m_isActive.store(false); + if (t1 < m_nextFrame) { + m_isActive.store(true); Sleep::sleepUntil(t1, m_nextFrame); + } m_nextFrame = (t1 < m_nextFrame + interval) ? m_nextFrame + interval diff --git a/src/util/util_fps_limiter.h b/src/util/util_fps_limiter.h index 7c33a559f..d5610afd4 100644 --- a/src/util/util_fps_limiter.h +++ b/src/util/util_fps_limiter.h @@ -7,6 +7,8 @@ #include "util_time.h" namespace dxvk { + + class DxvkLatencyTracker; /** * \brief Frame rate limiter @@ -38,7 +40,7 @@ namespace dxvk { * and the time since the last call to \ref delay is * shorter than the target interval. */ - void delay(); + void delay(const Rc<DxvkLatencyTracker>& tracker); /** * \brief Queries environment override @@ -46,6 +48,8 @@ namespace dxvk { */ static std::optional getEnvironmentOverride(); + static std::atomic<bool> m_isActive; + private: using TimePoint = dxvk::high_resolution_clock::time_point;