diff --git a/src/dxvk/dxvk_latency.h b/src/dxvk/dxvk_latency.h index c9ac93c5d..f4e74a7ce 100644 --- a/src/dxvk/dxvk_latency.h +++ b/src/dxvk/dxvk_latency.h @@ -128,6 +128,10 @@ namespace dxvk { virtual void notifyCpuPresentEnd( uint64_t frameId) = 0; + virtual void notifySubmit() { } + virtual void notifyPresent( + uint64_t frameId) { } + /** * \brief Called when a command list is submitted to the GPU * @@ -174,6 +178,9 @@ namespace dxvk { virtual void notifyGpuExecutionEnd( uint64_t frameId) = 0; + virtual void notifyGpuPresentBegin( + uint64_t frameId) { } + /** * \brief Called when presentation of a given frame finishes on the GPU * diff --git a/src/dxvk/framepacer/dxvk_framepacer.cpp b/src/dxvk/framepacer/dxvk_framepacer.cpp new file mode 100644 index 000000000..63803f1ba --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer.cpp @@ -0,0 +1,64 @@ +#include "dxvk_framepacer.h" +#include "dxvk_framepacer_mode_low_latency.h" +#include "dxvk_framepacer_mode_min_latency.h" +#include "dxvk_options.h" +#include "../../util/util_env.h" +#include "../../util/log/log.h" + +namespace dxvk { + + + FramePacer::FramePacer( const DxvkOptions& options ) { + // we'll default to LOW_LATENCY in the draft-PR for now, for demonstration purposes, + // highlighting the generally much better input lag and medium-term time consistency. + // although MAX_FRAME_LATENCY has advantages in many games and is likely the better default, + // for its higher fps throughput and less susceptibility to short-term time inconsistencies. + // which mode being smoother depends on the game. 
+ FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY; + + std::string configStr = env::getEnvVar("DXVK_FRAME_PACE"); // the environment variable is matched before options.framePace + + if (configStr.find("max-frame-latency") != std::string::npos) { + mode = FramePacerMode::MAX_FRAME_LATENCY; + } else if (configStr.find("low-latency") != std::string::npos) { + mode = FramePacerMode::LOW_LATENCY; + } else if (configStr.find("min-latency") != std::string::npos) { + mode = FramePacerMode::MIN_LATENCY; + } else if (options.framePace.find("max-frame-latency") != std::string::npos) { + mode = FramePacerMode::MAX_FRAME_LATENCY; + } else if (options.framePace.find("low-latency") != std::string::npos) { + mode = FramePacerMode::LOW_LATENCY; + } else if (options.framePace.find("min-latency") != std::string::npos) { + mode = FramePacerMode::MIN_LATENCY; + } + + switch (mode) { + case FramePacerMode::MAX_FRAME_LATENCY: + Logger::info( "Frame pace: max-frame-latency" ); + m_mode = std::make_unique<FramePacerMode>(FramePacerMode::MAX_FRAME_LATENCY, &m_latencyMarkersStorage); // base class: its startFrame/endFrame/finishRender hooks are empty + break; + + case FramePacerMode::LOW_LATENCY: + Logger::info( "Frame pace: low-latency" ); + m_mode = std::make_unique<LowLatencyMode>(mode, &m_latencyMarkersStorage, options); + break; + + case FramePacerMode::MIN_LATENCY: + Logger::info( "Frame pace: min-latency" ); + m_mode = std::make_unique<MinLatencyMode>(mode, &m_latencyMarkersStorage); + break; + } + + for (auto& gpuStart: m_gpuStarts) { + gpuStart.store(0); + } + + // be consistent that every frame has a gpuReady event from the previous frame + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1); + m->gpuReady.push_back(high_resolution_clock::now()); + } + + + FramePacer::~FramePacer() {} + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer.h b/src/dxvk/framepacer/dxvk_framepacer.h new file mode 100644 index 000000000..264dcff57 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer.h @@ -0,0 +1,191 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" +#include "dxvk_latency_markers.h" +#include "../dxvk_latency.h" 
+#include "../../util/util_time.h" +#include + + +namespace dxvk { + + struct DxvkOptions; + + /* \brief Frame pacer interface managing the CPU - GPU synchronization. + * + * GPUs render frames asynchronously to the game's and dxvk's CPU-side work + * in order to improve fps-throughput. Aligning the cpu work to chosen time- + * points allows tuning certain characteristics of the video presentation, + * like smoothness and latency. + */ + + class FramePacer : public DxvkLatencyTracker { + using microseconds = std::chrono::microseconds; + public: + + FramePacer( const DxvkOptions& options ); + ~FramePacer(); + + void sleepAndBeginFrame( + uint64_t frameId, + double maxFrameRate) override { + // wait for finished rendering of a previous frame, typically the one before last + m_mode->waitRenderFinished(frameId); + // potentially wait some more if the cpu gets too much ahead + m_mode->startFrame(frameId); + m_latencyMarkersStorage.registerFrameStart(frameId); + m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0); + } + + void notifyGpuPresentEnd( uint64_t frameId ) override { + // the frame has been displayed to the screen + m_latencyMarkersStorage.registerFrameEnd(frameId); + m_mode->endFrame(frameId); + } + + void notifyCsRenderBegin( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->csStart = std::chrono::duration_cast<microseconds>(now - m->start).count(); // microseconds since the frame's start marker + } + + void notifyCsRenderEnd( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->csFinished = std::chrono::duration_cast<microseconds>(now - m->start).count(); + m_mode->signalCsFinished( frameId ); + } + + void notifySubmit() override { + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1); + m->gpuSubmit.push_back(high_resolution_clock::now()); + } + + void notifyPresent( uint64_t frameId ) override { + // dx to vk 
translation is finished + if (frameId != 0) { + auto now = high_resolution_clock::now(); + m_lastSubmitFrameId = frameId; + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuSubmit.push_back(now); + m->cpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count(); + next->gpuSubmit.clear(); + + m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId); + } + } + + void notifyQueueSubmit( uint64_t frameId ) override { + assert( frameId == m_lastQueueSubmitFrameId + 1 ); + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + m->gpuQueueSubmit.push_back(now); + queueSubmitCheckGpuStart(frameId, m, now); + } + + void notifyQueuePresentBegin( uint64_t frameId ) override { + if (frameId != 0) { + auto now = high_resolution_clock::now(); + m_lastQueueSubmitFrameId = frameId; + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuQueueSubmit.push_back(now); + next->gpuQueueSubmit.clear(); + queueSubmitCheckGpuStart(frameId, m, now); + } + } + + void notifyGpuExecutionBegin( uint64_t frameId ) override { + assert( frameId == m_lastFinishedFrameId+1 ); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); + gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now()); + } + + void notifyGpuExecutionEnd( uint64_t frameId ) override { + auto now = high_resolution_clock::now(); + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1); // the frameId argument is not used here + m->gpuReady.push_back(now); + } + + void notifyGpuPresentBegin( uint64_t frameId ) override { + // we get frameId == 0 for repeated presents (SyncInterval) + if (frameId != 0) { + m_lastFinishedFrameId = frameId; + auto now = high_resolution_clock::now(); + + LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId); + 
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1); + m->gpuReady.push_back(now); + m->gpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count(); + next->gpuReady.clear(); + next->gpuReady.push_back(now); + + gpuExecutionCheckGpuStart(frameId, m, now); + + m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId); + m_mode->finishRender(frameId); + m_mode->signalRenderFinished(frameId); + } + } + + FramePacerMode::Mode getMode() const { + return m_mode->m_mode; + } + + void setTargetFrameRate( double frameRate ) { + m_mode->setTargetFrameRate(frameRate); + } + + bool needsAutoMarkers() override { + return true; + } + + LatencyMarkersStorage m_latencyMarkersStorage; + + + // not implemented methods + + + void notifyCpuPresentBegin( uint64_t frameId) override { } + void notifyCpuPresentEnd( uint64_t frameId ) override { } + void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { } + void discardTimings() override { } + DxvkLatencyStats getStatistics( uint64_t frameId ) override + { return DxvkLatencyStats(); } + + private: + + void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + m->gpuStart = std::chrono::duration_cast<microseconds>(t - m->start).count(); + m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId); + m_mode->signalGpuStart(frameId); + } + + void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ]; + uint16_t val = gpuStart.fetch_or(queueSubmitBit); // fetch_or returns the previous bits + if (val == gpuReadyBit) // the other event was already recorded, so this call completes the pair + signalGpuStart( frameId, m, t ); + } + + void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) { + auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ]; + uint16_t val = gpuStart.fetch_or(gpuReadyBit); + if (val == queueSubmitBit) // mirror of queueSubmitCheckGpuStart + signalGpuStart( frameId, m, t ); + } + + std::unique_ptr<FramePacerMode> m_mode; + + 
uint64_t m_lastSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + + std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { }; + static constexpr uint16_t queueSubmitBit = 1; + static constexpr uint16_t gpuReadyBit = 2; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode.h b/src/dxvk/framepacer/dxvk_framepacer_mode.h new file mode 100644 index 000000000..109a240a2 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode.h @@ -0,0 +1,117 @@ +#pragma once + +#include "dxvk_latency_markers.h" +#include "../../util/sync/sync_signal.h" +#include "../../util/util_env.h" +#include + +namespace dxvk { + + /* + * \brief Abstract frame pacer mode in order to support different strategies of synchronization. + */ + + class FramePacerMode { + + public: + + enum Mode { + MAX_FRAME_LATENCY = 0, + LOW_LATENCY, + MIN_LATENCY + }; + + FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 ) + : m_mode( mode ), + m_waitLatency( maxFrameLatency+1 ), + m_latencyMarkersStorage( markerStorage ) { + setFpsLimitFrametimeFromEnv(); + } + + virtual ~FramePacerMode() { } + + virtual void startFrame( uint64_t frameId ) { } + virtual void endFrame( uint64_t frameId ) { } + + virtual void finishRender( uint64_t frameId ) { } + + void waitRenderFinished( uint64_t frameId ) { + if (m_mode) m_fenceGpuFinished.wait(frameId-m_waitLatency); } // MAX_FRAME_LATENCY is 0, so that mode never touches the fences in these four helpers + + void signalRenderFinished( uint64_t frameId ) { + if (m_mode) m_fenceGpuFinished.signal(frameId); } + + void signalGpuStart( uint64_t frameId ) { + if (m_mode) m_fenceGpuStart.signal(frameId); } + + void signalCsFinished( uint64_t frameId ) { + if (m_mode) m_fenceCsFinished.signal(frameId); } + + void setTargetFrameRate( double frameRate ) { + if (!m_fpsLimitEnvOverride && frameRate > 1.0) // a DXVK_FRAME_RATE override wins; rates of 1.0 or below are ignored + m_fpsLimitFrametime.store( 1'000'000/frameRate ); + } + + const Mode m_mode; 
+ + static bool getDoubleFromEnv( const char* name, double* result ); + static bool getIntFromEnv( const char* name, int* result ); + + protected: + + void setFpsLimitFrametimeFromEnv(); + + const uint32_t m_waitLatency; + LatencyMarkersStorage* m_latencyMarkersStorage; + std::atomic<int32_t> m_fpsLimitFrametime = { 0 }; + bool m_fpsLimitEnvOverride = { false }; + + sync::Fence m_fenceGpuStart = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; + sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) }; + sync::Fence m_fenceCsFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) }; + + }; + + + + inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) { + std::string env = env::getEnvVar(name); + if (env.empty()) + return false; + + try { + *result = std::stod(env); + return true; + } catch (const std::logic_error&) { // std::stod throws invalid_argument or out_of_range, both derive from logic_error + return false; + } + } + + + inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) { + std::string env = env::getEnvVar(name); + if (env.empty()) + return false; + + try { + *result = std::stoi(env); + return true; + } catch (const std::logic_error&) { // also covers out_of_range for values that do not fit in int + return false; + } + } + + + inline void FramePacerMode::setFpsLimitFrametimeFromEnv() { + double fpsLimit; + if (!getDoubleFromEnv("DXVK_FRAME_RATE", &fpsLimit)) + return; + + m_fpsLimitEnvOverride = true; // set even when the value is below 1.0, which leaves the limiter at 0 + if (fpsLimit < 1.0) + return; + + m_fpsLimitFrametime = 1'000'000/fpsLimit; + } + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp new file mode 100644 index 000000000..4e39145b4 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp @@ -0,0 +1,43 @@ +#include "dxvk_framepacer_mode_low_latency.h" + +namespace dxvk { + + + bool getLowLatencyOffsetFromEnv( int32_t& offset ) { + if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset)) + return false; + return true; + } + + + bool 
getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) { + int32_t o = 0; + if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &o)) + return false; + allowOverlap = (o != 0); + return true; + } + + + int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) { + int32_t offset = options.lowLatencyOffset; + int32_t o = 0; + if (getLowLatencyOffsetFromEnv(o)) // the environment variable overrides the option + offset = o; + + offset = std::max( -10000, offset ); // clamp to [-10000, 10000] + offset = std::min( 10000, offset ); + return offset; + } + + + bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) { + bool allowOverlap = options.lowLatencyAllowCpuFramesOverlap; + bool o = false; + if (getLowLatencyAllowCpuFramesOverlapFromEnv(o)) + allowOverlap = o; + return allowOverlap; + } + + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h new file mode 100644 index 000000000..06fdaf0dd --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h @@ -0,0 +1,255 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" +#include "../dxvk_options.h" +#include "../../util/log/log.h" +#include "../../util/util_string.h" +#include + +namespace dxvk { + + /* + * This low-latency mode aims to reduce latency with minimal impact in fps. + * Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well. + * + * Greatly reduces input lag variations when switching between CPU- and GPU-limit, and + * compared to the max-frame-latency approach, it has a much more stable input lag when + * GPU running times change dramatically, which can happen for example when rotating within a scene. + * + * The current implementation rather generates fluctuations alternating frame-by-frame + * depending on the game's and dxvk's CPU-time variations. This might be visible as a loss + * in smoothness, which is an area this implementation can be further improved. 
Unsuitable + * smoothing, however, might degrade input-lag feel, so it's not implemented for now, but + * more advanced smoothing techniques will be investigated in the future. + * In some situations, however, this low-latency pacing actually improves smoothness; + * it will depend on the game. + * + * An interesting observation while playtesting was that not only the input lag was affected, + * but the video generated did progress more cleanly in time as well with regard to + * medium-term time consistency; in other words, the video playback speed remained more steady. + * + * Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter + * which is typically used to prevent the game's fps exceeding the monitor's refresh rate, + * and which is tightly integrated into the pacing logic. + * + * Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap + * variables (or their respective environment variables). + * Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved. 
+ */ + + class LowLatencyMode : public FramePacerMode { + using microseconds = std::chrono::microseconds; + using time_point = high_resolution_clock::time_point; + public: + + LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options) + : FramePacerMode(mode, storage), + m_lowLatencyOffset(getLowLatencyOffset(options)), + m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) { + Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) ); + Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) ); + } + + ~LowLatencyMode() {} + + + void startFrame( uint64_t frameId ) override { + using std::chrono::duration_cast; + + if (!m_allowCpuFramesOverlap) + m_fenceCsFinished.wait( frameId-1 ); + + m_fenceGpuStart.wait( frameId-1 ); + + time_point now = high_resolution_clock::now(); + uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load(); + if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull) + return; + + if (finishedId == frameId-1) { + // we are the only in-flight frame, nothing to do other than to apply the fps-limiter if needed + m_lastStart = sleepFor( now, 0 ); + return; + } + + if (finishedId != frameId-2) { + Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=", + frameId-2, ", got: ", finishedId) ); + } + + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1); + + // estimate the target gpu sync point for this frame + // and calculate backwards when we want to start this frame + + const SyncProps props = getSyncPrediction(); + int32_t gpuReadyPrediction = duration_cast<microseconds>( + m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count(); + + int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync; + int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset; // all terms are microsecond counts + + m_lastStart = sleepFor( now, delay ); + + } + + + void finishRender( uint64_t 
frameId ) override { + + using std::chrono::duration_cast; + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId); + + int32_t numLoop = (int32_t)(m->gpuReady.size())-1; + if (numLoop <= 1) { + m_props[frameId % m_props.size()] = SyncProps(); + m_props[frameId % m_props.size()].isOutlier = true; + m_propsFinished.store( frameId ); + return; + } + + // estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first + // such that the gpu doesn't go into idle for this frame, and then aligning cpu submits + // where gpuSubmit[i] <= gpuRun[i] for all i + + std::vector<int32_t>& gpuRun = m_tempGpuRun; + std::vector<int32_t>& gpuRunDurations = m_tempGpuRunDurations; + gpuRun.clear(); + gpuRunDurations.clear(); + int32_t optimizedGpuTime = 0; + gpuRun.push_back(optimizedGpuTime); + + for (int i=0; i<numLoop; ++i) { // NOTE(review): this loop header and the declaration below were reconstructed from a garbled copy, confirm against upstream + time_point _gpuRun = std::max( m->gpuReady[i], m->gpuQueueSubmit[i] ); + int32_t duration = duration_cast<microseconds>( m->gpuReady[i+1] - _gpuRun ).count(); + optimizedGpuTime += duration; + gpuRun.push_back(optimizedGpuTime); + gpuRunDurations.push_back(duration); + } + + int32_t alignment = duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count() + - gpuRun[numLoop-1]; + + int32_t offset = 0; + for (int i=numLoop-2; i>=0; --i) { + int32_t curSubmit = duration_cast<microseconds>( m->gpuSubmit[i] - m->gpuSubmit[0] ).count(); + int32_t diff = curSubmit - gpuRun[i] - alignment; + diff = std::max( 0, diff ); + offset += diff; + alignment += diff; + } + + + SyncProps& props = m_props[frameId % m_props.size()]; + props.gpuSync = gpuRun[numLoop-1]; + props.cpuUntilGpuSync = offset + duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->start ).count(); + props.optimizedGpuTime = optimizedGpuTime; + props.isOutlier = isOutlier(frameId); + + m_propsFinished.store( frameId ); + + } + + + Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) { + + // account for the fps limit and ensure we won't sleep too long, just in case + int32_t frametime = std::chrono::duration_cast<microseconds>( t - m_lastStart ).count(); + int32_t 
frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); + delay = std::max( delay, frametimeDiff ); + delay = std::max( 0, std::min( delay, 20000 ) ); // never sleep longer than 20000 us + + Sleep::TimePoint nextStart = t + microseconds(delay); + Sleep::sleepUntil( t, nextStart ); + return nextStart; + + } + + + private: + + struct SyncProps { + int32_t optimizedGpuTime; // gpu executing packed submits in one go + int32_t gpuSync; // us after gpuStart + int32_t cpuUntilGpuSync; + bool isOutlier; + }; + + + SyncProps getSyncPrediction() { + // in the future we might use more samples to get a prediction + // however, simple averaging gives a slightly artificial mouse input + // more advanced methods will be investigated + SyncProps res = {}; + uint64_t id = m_propsFinished; + if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) + return res; + + for (size_t i=0; i<7; ++i) { // pick the most recent of the last 7 entries that is not an outlier + const SyncProps& props = m_props[ (id-i) % m_props.size() ]; + if (!props.isOutlier) { + id = id-i; + break; + } + } + + return m_props[ id % m_props.size() ]; + }; + + + int32_t getGpuStartToFinishPrediction() { + uint64_t id = m_propsFinished; + if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7) + return 0; + + for (size_t i=0; i<7; ++i) { + const SyncProps& props = m_props[ (id-i) % m_props.size() ]; + if (!props.isOutlier) { + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i); + if (m->gpuReady.empty() || m->gpuSubmit.empty()) + return m->gpuFinished - m->gpuStart; + + time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] ); + return std::chrono::duration_cast<microseconds>( t - m->start ).count() + + props.optimizedGpuTime + - m->gpuStart; + } + } + + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id); + return m->gpuFinished - m->gpuStart; + }; + + + bool isOutlier( uint64_t frameId ) { + constexpr size_t numLoop = 7; + int32_t totalCpuTime = 0; + for (size_t i=0; i<numLoop; ++i) { // NOTE(review): loop header reconstructed from a garbled copy, confirm against upstream + const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i); + totalCpuTime += m->cpuFinished; + } + + int32_t avgCpuTime = totalCpuTime / static_cast<int32_t>(numLoop); // keep the division signed + const LatencyMarkers* 
m = m_latencyMarkersStorage->getConstMarkers(frameId); + if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) ) + return true; + + return false; + } + + + int32_t getLowLatencyOffset( const DxvkOptions& options ); + bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ); + + const int32_t m_lowLatencyOffset; + const bool m_allowCpuFramesOverlap; + + Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; + std::array<SyncProps, 16> m_props; // NOTE(review): template arguments were missing in this copy; 16 slots is an assumption, confirm against upstream + std::atomic<uint64_t> m_propsFinished = { 0 }; + + std::vector<int32_t> m_tempGpuRun; + std::vector<int32_t> m_tempGpuRunDurations; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h new file mode 100644 index 000000000..763a5368c --- /dev/null +++ b/src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h @@ -0,0 +1,45 @@ +#pragma once + +#include "dxvk_framepacer_mode.h" + +namespace dxvk { + + /* + * Minimal latency is achieved here by waiting for the previous + * frame to complete, which results in very much reduced fps. + * Generally not recommended, but helpful to get insights to fine-tune + * the low-latency mode, and possibly is useful for running games + * in the cpu limit. 
+ */ + + class MinLatencyMode : public FramePacerMode { + + public: + + MinLatencyMode(Mode mode, LatencyMarkersStorage* storage) + : FramePacerMode(mode, storage, 0) {} + + ~MinLatencyMode() {} + + void startFrame( uint64_t frameId ) override { + + Sleep::TimePoint now = high_resolution_clock::now(); + int32_t frametime = std::chrono::duration_cast<std::chrono::microseconds>( + now - m_lastStart ).count(); + int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime ); + int32_t delay = std::max( 0, frametimeDiff ); + delay = std::min( delay, 20000 ); // never sleep longer than 20000 us + + Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay); + Sleep::sleepUntil( now, nextStart ); + m_lastStart = nextStart; + + } + + private: + + Sleep::TimePoint m_lastStart = { high_resolution_clock::now() }; + + }; + +} diff --git a/src/dxvk/framepacer/dxvk_latency_markers.h b/src/dxvk/framepacer/dxvk_latency_markers.h new file mode 100644 index 000000000..7658f0737 --- /dev/null +++ b/src/dxvk/framepacer/dxvk_latency_markers.h @@ -0,0 +1,148 @@ +#pragma once + +#include +#include +#include +#include +#include +#include "../../util/util_sleep.h" +#include "../../util/log/log.h" +#include "../../util/util_string.h" + + +namespace dxvk { + + class FramePacer; + class LatencyMarkersStorage; + + + struct LatencyMarkers { + + using time_point = high_resolution_clock::time_point; + + time_point start; + time_point end; + + int32_t csStart; + int32_t csFinished; + int32_t cpuFinished; + int32_t gpuStart; + int32_t gpuFinished; + int32_t presentFinished; + + std::vector<time_point> gpuReady; + std::vector<time_point> gpuSubmit; + std::vector<time_point> gpuQueueSubmit; + + }; + + + /* + * stores which information is accessible for which frame + */ + struct LatencyMarkersTimeline { + + std::atomic<uint64_t> cpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> gpuStart = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> gpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS }; + + }; + + + class 
LatencyMarkersReader { + + public: + + LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries ); + bool getNext( const LatencyMarkers*& result ); + + private: + + const LatencyMarkersStorage* m_storage; + uint64_t m_index; + + }; + + + class LatencyMarkersStorage { + friend class LatencyMarkersReader; + friend class FramePacer; + public: + + LatencyMarkersStorage() { } + ~LatencyMarkersStorage() { } + + LatencyMarkersReader getReader( uint32_t numEntries ) const { + return LatencyMarkersReader(this, numEntries); + } + + void registerFrameStart( uint64_t frameId ) { + if (frameId <= m_timeline.frameFinished.load()) { + Logger::warn( str::format("internal error during registerFrameStart: expected frameId=", + m_timeline.frameFinished.load()+1, ", got: ", frameId) ); + } + auto now = high_resolution_clock::now(); + + LatencyMarkers* markers = getMarkers(frameId); + markers->start = now; + } + + void registerFrameEnd( uint64_t frameId ) { + if (frameId <= m_timeline.frameFinished.load()) { + Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=", + m_timeline.frameFinished.load()+1, ", got: ", frameId) ); + } + auto now = high_resolution_clock::now(); + + LatencyMarkers* markers = getMarkers(frameId); + markers->presentFinished = std::chrono::duration_cast<std::chrono::microseconds>( + now - markers->start).count(); + markers->end = now; + + m_timeline.frameFinished.store(frameId); + } + + const LatencyMarkersTimeline* getTimeline() const { + return &m_timeline; + } + + const LatencyMarkers* getConstMarkers( uint64_t frameId ) const { + return &m_markers[frameId % m_numMarkers]; + } + + + private: + + LatencyMarkers* getMarkers( uint64_t frameId ) { + return &m_markers[frameId % m_numMarkers]; + } + + // simple modulo hash mapping is used for frameIds. They are expected to monotonically increase by one. 
+ // select the size large enough, so we never come into a situation where the reader cannot keep up with the producer + static constexpr uint16_t m_numMarkers = 128; + std::array<LatencyMarkers, m_numMarkers> m_markers = { }; + LatencyMarkersTimeline m_timeline; + + }; + + + + inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries ) + : m_storage(storage) { + m_index = 0; // 0 makes getNext() return false + if (m_storage->m_timeline.frameFinished.load() > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2) + m_index = m_storage->m_timeline.frameFinished.load() - numEntries; + } + + + inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) { + if (m_index == 0 || m_index > m_storage->m_timeline.frameFinished.load()) + return false; + + result = &m_storage->m_markers[m_index % m_storage->m_numMarkers]; + m_index++; + return true; + } + +} diff --git a/src/dxvk/meson.build b/src/dxvk/meson.build index 9b2b07356..e5d990543 100644 --- a/src/dxvk/meson.build +++ b/src/dxvk/meson.build @@ -120,6 +120,9 @@ dxvk_src = [ 'hud/dxvk_hud_font.cpp', 'hud/dxvk_hud_item.cpp', 'hud/dxvk_hud_renderer.cpp', + + 'framepacer/dxvk_framepacer.cpp', + 'framepacer/dxvk_framepacer_mode_low_latency.cpp', ] if platform == 'windows'