[dxvk] Add low-latency frame pacing

This commit is contained in:
netborg 2025-02-18 21:18:22 +01:00
parent 13ae0d218b
commit 01ccd1e776
9 changed files with 873 additions and 0 deletions

View file

@ -128,6 +128,10 @@ namespace dxvk {
virtual void notifyCpuPresentEnd( virtual void notifyCpuPresentEnd(
uint64_t frameId) = 0; uint64_t frameId) = 0;
/**
 * \brief Optional submission notification
 *
 * Takes no frame ID. The default body is empty, so trackers
 * that do not need this event can leave it unimplemented.
 * FramePacer overrides it to record a submit timestamp.
 * NOTE(review): presumably invoked once per command
 * submission - confirm against the caller.
 */
virtual void notifySubmit() { }
/**
 * \brief Optional present notification
 *
 * The default body is empty, so trackers that do not need
 * this event can leave it unimplemented. FramePacer overrides
 * it and ignores calls with a frame ID of zero.
 * \param [in] frameId Frame ID passed in by the caller
 */
virtual void notifyPresent(
uint64_t frameId) { }
/** /**
* \brief Called when a command list is submitted to the GPU * \brief Called when a command list is submitted to the GPU
* *
@ -174,6 +178,9 @@ namespace dxvk {
virtual void notifyGpuExecutionEnd( virtual void notifyGpuExecutionEnd(
uint64_t frameId) = 0; uint64_t frameId) = 0;
/**
 * \brief Optional notification for the start of a present on the GPU
 *
 * The default body is empty, so trackers that do not need
 * this event can leave it unimplemented. FramePacer overrides
 * it and ignores calls with a frame ID of zero.
 * \param [in] frameId Frame ID passed in by the caller
 */
virtual void notifyGpuPresentBegin(
uint64_t frameId) { }
/** /**
* \brief Called when presentation of a given frame finishes on the GPU * \brief Called when presentation of a given frame finishes on the GPU
* *

View file

@ -0,0 +1,64 @@
#include "dxvk_framepacer.h"
#include "dxvk_framepacer_mode_low_latency.h"
#include "dxvk_framepacer_mode_min_latency.h"
#include "dxvk_options.h"
#include "../../util/util_env.h"
#include "../../util/log/log.h"
namespace dxvk {
/**
 * Selects the pacing mode and creates the matching mode object.
 *
 * The DXVK_FRAME_PACE environment variable is consulted first; only
 * if it names none of the known modes is options.framePace used.
 * Mode names are matched as substrings, in the order
 * max-frame-latency, low-latency, min-latency.
 */
FramePacer::FramePacer( const DxvkOptions& options ) {
  // LOW_LATENCY is the default in this draft-PR, for demonstration purposes:
  // it generally gives much better input lag and medium-term time consistency.
  // MAX_FRAME_LATENCY has advantages in many games and is likely the better
  // default, thanks to its higher fps throughput and lower susceptibility to
  // short-term time inconsistencies. Which mode is smoother depends on the game.
  FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY;

  // Updates 'mode' and returns true if 'text' contains a known mode name.
  auto selectModeFrom = [&mode]( const auto& text ) {
    if (text.find("max-frame-latency") != std::string::npos) {
      mode = FramePacerMode::MAX_FRAME_LATENCY;
      return true;
    }
    if (text.find("low-latency") != std::string::npos) {
      mode = FramePacerMode::LOW_LATENCY;
      return true;
    }
    if (text.find("min-latency") != std::string::npos) {
      mode = FramePacerMode::MIN_LATENCY;
      return true;
    }
    return false;
  };

  // the environment variable wins over the config option
  const std::string envValue = env::getEnvVar("DXVK_FRAME_PACE");
  if (!selectModeFrom(envValue))
    selectModeFrom(options.framePace);

  switch (mode) {
    case FramePacerMode::MAX_FRAME_LATENCY: {
      Logger::info( "Frame pace: max-frame-latency" );
      m_mode = std::make_unique<FramePacerMode>(FramePacerMode::MAX_FRAME_LATENCY, &m_latencyMarkersStorage);
    } break;

    case FramePacerMode::LOW_LATENCY: {
      Logger::info( "Frame pace: low-latency" );
      m_mode = std::make_unique<LowLatencyMode>(mode, &m_latencyMarkersStorage, options);
    } break;

    case FramePacerMode::MIN_LATENCY: {
      Logger::info( "Frame pace: min-latency" );
      m_mode = std::make_unique<MinLatencyMode>(mode, &m_latencyMarkersStorage);
    } break;
  }

  for (auto& flags : m_gpuStarts)
    flags.store(0);

  // every frame expects a gpuReady entry inherited from the previous frame,
  // so seed one for the first frame id that will be used
  LatencyMarkers* firstFrame = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1);
  firstFrame->gpuReady.push_back(high_resolution_clock::now());
}
// Nothing to release by hand; members clean up after themselves.
FramePacer::~FramePacer() = default;
}

View file

@ -0,0 +1,191 @@
#pragma once
#include "dxvk_framepacer_mode.h"
#include "dxvk_latency_markers.h"
#include "../dxvk_latency.h"
#include "../../util/util_time.h"
#include <dxgi.h>
namespace dxvk {
struct DxvkOptions;
/* \brief Frame pacer interface managing the CPU - GPU synchronization.
*
* GPUs render frames asynchronously to the game's and dxvk's CPU-side work
* in order to improve fps-throughput. Aligning the cpu work to chosen time-
* points allows to tune certain characteristics of the video presentation,
* like smoothness and latency.
*/
class FramePacer : public DxvkLatencyTracker {
using microseconds = std::chrono::microseconds;
public:
FramePacer( const DxvkOptions& options );
~FramePacer();
void sleepAndBeginFrame(
uint64_t frameId,
double maxFrameRate) override {
// wait for finished rendering of a previous frame, typically the one before last
m_mode->waitRenderFinished(frameId);
// potentially wait some more if the cpu gets too much ahead
m_mode->startFrame(frameId);
m_latencyMarkersStorage.registerFrameStart(frameId);
m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0);
}
void notifyGpuPresentEnd( uint64_t frameId ) override {
// the frame has been displayed to the screen
m_latencyMarkersStorage.registerFrameEnd(frameId);
m_mode->endFrame(frameId);
}
void notifyCsRenderBegin( uint64_t frameId ) override {
auto now = high_resolution_clock::now();
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
m->csStart = std::chrono::duration_cast<microseconds>(now - m->start).count();
}
void notifyCsRenderEnd( uint64_t frameId ) override {
auto now = high_resolution_clock::now();
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
m->csFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
m_mode->signalCsFinished( frameId );
}
void notifySubmit() override {
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1);
m->gpuSubmit.push_back(high_resolution_clock::now());
}
void notifyPresent( uint64_t frameId ) override {
// dx to vk translation is finished
if (frameId != 0) {
auto now = high_resolution_clock::now();
m_lastSubmitFrameId = frameId;
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
m->gpuSubmit.push_back(now);
m->cpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
next->gpuSubmit.clear();
m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId);
}
}
void notifyQueueSubmit( uint64_t frameId ) override {
assert( frameId == m_lastQueueSubmitFrameId + 1 );
auto now = high_resolution_clock::now();
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
m->gpuQueueSubmit.push_back(now);
queueSubmitCheckGpuStart(frameId, m, now);
}
void notifyQueuePresentBegin( uint64_t frameId ) override {
if (frameId != 0) {
auto now = high_resolution_clock::now();
m_lastQueueSubmitFrameId = frameId;
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
m->gpuQueueSubmit.push_back(now);
next->gpuQueueSubmit.clear();
queueSubmitCheckGpuStart(frameId, m, now);
}
}
void notifyGpuExecutionBegin( uint64_t frameId ) override {
assert( frameId == m_lastFinishedFrameId+1 );
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now());
}
void notifyGpuExecutionEnd( uint64_t frameId ) override {
auto now = high_resolution_clock::now();
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
m->gpuReady.push_back(now);
}
virtual void notifyGpuPresentBegin( uint64_t frameId ) override {
// we get frameId == 0 for repeated presents (SyncInterval)
if (frameId != 0) {
m_lastFinishedFrameId = frameId;
auto now = high_resolution_clock::now();
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
m->gpuReady.push_back(now);
m->gpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
next->gpuReady.clear();
next->gpuReady.push_back(now);
gpuExecutionCheckGpuStart(frameId, m, now);
m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId);
m_mode->finishRender(frameId);
m_mode->signalRenderFinished(frameId);
}
}
FramePacerMode::Mode getMode() const {
return m_mode->m_mode;
}
void setTargetFrameRate( double frameRate ) {
m_mode->setTargetFrameRate(frameRate);
}
bool needsAutoMarkers() override {
return true;
}
LatencyMarkersStorage m_latencyMarkersStorage;
// not implemented methods
void notifyCpuPresentBegin( uint64_t frameId) override { }
void notifyCpuPresentEnd( uint64_t frameId ) override { }
void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { }
void discardTimings() override { }
DxvkLatencyStats getStatistics( uint64_t frameId ) override
{ return DxvkLatencyStats(); }
private:
void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
m->gpuStart = std::chrono::duration_cast<microseconds>(t - m->start).count();
m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId);
m_mode->signalGpuStart(frameId);
}
void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
uint16_t val = gpuStart.fetch_or(queueSubmitBit);
if (val == gpuReadyBit)
signalGpuStart( frameId, m, t );
}
void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
uint16_t val = gpuStart.fetch_or(gpuReadyBit);
if (val == queueSubmitBit)
signalGpuStart( frameId, m, t );
}
std::unique_ptr<FramePacerMode> m_mode;
uint64_t m_lastSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { };
static constexpr uint16_t queueSubmitBit = 1;
static constexpr uint16_t gpuReadyBit = 2;
};
}

View file

@ -0,0 +1,117 @@
#pragma once
#include "dxvk_latency_markers.h"
#include "../../util/sync/sync_signal.h"
#include "../../util/util_env.h"
#include <dxgi.h>
namespace dxvk {
/*
 * \brief Abstract frame pacer mode in order to support different strategies of synchronization.
 */
class FramePacerMode {

public:

  enum Mode {
    MAX_FRAME_LATENCY = 0,
    LOW_LATENCY,
    MIN_LATENCY
  };

  // Stores the mode and marker storage and derives the wait latency as
  // maxFrameLatency+1. The fps limit is then read from the environment.
  FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 )
  : m_mode( mode ),
    m_waitLatency( maxFrameLatency+1 ),
    m_latencyMarkersStorage( markerStorage ) {
    setFpsLimitFrametimeFromEnv();
  }

  virtual ~FramePacerMode() = default;

  // Hooks for derived modes; the base implementations do nothing.
  virtual void startFrame( uint64_t frameId ) { }
  virtual void endFrame( uint64_t frameId ) { }
  virtual void finishRender( uint64_t frameId ) { }

  // The fences below are left untouched in MAX_FRAME_LATENCY mode.

  // Waits until the frame m_waitLatency frames before frameId finished rendering.
  void waitRenderFinished( uint64_t frameId ) {
    if (m_mode != MAX_FRAME_LATENCY)
      m_fenceGpuFinished.wait(frameId-m_waitLatency);
  }

  void signalRenderFinished( uint64_t frameId ) {
    if (m_mode != MAX_FRAME_LATENCY)
      m_fenceGpuFinished.signal(frameId);
  }

  void signalGpuStart( uint64_t frameId ) {
    if (m_mode != MAX_FRAME_LATENCY)
      m_fenceGpuStart.signal(frameId);
  }

  void signalCsFinished( uint64_t frameId ) {
    if (m_mode != MAX_FRAME_LATENCY)
      m_fenceCsFinished.signal(frameId);
  }

  // Sets the fps limit as a frame time in microseconds. Ignored when the
  // limit was taken from the environment or frameRate is not above 1.0.
  void setTargetFrameRate( double frameRate ) {
    if (m_fpsLimitEnvOverride || !(frameRate > 1.0))
      return;

    m_fpsLimitFrametime.store( static_cast<int32_t>(1'000'000/frameRate) );
  }

  const Mode m_mode;

  static bool getDoubleFromEnv( const char* name, double* result );
  static bool getIntFromEnv( const char* name, int* result );

protected:

  void setFpsLimitFrametimeFromEnv();

  const uint32_t m_waitLatency;
  LatencyMarkersStorage* m_latencyMarkersStorage;
  // frame time of the fps limit in microseconds, 0 when no limit was set
  std::atomic<int32_t> m_fpsLimitFrametime = { 0 };
  bool m_fpsLimitEnvOverride = { false };

  sync::Fence m_fenceGpuStart    = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
  sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
  sync::Fence m_fenceCsFinished  = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) };

};
inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) {
std::string env = env::getEnvVar(name);
if (env.empty())
return false;
try {
*result = std::stod(env);
return true;
} catch (const std::invalid_argument&) {
return false;
}
}
inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) {
std::string env = env::getEnvVar(name);
if (env.empty())
return false;
try {
*result = std::stoi(env);
return true;
} catch (const std::invalid_argument&) {
return false;
}
}
/*
 * Takes the fps limit from the DXVK_FRAME_RATE environment variable.
 *
 * If the variable holds a number, later calls to setTargetFrameRate
 * are ignored (m_fpsLimitEnvOverride). A value of at least 1.0 is
 * stored as a frame time in microseconds; anything else leaves the
 * frame time at its current value.
 */
inline void FramePacerMode::setFpsLimitFrametimeFromEnv() {
  double fpsLimit = 0.0;
  if (!getDoubleFromEnv("DXVK_FRAME_RATE", &fpsLimit))
    return;

  m_fpsLimitEnvOverride = true;

  // written as a negated >= so that NaN ("nan" parses as a valid double)
  // is rejected as well; converting NaN to an integer is undefined
  if (!(fpsLimit >= 1.0))
    return;

  m_fpsLimitFrametime.store( static_cast<int32_t>(1'000'000/fpsLimit) );
}
}

View file

@ -0,0 +1,43 @@
#include "dxvk_framepacer_mode_low_latency.h"
namespace dxvk {
// Reads DXVK_LOW_LATENCY_OFFSET into 'offset'.
// Returns whether the variable held a usable integer.
bool getLowLatencyOffsetFromEnv( int32_t& offset ) {
  return FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset);
}
// Reads DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP as an integer;
// any non-zero value enables the overlap. 'allowOverlap' is only
// written when the variable held a usable integer, which is also
// what the return value reports.
bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) {
  int32_t flag = 0;
  const bool found = FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &flag);

  if (found)
    allowOverlap = (flag != 0);

  return found;
}
// Returns the low-latency offset, limited to [-10000, 10000].
// The environment variable takes precedence over the config option.
int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) {
  int32_t requested = options.lowLatencyOffset;

  int32_t fromEnv;
  if (getLowLatencyOffsetFromEnv(fromEnv))
    requested = fromEnv;

  return std::min( 10000, std::max( -10000, requested ) );
}
// Returns whether cpu frames may overlap.
// The environment variable takes precedence over the config option.
bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) {
  bool fromEnv;
  if (getLowLatencyAllowCpuFramesOverlapFromEnv(fromEnv))
    return fromEnv;

  return options.lowLatencyAllowCpuFramesOverlap;
}
}

View file

@ -0,0 +1,255 @@
#pragma once
#include "dxvk_framepacer_mode.h"
#include "../dxvk_options.h"
#include "../../util/log/log.h"
#include "../../util/util_string.h"
#include <assert.h>
namespace dxvk {
/*
* This low-latency mode aims to reduce latency with minimal impact in fps.
* Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well.
*
* Greatly reduces input lag variations when switching between CPU- and GPU-limit, and
* compared to the max-frame-latency approach, it has a much more stable input lag when
* GPU running times change dramatically, which can happen for example when rotating within a scene.
*
* The current implementation rather generates fluctuations alternating frame-by-frame
* depending on the game's and dxvk's CPU-time variations. This might be visible as a loss
* in smoothness, which is an area this implementation can be further improved. Unsuitable
* smoothing however might degrade input-lag feel, so it's not implemented for now, but
* more advanced smoothing techniques will be investigated in the future.
* In some situations however, this low-latency pacing actually improves smoothing though,
* it will depend on the game.
*
* An interesting observation while playtesting was that not only the input lag was affected,
* but the video generated did progress more cleanly in time as well with regards to
* medium-term time consistency, in other words, the video playback speed remained more steady.
*
* Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter
* which is typically used to prevent the game's fps exceeding the monitor's refresh rate,
* and which is tightly integrated into the pacing logic.
*
* Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap
* variables (or their respective environment variables)
* Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved.
*/
class LowLatencyMode : public FramePacerMode {
  using microseconds = std::chrono::microseconds;
  using time_point = high_resolution_clock::time_point;
public:

  // Reads the offset and overlap settings from the options (or their
  // environment overrides) and logs the values in use.
  LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options)
  : FramePacerMode(mode, storage),
    m_lowLatencyOffset(getLowLatencyOffset(options)),
    m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) {
    Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) );
    Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) );
  }

  ~LowLatencyMode() {}

  // Delays the start of the frame so that its gpu work is predicted
  // to line up with the end of the previous frame's gpu work.
  void startFrame( uint64_t frameId ) override {

    using std::chrono::duration_cast;

    if (!m_allowCpuFramesOverlap)
      m_fenceCsFinished.wait( frameId-1 );

    m_fenceGpuStart.wait( frameId-1 );

    time_point now = high_resolution_clock::now();
    uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load();
    if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull)
      return;

    if (finishedId == frameId-1) {
      // we are the only in-flight frame, nothing to do other than to apply fps-limiter if needed
      m_lastStart = sleepFor( now, 0 );
      return;
    }

    if (finishedId != frameId-2) {
      Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=",
        frameId-2, ", got: ", finishedId) );
    }

    const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1);

    // estimate the target gpu sync point for this frame
    // and calculate backwards when we want to start this frame

    const SyncProps props = getSyncPrediction();
    int32_t gpuReadyPrediction = duration_cast<microseconds>(
      m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count();

    int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync;
    int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset;

    m_lastStart = sleepFor( now, delay );

  }

  // Derives the sync properties of a finished frame from its recorded
  // timestamps and stores them for later predictions.
  void finishRender( uint64_t frameId ) override {

    using std::chrono::duration_cast;
    const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);

    int32_t numLoop = (int32_t)(m->gpuReady.size())-1;

    // The code below reads gpuReady[0..numLoop], gpuQueueSubmit[0..numLoop-1]
    // and gpuSubmit[0..numLoop-1]. A frame with too few gpu runs, or whose
    // submit timestamps do not cover every run, cannot be analysed and must
    // not be indexed; record it as an outlier with zeroed properties.
    if (numLoop <= 1
     || m->gpuSubmit.size()      < size_t(numLoop)
     || m->gpuQueueSubmit.size() < size_t(numLoop)) {
      m_props[frameId % m_props.size()] = SyncProps();
      m_props[frameId % m_props.size()].isOutlier = true;
      m_propsFinished.store( frameId );
      return;
    }

    // estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first
    // such that the gpu doesn't go into idle for this frame, and then aligning cpu submits
    // where gpuSubmit[i] <= gpuRun[i] for all i

    std::vector<int32_t>& gpuRun = m_tempGpuRun;
    std::vector<int32_t>& gpuRunDurations = m_tempGpuRunDurations;
    gpuRun.clear();
    gpuRunDurations.clear();
    int32_t optimizedGpuTime = 0;
    gpuRun.push_back(optimizedGpuTime);

    for (int i=0; i<numLoop; ++i) {
      time_point _gpuRun = std::max( m->gpuReady[i], m->gpuQueueSubmit[i] );
      int32_t duration = duration_cast<microseconds>( m->gpuReady[i+1] - _gpuRun ).count();
      optimizedGpuTime += duration;
      gpuRun.push_back(optimizedGpuTime);
      gpuRunDurations.push_back(duration);
    }

    int32_t alignment = duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count()
      - gpuRun[numLoop-1];

    int32_t offset = 0;
    for (int i=numLoop-2; i>=0; --i) {
      int32_t curSubmit = duration_cast<microseconds>( m->gpuSubmit[i] - m->gpuSubmit[0] ).count();
      int32_t diff = curSubmit - gpuRun[i] - alignment;
      diff = std::max( 0, diff );
      offset += diff;
      alignment += diff;
    }

    SyncProps& props = m_props[frameId % m_props.size()];
    props.gpuSync = gpuRun[numLoop-1];
    props.cpuUntilGpuSync = offset + duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->start ).count();
    props.optimizedGpuTime = optimizedGpuTime;
    props.isOutlier = isOutlier(frameId);

    m_propsFinished.store( frameId );

  }

  // Sleeps until t + delay microseconds and returns that time point.
  // The delay is raised to satisfy the fps limit and kept within [0, 20000].
  Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) {

    // account for the fps limit and ensure we won't sleep too long, just in case
    int32_t frametime = std::chrono::duration_cast<microseconds>( t - m_lastStart ).count();
    int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
    delay = std::max( delay, frametimeDiff );
    delay = std::max( 0, std::min( delay, 20000 ) );

    Sleep::TimePoint nextStart = t + microseconds(delay);
    Sleep::sleepUntil( t, nextStart );
    return nextStart;

  }

private:

  struct SyncProps {
    int32_t optimizedGpuTime;   // gpu executing packed submits in one go
    int32_t gpuSync;            // us after gpuStart
    int32_t cpuUntilGpuSync;
    bool    isOutlier;
  };

  // Returns the properties of the most recent of the last seven frames
  // that is not an outlier, or those of the latest frame if all are.
  // Returns zeroed properties while too few frames have been recorded.
  SyncProps getSyncPrediction() {
    // in the future we might use more samples to get a prediction
    // however, simple averaging gives a slightly artificial mouse input
    // more advanced methods will be investigated
    SyncProps res = {};
    uint64_t id = m_propsFinished;
    if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
      return res;

    for (size_t i=0; i<7; ++i) {
      const SyncProps& props = m_props[ (id-i) % m_props.size() ];
      if (!props.isOutlier) {
        id = id-i;
        break;
      }
    }

    return m_props[ id % m_props.size() ];
  };

  // Predicts the time in microseconds from gpu start to gpu finish,
  // based on the most recent of the last seven frames that is not
  // an outlier. Returns 0 while too few frames have been recorded.
  int32_t getGpuStartToFinishPrediction() {
    uint64_t id = m_propsFinished;
    if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
      return 0;

    for (size_t i=0; i<7; ++i) {
      const SyncProps& props = m_props[ (id-i) % m_props.size() ];
      if (!props.isOutlier) {
        const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i);
        if (m->gpuReady.empty() || m->gpuSubmit.empty())
          return m->gpuFinished - m->gpuStart;

        time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] );
        return std::chrono::duration_cast<microseconds>( t - m->start ).count()
          + props.optimizedGpuTime
          - m->gpuStart;
      }
    }

    const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id);
    return m->gpuFinished - m->gpuStart;
  };

  // A frame is an outlier if its cpu time exceeds 1.7 times the average
  // of the last seven frames, or if its submit and gpu-ready timestamp
  // counts do not match up.
  bool isOutlier( uint64_t frameId ) {
    constexpr size_t numLoop = 7;
    int32_t totalCpuTime = 0;
    for (size_t i=0; i<numLoop; ++i) {
      const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i);
      totalCpuTime += m->cpuFinished;
    }

    int32_t avgCpuTime = totalCpuTime / numLoop;
    const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
    if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) )
      return true;

    return false;
  }

  int32_t getLowLatencyOffset( const DxvkOptions& options );
  bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options );

  const int32_t m_lowLatencyOffset;
  const bool    m_allowCpuFramesOverlap;

  Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
  // value-initialised so that no slot ever holds indeterminate data
  std::array<SyncProps, 16> m_props = { };
  std::atomic<uint64_t> m_propsFinished = { 0 };

  // scratch buffers reused across finishRender calls
  std::vector<int32_t>  m_tempGpuRun;
  std::vector<int32_t>  m_tempGpuRunDurations;

};
}

View file

@ -0,0 +1,45 @@
#pragma once
#include "dxvk_framepacer_mode.h"
namespace dxvk {
/*
* Minimal latency is achieved here by waiting for the previous
* frame to complete, which results in very much reduced fps.
* Generally not recommended, but helpful to get insights to fine-tune
* the low-latency mode, and possibly is useful for running games
* in the cpu limit.
*/
class MinLatencyMode : public FramePacerMode {
public:
MinLatencyMode(Mode mode, LatencyMarkersStorage* storage)
: FramePacerMode(mode, storage, 0) {}
~MinLatencyMode() {}
void startFrame( uint64_t frameId ) override {
Sleep::TimePoint now = high_resolution_clock::now();
int32_t frametime = std::chrono::duration_cast<std::chrono::microseconds>(
now - m_lastStart ).count();
int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
int32_t delay = std::max( 0, frametimeDiff );
delay = std::min( delay, 20000 );
Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay);
Sleep::sleepUntil( now, nextStart );
m_lastStart = nextStart;
}
private:
Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
};
}

View file

@ -0,0 +1,148 @@
#pragma once
#include <atomic>
#include <dxgi.h>
#include <vector>
#include <array>
#include <assert.h>
#include "../../util/util_sleep.h"
#include "../../util/log/log.h"
#include "../../util/util_string.h"
namespace dxvk {
class FramePacer;
class LatencyMarkersStorage;
// Per-frame timing record. The int32_t fields are offsets in
// microseconds relative to 'start'; the vectors hold absolute time points.
struct LatencyMarkers {
using time_point = high_resolution_clock::time_point;
// set by LatencyMarkersStorage::registerFrameStart
time_point start;
// set by LatencyMarkersStorage::registerFrameEnd
time_point end;
// set by FramePacer::notifyCsRenderBegin
int32_t csStart;
// set by FramePacer::notifyCsRenderEnd
int32_t csFinished;
// set by FramePacer::notifyPresent
int32_t cpuFinished;
// set by FramePacer::signalGpuStart
int32_t gpuStart;
// set by FramePacer::notifyGpuPresentBegin
int32_t gpuFinished;
// set by LatencyMarkersStorage::registerFrameEnd
int32_t presentFinished;
// appended to by FramePacer::notifyGpuExecutionEnd and notifyGpuPresentBegin;
// the first entry is the completion time of the previous frame
std::vector<time_point> gpuReady;
// appended to by FramePacer::notifySubmit and notifyPresent
std::vector<time_point> gpuSubmit;
// appended to by FramePacer::notifyQueueSubmit and notifyQueuePresentBegin
std::vector<time_point> gpuQueueSubmit;
};
/*
* stores which information is accessible for which frame
*/
// Each member holds the id of the latest frame that reached the given
// stage. All members start out at DXGI_MAX_SWAP_CHAIN_BUFFERS.
struct LatencyMarkersTimeline {
// stored by FramePacer::notifyPresent
std::atomic<uint64_t> cpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
// stored by FramePacer::signalGpuStart
std::atomic<uint64_t> gpuStart = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
// stored by FramePacer::notifyGpuPresentBegin
std::atomic<uint64_t> gpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
// stored by LatencyMarkersStorage::registerFrameEnd
std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
};
// Forward iterator over the markers of recently finished frames.
class LatencyMarkersReader {
public:
// Positions the reader numEntries frames before the latest finished frame.
// If not enough frames have finished yet, the reader yields nothing.
LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries );
// Writes the next frame's markers to result and returns true, or returns
// false without touching result once all finished frames were visited.
bool getNext( const LatencyMarkers*& result );
private:
const LatencyMarkersStorage* m_storage;
// id of the frame returned by the next getNext call, 0 if there is none
uint64_t m_index;
};
class LatencyMarkersStorage {
friend class LatencyMarkersReader;
friend class FramePacer;
public:
LatencyMarkersStorage() { }
~LatencyMarkersStorage() { }
LatencyMarkersReader getReader( uint32_t numEntries ) const {
return LatencyMarkersReader(this, numEntries);
}
void registerFrameStart( uint64_t frameId ) {
if (frameId <= m_timeline.frameFinished.load()) {
Logger::warn( str::format("internal error during registerFrameStart: expected frameId=",
m_timeline.frameFinished.load()+1, ", got: ", frameId) );
}
auto now = high_resolution_clock::now();
LatencyMarkers* markers = getMarkers(frameId);
markers->start = now;
}
void registerFrameEnd( uint64_t frameId ) {
if (frameId <= m_timeline.frameFinished.load()) {
Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=",
m_timeline.frameFinished.load()+1, ", got: ", frameId) );
}
auto now = high_resolution_clock::now();
LatencyMarkers* markers = getMarkers(frameId);
markers->presentFinished = std::chrono::duration_cast<std::chrono::microseconds>(
now - markers->start).count();
markers->end = now;
m_timeline.frameFinished.store(frameId);
}
const LatencyMarkersTimeline* getTimeline() const {
return &m_timeline;
}
const LatencyMarkers* getConstMarkers( uint64_t frameId ) const {
return &m_markers[frameId % m_numMarkers];
}
private:
LatencyMarkers* getMarkers( uint64_t frameId ) {
return &m_markers[frameId % m_numMarkers];
}
// simple modulo hash mapping is used for frameIds. They are expected to monotonically increase by one.
// select the size large enough, so we never come into a situation where the reader cannot keep up with the producer
static constexpr uint16_t m_numMarkers = 128;
std::array<LatencyMarkers, m_numMarkers> m_markers = { };
LatencyMarkersTimeline m_timeline;
};
// Starts numEntries frames before the latest finished frame. The index
// stays 0 (nothing to read) until more than
// numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2 frames have finished.
inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries )
: m_storage(storage),
  m_index(0) {
  const auto& finished = m_storage->m_timeline.frameFinished;

  if (finished.load() > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2)
    m_index = finished.load() - numEntries;
}
// Hands out the markers at the current index and advances by one frame.
// Returns false, leaving result untouched, if the reader is empty or
// has passed the latest finished frame.
inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) {
  const bool exhausted = (m_index == 0)
    || (m_index > m_storage->m_timeline.frameFinished.load());

  if (exhausted)
    return false;

  const uint64_t slot = m_index % m_storage->m_numMarkers;
  result = &m_storage->m_markers[slot];
  ++m_index;
  return true;
}
}

View file

@ -120,6 +120,9 @@ dxvk_src = [
'hud/dxvk_hud_font.cpp', 'hud/dxvk_hud_font.cpp',
'hud/dxvk_hud_item.cpp', 'hud/dxvk_hud_item.cpp',
'hud/dxvk_hud_renderer.cpp', 'hud/dxvk_hud_renderer.cpp',
'framepacer/dxvk_framepacer.cpp',
'framepacer/dxvk_framepacer_mode_low_latency.cpp',
] ]
if platform == 'windows' if platform == 'windows'