mirror of
https://github.com/doitsujin/dxvk.git
synced 2025-03-06 20:58:37 +01:00
[dxvk] Add low-latency frame pacing
This commit is contained in:
parent
13ae0d218b
commit
01ccd1e776
9 changed files with 873 additions and 0 deletions
|
@ -128,6 +128,10 @@ namespace dxvk {
|
|||
virtual void notifyCpuPresentEnd(
|
||||
uint64_t frameId) = 0;
|
||||
|
||||
virtual void notifySubmit() { }
|
||||
virtual void notifyPresent(
|
||||
uint64_t frameId) { }
|
||||
|
||||
/**
|
||||
* \brief Called when a command list is submitted to the GPU
|
||||
*
|
||||
|
@ -174,6 +178,9 @@ namespace dxvk {
|
|||
virtual void notifyGpuExecutionEnd(
|
||||
uint64_t frameId) = 0;
|
||||
|
||||
virtual void notifyGpuPresentBegin(
|
||||
uint64_t frameId) { }
|
||||
|
||||
/**
|
||||
* \brief Called when presentation of a given frame finishes on the GPU
|
||||
*
|
||||
|
|
64
src/dxvk/framepacer/dxvk_framepacer.cpp
Normal file
64
src/dxvk/framepacer/dxvk_framepacer.cpp
Normal file
|
@ -0,0 +1,64 @@
|
|||
#include "dxvk_framepacer.h"
|
||||
#include "dxvk_framepacer_mode_low_latency.h"
|
||||
#include "dxvk_framepacer_mode_min_latency.h"
|
||||
#include "dxvk_options.h"
|
||||
#include "../../util/util_env.h"
|
||||
#include "../../util/log/log.h"
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
|
||||
FramePacer::FramePacer( const DxvkOptions& options ) {
  // LOW_LATENCY is the default in the draft-PR for now, for demonstration
  // purposes, highlighting the generally much better input lag and
  // medium-term time consistency. MAX_FRAME_LATENCY has advantages in many
  // games and is likely the better default, for its higher fps throughput
  // and lower susceptibility to short-term time inconsistencies.
  // Which mode is smoother depends on the game.
  FramePacerMode::Mode mode = FramePacerMode::LOW_LATENCY;

  // Looks for one of the known mode names inside the given text and, if
  // found, stores the matching mode. Returns whether a name was recognized.
  auto selectMode = [&mode] (const auto& text) {
    if (text.find("max-frame-latency") != std::string::npos) {
      mode = FramePacerMode::MAX_FRAME_LATENCY;
      return true;
    }

    if (text.find("low-latency") != std::string::npos) {
      mode = FramePacerMode::LOW_LATENCY;
      return true;
    }

    if (text.find("min-latency") != std::string::npos) {
      mode = FramePacerMode::MIN_LATENCY;
      return true;
    }

    return false;
  };

  // the environment variable wins; the config option is only consulted
  // when the environment variable names no known mode
  const std::string envValue = env::getEnvVar("DXVK_FRAME_PACE");

  if (!selectMode(envValue))
    selectMode(options.framePace);

  switch (mode) {
    case FramePacerMode::MAX_FRAME_LATENCY:
      Logger::info( "Frame pace: max-frame-latency" );
      m_mode = std::make_unique<FramePacerMode>(mode, &m_latencyMarkersStorage);
      break;

    case FramePacerMode::LOW_LATENCY:
      Logger::info( "Frame pace: low-latency" );
      m_mode = std::make_unique<LowLatencyMode>(mode, &m_latencyMarkersStorage, options);
      break;

    case FramePacerMode::MIN_LATENCY:
      Logger::info( "Frame pace: min-latency" );
      m_mode = std::make_unique<MinLatencyMode>(mode, &m_latencyMarkersStorage);
      break;
  }

  // no gpu-start event has been observed for any slot yet
  for (auto& slot : m_gpuStarts)
    slot.store(0);

  // be consistent that every frame has a gpuReady event from the previous frame
  LatencyMarkers* firstFrame = m_latencyMarkersStorage.getMarkers(DXGI_MAX_SWAP_CHAIN_BUFFERS+1);
  firstFrame->gpuReady.push_back(high_resolution_clock::now());
}
|
||||
|
||||
|
||||
// Nothing to release by hand; members clean up after themselves.
FramePacer::~FramePacer() = default;
|
||||
|
||||
}
|
191
src/dxvk/framepacer/dxvk_framepacer.h
Normal file
191
src/dxvk/framepacer/dxvk_framepacer.h
Normal file
|
@ -0,0 +1,191 @@
|
|||
#pragma once
|
||||
|
||||
#include "dxvk_framepacer_mode.h"
|
||||
#include "dxvk_latency_markers.h"
|
||||
#include "../dxvk_latency.h"
|
||||
#include "../../util/util_time.h"
|
||||
#include <dxgi.h>
|
||||
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
struct DxvkOptions;
|
||||
|
||||
/* \brief Frame pacer interface managing the CPU - GPU synchronization.
|
||||
*
|
||||
* GPUs render frames asynchronously to the game's and dxvk's CPU-side work
|
||||
* in order to improve fps-throughput. Aligning the cpu work to chosen time-
|
||||
* points allows to tune certain characteristics of the video presentation,
|
||||
* like smoothness and latency.
|
||||
*/
|
||||
|
||||
class FramePacer : public DxvkLatencyTracker {
|
||||
using microseconds = std::chrono::microseconds;
|
||||
public:
|
||||
|
||||
FramePacer( const DxvkOptions& options );
|
||||
~FramePacer();
|
||||
|
||||
void sleepAndBeginFrame(
|
||||
uint64_t frameId,
|
||||
double maxFrameRate) override {
|
||||
// wait for finished rendering of a previous frame, typically the one before last
|
||||
m_mode->waitRenderFinished(frameId);
|
||||
// potentially wait some more if the cpu gets too much ahead
|
||||
m_mode->startFrame(frameId);
|
||||
m_latencyMarkersStorage.registerFrameStart(frameId);
|
||||
m_gpuStarts[ frameId % m_gpuStarts.size() ].store(0);
|
||||
}
|
||||
|
||||
void notifyGpuPresentEnd( uint64_t frameId ) override {
|
||||
// the frame has been displayed to the screen
|
||||
m_latencyMarkersStorage.registerFrameEnd(frameId);
|
||||
m_mode->endFrame(frameId);
|
||||
}
|
||||
|
||||
void notifyCsRenderBegin( uint64_t frameId ) override {
|
||||
auto now = high_resolution_clock::now();
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
m->csStart = std::chrono::duration_cast<microseconds>(now - m->start).count();
|
||||
}
|
||||
|
||||
void notifyCsRenderEnd( uint64_t frameId ) override {
|
||||
auto now = high_resolution_clock::now();
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
m->csFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
|
||||
m_mode->signalCsFinished( frameId );
|
||||
}
|
||||
|
||||
void notifySubmit() override {
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastSubmitFrameId+1);
|
||||
m->gpuSubmit.push_back(high_resolution_clock::now());
|
||||
}
|
||||
|
||||
void notifyPresent( uint64_t frameId ) override {
|
||||
// dx to vk translation is finished
|
||||
if (frameId != 0) {
|
||||
auto now = high_resolution_clock::now();
|
||||
m_lastSubmitFrameId = frameId;
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
|
||||
m->gpuSubmit.push_back(now);
|
||||
m->cpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
|
||||
next->gpuSubmit.clear();
|
||||
|
||||
m_latencyMarkersStorage.m_timeline.cpuFinished.store(frameId);
|
||||
}
|
||||
}
|
||||
|
||||
void notifyQueueSubmit( uint64_t frameId ) override {
|
||||
assert( frameId == m_lastQueueSubmitFrameId + 1 );
|
||||
auto now = high_resolution_clock::now();
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
m->gpuQueueSubmit.push_back(now);
|
||||
queueSubmitCheckGpuStart(frameId, m, now);
|
||||
}
|
||||
|
||||
void notifyQueuePresentBegin( uint64_t frameId ) override {
|
||||
if (frameId != 0) {
|
||||
auto now = high_resolution_clock::now();
|
||||
m_lastQueueSubmitFrameId = frameId;
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
|
||||
m->gpuQueueSubmit.push_back(now);
|
||||
next->gpuQueueSubmit.clear();
|
||||
queueSubmitCheckGpuStart(frameId, m, now);
|
||||
}
|
||||
}
|
||||
|
||||
void notifyGpuExecutionBegin( uint64_t frameId ) override {
|
||||
assert( frameId == m_lastFinishedFrameId+1 );
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
|
||||
gpuExecutionCheckGpuStart(frameId, m, high_resolution_clock::now());
|
||||
}
|
||||
|
||||
void notifyGpuExecutionEnd( uint64_t frameId ) override {
|
||||
auto now = high_resolution_clock::now();
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(m_lastFinishedFrameId+1);
|
||||
m->gpuReady.push_back(now);
|
||||
}
|
||||
|
||||
virtual void notifyGpuPresentBegin( uint64_t frameId ) override {
|
||||
// we get frameId == 0 for repeated presents (SyncInterval)
|
||||
if (frameId != 0) {
|
||||
m_lastFinishedFrameId = frameId;
|
||||
auto now = high_resolution_clock::now();
|
||||
|
||||
LatencyMarkers* m = m_latencyMarkersStorage.getMarkers(frameId);
|
||||
LatencyMarkers* next = m_latencyMarkersStorage.getMarkers(frameId+1);
|
||||
m->gpuReady.push_back(now);
|
||||
m->gpuFinished = std::chrono::duration_cast<microseconds>(now - m->start).count();
|
||||
next->gpuReady.clear();
|
||||
next->gpuReady.push_back(now);
|
||||
|
||||
gpuExecutionCheckGpuStart(frameId, m, now);
|
||||
|
||||
m_latencyMarkersStorage.m_timeline.gpuFinished.store(frameId);
|
||||
m_mode->finishRender(frameId);
|
||||
m_mode->signalRenderFinished(frameId);
|
||||
}
|
||||
}
|
||||
|
||||
FramePacerMode::Mode getMode() const {
|
||||
return m_mode->m_mode;
|
||||
}
|
||||
|
||||
void setTargetFrameRate( double frameRate ) {
|
||||
m_mode->setTargetFrameRate(frameRate);
|
||||
}
|
||||
|
||||
bool needsAutoMarkers() override {
|
||||
return true;
|
||||
}
|
||||
|
||||
LatencyMarkersStorage m_latencyMarkersStorage;
|
||||
|
||||
|
||||
// not implemented methods
|
||||
|
||||
|
||||
void notifyCpuPresentBegin( uint64_t frameId) override { }
|
||||
void notifyCpuPresentEnd( uint64_t frameId ) override { }
|
||||
void notifyQueuePresentEnd( uint64_t frameId, VkResult status) override { }
|
||||
void discardTimings() override { }
|
||||
DxvkLatencyStats getStatistics( uint64_t frameId ) override
|
||||
{ return DxvkLatencyStats(); }
|
||||
|
||||
private:
|
||||
|
||||
void signalGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
|
||||
m->gpuStart = std::chrono::duration_cast<microseconds>(t - m->start).count();
|
||||
m_latencyMarkersStorage.m_timeline.gpuStart.store(frameId);
|
||||
m_mode->signalGpuStart(frameId);
|
||||
}
|
||||
|
||||
void queueSubmitCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
|
||||
auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
|
||||
uint16_t val = gpuStart.fetch_or(queueSubmitBit);
|
||||
if (val == gpuReadyBit)
|
||||
signalGpuStart( frameId, m, t );
|
||||
}
|
||||
|
||||
void gpuExecutionCheckGpuStart( uint64_t frameId, LatencyMarkers* m, const high_resolution_clock::time_point& t ) {
|
||||
auto& gpuStart = m_gpuStarts[ frameId % m_gpuStarts.size() ];
|
||||
uint16_t val = gpuStart.fetch_or(gpuReadyBit);
|
||||
if (val == queueSubmitBit)
|
||||
signalGpuStart( frameId, m, t );
|
||||
}
|
||||
|
||||
std::unique_ptr<FramePacerMode> m_mode;
|
||||
|
||||
uint64_t m_lastSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
uint64_t m_lastQueueSubmitFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
uint64_t m_lastFinishedFrameId = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
|
||||
std::array< std::atomic< uint16_t >, 16 > m_gpuStarts = { };
|
||||
static constexpr uint16_t queueSubmitBit = 1;
|
||||
static constexpr uint16_t gpuReadyBit = 2;
|
||||
|
||||
};
|
||||
|
||||
}
|
117
src/dxvk/framepacer/dxvk_framepacer_mode.h
Normal file
117
src/dxvk/framepacer/dxvk_framepacer_mode.h
Normal file
|
@ -0,0 +1,117 @@
|
|||
#pragma once
|
||||
|
||||
#include "dxvk_latency_markers.h"
|
||||
#include "../../util/sync/sync_signal.h"
|
||||
#include "../../util/util_env.h"
|
||||
#include <dxgi.h>
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
/*
|
||||
* /brief Abstract frame pacer mode in order to support different strategies of synchronization.
|
||||
*/
|
||||
|
||||
class FramePacerMode {
|
||||
|
||||
public:
|
||||
|
||||
enum Mode {
|
||||
MAX_FRAME_LATENCY = 0,
|
||||
LOW_LATENCY,
|
||||
MIN_LATENCY
|
||||
};
|
||||
|
||||
FramePacerMode( Mode mode, LatencyMarkersStorage* markerStorage, uint32_t maxFrameLatency=1 )
|
||||
: m_mode( mode ),
|
||||
m_waitLatency( maxFrameLatency+1 ),
|
||||
m_latencyMarkersStorage( markerStorage ) {
|
||||
setFpsLimitFrametimeFromEnv();
|
||||
}
|
||||
|
||||
virtual ~FramePacerMode() { }
|
||||
|
||||
virtual void startFrame( uint64_t frameId ) { }
|
||||
virtual void endFrame( uint64_t frameId ) { }
|
||||
|
||||
virtual void finishRender( uint64_t frameId ) { }
|
||||
|
||||
void waitRenderFinished( uint64_t frameId ) {
|
||||
if (m_mode) m_fenceGpuFinished.wait(frameId-m_waitLatency); }
|
||||
|
||||
void signalRenderFinished( uint64_t frameId ) {
|
||||
if (m_mode) m_fenceGpuFinished.signal(frameId); }
|
||||
|
||||
void signalGpuStart( uint64_t frameId ) {
|
||||
if (m_mode) m_fenceGpuStart.signal(frameId); }
|
||||
|
||||
void signalCsFinished( uint64_t frameId ) {
|
||||
if (m_mode) m_fenceCsFinished.signal(frameId); }
|
||||
|
||||
void setTargetFrameRate( double frameRate ) {
|
||||
if (!m_fpsLimitEnvOverride && frameRate > 1.0)
|
||||
m_fpsLimitFrametime.store( 1'000'000/frameRate );
|
||||
}
|
||||
|
||||
const Mode m_mode;
|
||||
|
||||
static bool getDoubleFromEnv( const char* name, double* result );
|
||||
static bool getIntFromEnv( const char* name, int* result );
|
||||
|
||||
protected:
|
||||
|
||||
void setFpsLimitFrametimeFromEnv();
|
||||
|
||||
const uint32_t m_waitLatency;
|
||||
LatencyMarkersStorage* m_latencyMarkersStorage;
|
||||
std::atomic<int32_t> m_fpsLimitFrametime = { 0 };
|
||||
bool m_fpsLimitEnvOverride = { false };
|
||||
|
||||
sync::Fence m_fenceGpuStart = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
|
||||
sync::Fence m_fenceGpuFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS) };
|
||||
sync::Fence m_fenceCsFinished = { sync::Fence(DXGI_MAX_SWAP_CHAIN_BUFFERS+50) };
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
inline bool FramePacerMode::getDoubleFromEnv( const char* name, double* result ) {
|
||||
std::string env = env::getEnvVar(name);
|
||||
if (env.empty())
|
||||
return false;
|
||||
|
||||
try {
|
||||
*result = std::stod(env);
|
||||
return true;
|
||||
} catch (const std::invalid_argument&) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline bool FramePacerMode::getIntFromEnv( const char* name, int* result ) {
|
||||
std::string env = env::getEnvVar(name);
|
||||
if (env.empty())
|
||||
return false;
|
||||
|
||||
try {
|
||||
*result = std::stoi(env);
|
||||
return true;
|
||||
} catch (const std::invalid_argument&) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
inline void FramePacerMode::setFpsLimitFrametimeFromEnv() {
|
||||
double fpsLimit;
|
||||
if (!getDoubleFromEnv("DXVK_FRAME_RATE", &fpsLimit))
|
||||
return;
|
||||
|
||||
m_fpsLimitEnvOverride = true;
|
||||
if (fpsLimit < 1.0)
|
||||
return;
|
||||
|
||||
m_fpsLimitFrametime = 1'000'000/fpsLimit;
|
||||
}
|
||||
|
||||
}
|
43
src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
Normal file
43
src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.cpp
Normal file
|
@ -0,0 +1,43 @@
|
|||
#include "dxvk_framepacer_mode_low_latency.h"
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
|
||||
bool getLowLatencyOffsetFromEnv( int32_t& offset ) {
|
||||
if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_OFFSET", &offset))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
bool getLowLatencyAllowCpuFramesOverlapFromEnv( bool& allowOverlap ) {
|
||||
int32_t o;
|
||||
if (!FramePacerMode::getIntFromEnv("DXVK_LOW_LATENCY_ALLOW_CPU_FRAMES_OVERLAP", &o))
|
||||
return false;
|
||||
allowOverlap = (bool) o;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
int32_t LowLatencyMode::getLowLatencyOffset( const DxvkOptions& options ) {
|
||||
int32_t offset = options.lowLatencyOffset;
|
||||
int32_t o;
|
||||
if (getLowLatencyOffsetFromEnv(o))
|
||||
offset = o;
|
||||
|
||||
offset = std::max( -10000, offset );
|
||||
offset = std::min( 10000, offset );
|
||||
return offset;
|
||||
}
|
||||
|
||||
|
||||
bool LowLatencyMode::getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options ) {
|
||||
bool allowOverlap = options.lowLatencyAllowCpuFramesOverlap;
|
||||
bool o;
|
||||
if (getLowLatencyAllowCpuFramesOverlapFromEnv(o))
|
||||
allowOverlap = o;
|
||||
return allowOverlap;
|
||||
}
|
||||
|
||||
|
||||
}
|
255
src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
Normal file
255
src/dxvk/framepacer/dxvk_framepacer_mode_low_latency.h
Normal file
|
@ -0,0 +1,255 @@
|
|||
#pragma once
|
||||
|
||||
#include "dxvk_framepacer_mode.h"
|
||||
#include "../dxvk_options.h"
|
||||
#include "../../util/log/log.h"
|
||||
#include "../../util/util_string.h"
|
||||
#include <assert.h>
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
/*
|
||||
* This low-latency mode aims to reduce latency with minimal impact in fps.
|
||||
* Effective when operating in the GPU-limit. Efficient to be used in the CPU-limit as well.
|
||||
*
|
||||
* Greatly reduces input lag variations when switching between CPU- and GPU-limit, and
|
||||
* compared to the max-frame-latency approach, it has a much more stable input lag when
|
||||
* GPU running times change dramatically, which can happen for example when rotating within a scene.
|
||||
*
|
||||
* The current implementation rather generates fluctuations alternating frame-by-frame
|
||||
* depending on the game's and dxvk's CPU-time variations. This might be visible as a loss
|
||||
* in smoothness, which is an area this implementation can be further improved. Unsuitable
|
||||
* smoothing however might degrade input-lag feel, so it's not implemented for now, but
|
||||
* more advanced smoothing techniques will be investigated in the future.
|
||||
* In some situations however, this low-latency pacing actually improves smoothing though,
|
||||
* it will depend on the game.
|
||||
*
|
||||
* An interesting observation while playtesting was that not only the input lag was affected,
|
||||
* but the video generated did progress more cleanly in time as well with regards to
|
||||
* medium-term time consistency, in other words, the video playback speed remained more steady.
|
||||
*
|
||||
* Optimized for VRR and VK_PRESENT_MODE_IMMEDIATE_KHR. It also comes with its own fps-limiter
|
||||
* which is typically used to prevent the game's fps exceeding the monitor's refresh rate,
|
||||
* and which is tightly integrated into the pacing logic.
|
||||
*
|
||||
* Can be fine-tuned via the dxvk.lowLatencyOffset and dxvk.lowLatencyAllowCpuFramesOverlap
|
||||
* variables (or their respective environment variables)
|
||||
* Compared to maxFrameLatency = 3, render-latency reductions of up to 67% are achieved.
|
||||
*/
|
||||
|
||||
class LowLatencyMode : public FramePacerMode {
|
||||
using microseconds = std::chrono::microseconds;
|
||||
using time_point = high_resolution_clock::time_point;
|
||||
public:
|
||||
|
||||
LowLatencyMode(Mode mode, LatencyMarkersStorage* storage, const DxvkOptions& options)
|
||||
: FramePacerMode(mode, storage),
|
||||
m_lowLatencyOffset(getLowLatencyOffset(options)),
|
||||
m_allowCpuFramesOverlap(getLowLatencyAllowCpuFramesOverlap(options)) {
|
||||
Logger::info( str::format("Using lowLatencyOffset: ", m_lowLatencyOffset) );
|
||||
Logger::info( str::format("Using lowLatencyAllowCpuFramesOverlap: ", m_allowCpuFramesOverlap) );
|
||||
}
|
||||
|
||||
~LowLatencyMode() {}
|
||||
|
||||
|
||||
void startFrame( uint64_t frameId ) override {
|
||||
using std::chrono::duration_cast;
|
||||
|
||||
if (!m_allowCpuFramesOverlap)
|
||||
m_fenceCsFinished.wait( frameId-1 );
|
||||
|
||||
m_fenceGpuStart.wait( frameId-1 );
|
||||
|
||||
time_point now = high_resolution_clock::now();
|
||||
uint64_t finishedId = m_latencyMarkersStorage->getTimeline()->gpuFinished.load();
|
||||
if (finishedId <= DXGI_MAX_SWAP_CHAIN_BUFFERS+1ull)
|
||||
return;
|
||||
|
||||
if (finishedId == frameId-1) {
|
||||
// we are the only in-flight frame, nothing to do other then to apply fps-limiter if needed
|
||||
m_lastStart = sleepFor( now, 0 );
|
||||
return;
|
||||
}
|
||||
|
||||
if (finishedId != frameId-2) {
|
||||
Logger::err( str::format("internal error during low-latency frame pacing: expected finished frameId=",
|
||||
frameId-2, ", got: ", finishedId) );
|
||||
}
|
||||
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-1);
|
||||
|
||||
// estimate the target gpu sync point for this frame
|
||||
// and calculate backwards when we want to start this frame
|
||||
|
||||
const SyncProps props = getSyncPrediction();
|
||||
int32_t gpuReadyPrediction = duration_cast<microseconds>(
|
||||
m->start + microseconds(m->gpuStart+getGpuStartToFinishPrediction()) - now).count();
|
||||
|
||||
int32_t targetGpuSync = gpuReadyPrediction + props.gpuSync;
|
||||
int32_t delay = targetGpuSync - props.cpuUntilGpuSync + m_lowLatencyOffset;
|
||||
|
||||
m_lastStart = sleepFor( now, delay );
|
||||
|
||||
}
|
||||
|
||||
|
||||
void finishRender( uint64_t frameId ) override {
|
||||
|
||||
using std::chrono::duration_cast;
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
|
||||
|
||||
int32_t numLoop = (int32_t)(m->gpuReady.size())-1;
|
||||
if (numLoop <= 1) {
|
||||
m_props[frameId % m_props.size()] = SyncProps();
|
||||
m_props[frameId % m_props.size()].isOutlier = true;
|
||||
m_propsFinished.store( frameId );
|
||||
return;
|
||||
}
|
||||
|
||||
// estimates the optimal overlap for cpu/gpu work by optimizing gpu scheduling first
|
||||
// such that the gpu doesn't go into idle for this frame, and then aligning cpu submits
|
||||
// where gpuSubmit[i] <= gpuRun[i] for all i
|
||||
|
||||
std::vector<int32_t>& gpuRun = m_tempGpuRun;
|
||||
std::vector<int32_t>& gpuRunDurations = m_tempGpuRunDurations;
|
||||
gpuRun.clear();
|
||||
gpuRunDurations.clear();
|
||||
int32_t optimizedGpuTime = 0;
|
||||
gpuRun.push_back(optimizedGpuTime);
|
||||
|
||||
for (int i=0; i<numLoop; ++i) {
|
||||
time_point _gpuRun = std::max( m->gpuReady[i], m->gpuQueueSubmit[i] );
|
||||
int32_t duration = duration_cast<microseconds>( m->gpuReady[i+1] - _gpuRun ).count();
|
||||
optimizedGpuTime += duration;
|
||||
gpuRun.push_back(optimizedGpuTime);
|
||||
gpuRunDurations.push_back(duration);
|
||||
}
|
||||
|
||||
int32_t alignment = duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->gpuSubmit[0] ).count()
|
||||
- gpuRun[numLoop-1];
|
||||
|
||||
int32_t offset = 0;
|
||||
for (int i=numLoop-2; i>=0; --i) {
|
||||
int32_t curSubmit = duration_cast<microseconds>( m->gpuSubmit[i] - m->gpuSubmit[0] ).count();
|
||||
int32_t diff = curSubmit - gpuRun[i] - alignment;
|
||||
diff = std::max( 0, diff );
|
||||
offset += diff;
|
||||
alignment += diff;
|
||||
}
|
||||
|
||||
|
||||
SyncProps& props = m_props[frameId % m_props.size()];
|
||||
props.gpuSync = gpuRun[numLoop-1];
|
||||
props.cpuUntilGpuSync = offset + duration_cast<microseconds>( m->gpuSubmit[numLoop-1] - m->start ).count();
|
||||
props.optimizedGpuTime = optimizedGpuTime;
|
||||
props.isOutlier = isOutlier(frameId);
|
||||
|
||||
m_propsFinished.store( frameId );
|
||||
|
||||
}
|
||||
|
||||
|
||||
Sleep::TimePoint sleepFor( const Sleep::TimePoint t, int32_t delay ) {
|
||||
|
||||
// account for the fps limit and ensure we won't sleep too long, just in case
|
||||
int32_t frametime = std::chrono::duration_cast<microseconds>( t - m_lastStart ).count();
|
||||
int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
|
||||
delay = std::max( delay, frametimeDiff );
|
||||
delay = std::max( 0, std::min( delay, 20000 ) );
|
||||
|
||||
Sleep::TimePoint nextStart = t + microseconds(delay);
|
||||
Sleep::sleepUntil( t, nextStart );
|
||||
return nextStart;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
|
||||
struct SyncProps {
|
||||
int32_t optimizedGpuTime; // gpu executing packed submits in one go
|
||||
int32_t gpuSync; // us after gpuStart
|
||||
int32_t cpuUntilGpuSync;
|
||||
bool isOutlier;
|
||||
};
|
||||
|
||||
|
||||
SyncProps getSyncPrediction() {
|
||||
// in the future we might use more samples to get a prediction
|
||||
// however, simple averaging gives a slightly artificial mouse input
|
||||
// more advanced methods will be investigated
|
||||
SyncProps res = {};
|
||||
uint64_t id = m_propsFinished;
|
||||
if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
|
||||
return res;
|
||||
|
||||
for (size_t i=0; i<7; ++i) {
|
||||
const SyncProps& props = m_props[ (id-i) % m_props.size() ];
|
||||
if (!props.isOutlier) {
|
||||
id = id-i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return m_props[ id % m_props.size() ];
|
||||
};
|
||||
|
||||
|
||||
int32_t getGpuStartToFinishPrediction() {
|
||||
uint64_t id = m_propsFinished;
|
||||
if (id < DXGI_MAX_SWAP_CHAIN_BUFFERS+7)
|
||||
return 0;
|
||||
|
||||
for (size_t i=0; i<7; ++i) {
|
||||
const SyncProps& props = m_props[ (id-i) % m_props.size() ];
|
||||
if (!props.isOutlier) {
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id-i);
|
||||
if (m->gpuReady.empty() || m->gpuSubmit.empty())
|
||||
return m->gpuFinished - m->gpuStart;
|
||||
|
||||
time_point t = std::max( m->gpuReady[0], m->gpuSubmit[0] );
|
||||
return std::chrono::duration_cast<microseconds>( t - m->start ).count()
|
||||
+ props.optimizedGpuTime
|
||||
- m->gpuStart;
|
||||
}
|
||||
}
|
||||
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(id);
|
||||
return m->gpuFinished - m->gpuStart;
|
||||
};
|
||||
|
||||
|
||||
bool isOutlier( uint64_t frameId ) {
|
||||
constexpr size_t numLoop = 7;
|
||||
int32_t totalCpuTime = 0;
|
||||
for (size_t i=0; i<numLoop; ++i) {
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId-i);
|
||||
totalCpuTime += m->cpuFinished;
|
||||
}
|
||||
|
||||
int32_t avgCpuTime = totalCpuTime / numLoop;
|
||||
const LatencyMarkers* m = m_latencyMarkersStorage->getConstMarkers(frameId);
|
||||
if (m->cpuFinished > 1.7*avgCpuTime || m->gpuSubmit.empty() || m->gpuReady.size() != (m->gpuSubmit.size()+1) )
|
||||
return true;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
int32_t getLowLatencyOffset( const DxvkOptions& options );
|
||||
bool getLowLatencyAllowCpuFramesOverlap( const DxvkOptions& options );
|
||||
|
||||
const int32_t m_lowLatencyOffset;
|
||||
const bool m_allowCpuFramesOverlap;
|
||||
|
||||
Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
|
||||
std::array<SyncProps, 16> m_props;
|
||||
std::atomic<uint64_t> m_propsFinished = { 0 };
|
||||
|
||||
std::vector<int32_t> m_tempGpuRun;
|
||||
std::vector<int32_t> m_tempGpuRunDurations;
|
||||
|
||||
};
|
||||
|
||||
}
|
45
src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
Normal file
45
src/dxvk/framepacer/dxvk_framepacer_mode_min_latency.h
Normal file
|
@ -0,0 +1,45 @@
|
|||
#pragma once
|
||||
|
||||
#include "dxvk_framepacer_mode.h"
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
/*
|
||||
* Minimal latency is achieved here by waiting for the previous
|
||||
* frame to complete, which results in very much reduced fps.
|
||||
* Generally not recommended, but helpful to get insights to fine-tune
|
||||
* the low-latency mode, and possibly is useful for running games
|
||||
* in the cpu limit.
|
||||
*/
|
||||
|
||||
class MinLatencyMode : public FramePacerMode {
|
||||
|
||||
public:
|
||||
|
||||
MinLatencyMode(Mode mode, LatencyMarkersStorage* storage)
|
||||
: FramePacerMode(mode, storage, 0) {}
|
||||
|
||||
~MinLatencyMode() {}
|
||||
|
||||
void startFrame( uint64_t frameId ) override {
|
||||
|
||||
Sleep::TimePoint now = high_resolution_clock::now();
|
||||
int32_t frametime = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||
now - m_lastStart ).count();
|
||||
int32_t frametimeDiff = std::max( 0, m_fpsLimitFrametime.load() - frametime );
|
||||
int32_t delay = std::max( 0, frametimeDiff );
|
||||
delay = std::min( delay, 20000 );
|
||||
|
||||
Sleep::TimePoint nextStart = now + std::chrono::microseconds(delay);
|
||||
Sleep::sleepUntil( now, nextStart );
|
||||
m_lastStart = nextStart;
|
||||
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
Sleep::TimePoint m_lastStart = { high_resolution_clock::now() };
|
||||
|
||||
};
|
||||
|
||||
}
|
148
src/dxvk/framepacer/dxvk_latency_markers.h
Normal file
148
src/dxvk/framepacer/dxvk_latency_markers.h
Normal file
|
@ -0,0 +1,148 @@
|
|||
#pragma once
|
||||
|
||||
#include <atomic>
|
||||
#include <dxgi.h>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <assert.h>
|
||||
#include "../../util/util_sleep.h"
|
||||
#include "../../util/log/log.h"
|
||||
#include "../../util/util_string.h"
|
||||
|
||||
|
||||
namespace dxvk {
|
||||
|
||||
class FramePacer;
|
||||
class LatencyMarkersStorage;
|
||||
|
||||
|
||||
struct LatencyMarkers {
|
||||
|
||||
using time_point = high_resolution_clock::time_point;
|
||||
|
||||
time_point start;
|
||||
time_point end;
|
||||
|
||||
int32_t csStart;
|
||||
int32_t csFinished;
|
||||
int32_t cpuFinished;
|
||||
int32_t gpuStart;
|
||||
int32_t gpuFinished;
|
||||
int32_t presentFinished;
|
||||
|
||||
std::vector<time_point> gpuReady;
|
||||
std::vector<time_point> gpuSubmit;
|
||||
std::vector<time_point> gpuQueueSubmit;
|
||||
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
* stores which information is accessible for which frame
|
||||
*/
|
||||
struct LatencyMarkersTimeline {
|
||||
|
||||
std::atomic<uint64_t> cpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
std::atomic<uint64_t> gpuStart = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
std::atomic<uint64_t> gpuFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
std::atomic<uint64_t> frameFinished = { DXGI_MAX_SWAP_CHAIN_BUFFERS };
|
||||
|
||||
};
|
||||
|
||||
|
||||
class LatencyMarkersReader {
|
||||
|
||||
public:
|
||||
|
||||
LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries );
|
||||
bool getNext( const LatencyMarkers*& result );
|
||||
|
||||
private:
|
||||
|
||||
const LatencyMarkersStorage* m_storage;
|
||||
uint64_t m_index;
|
||||
|
||||
};
|
||||
|
||||
|
||||
class LatencyMarkersStorage {
|
||||
friend class LatencyMarkersReader;
|
||||
friend class FramePacer;
|
||||
public:
|
||||
|
||||
LatencyMarkersStorage() { }
|
||||
~LatencyMarkersStorage() { }
|
||||
|
||||
LatencyMarkersReader getReader( uint32_t numEntries ) const {
|
||||
return LatencyMarkersReader(this, numEntries);
|
||||
}
|
||||
|
||||
void registerFrameStart( uint64_t frameId ) {
|
||||
if (frameId <= m_timeline.frameFinished.load()) {
|
||||
Logger::warn( str::format("internal error during registerFrameStart: expected frameId=",
|
||||
m_timeline.frameFinished.load()+1, ", got: ", frameId) );
|
||||
}
|
||||
auto now = high_resolution_clock::now();
|
||||
|
||||
LatencyMarkers* markers = getMarkers(frameId);
|
||||
markers->start = now;
|
||||
}
|
||||
|
||||
void registerFrameEnd( uint64_t frameId ) {
|
||||
if (frameId <= m_timeline.frameFinished.load()) {
|
||||
Logger::warn( str::format("internal error during registerFrameEnd: expected frameId=",
|
||||
m_timeline.frameFinished.load()+1, ", got: ", frameId) );
|
||||
}
|
||||
auto now = high_resolution_clock::now();
|
||||
|
||||
LatencyMarkers* markers = getMarkers(frameId);
|
||||
markers->presentFinished = std::chrono::duration_cast<std::chrono::microseconds>(
|
||||
now - markers->start).count();
|
||||
markers->end = now;
|
||||
|
||||
m_timeline.frameFinished.store(frameId);
|
||||
}
|
||||
|
||||
const LatencyMarkersTimeline* getTimeline() const {
|
||||
return &m_timeline;
|
||||
}
|
||||
|
||||
const LatencyMarkers* getConstMarkers( uint64_t frameId ) const {
|
||||
return &m_markers[frameId % m_numMarkers];
|
||||
}
|
||||
|
||||
|
||||
private:
|
||||
|
||||
LatencyMarkers* getMarkers( uint64_t frameId ) {
|
||||
return &m_markers[frameId % m_numMarkers];
|
||||
}
|
||||
|
||||
// simple modulo hash mapping is used for frameIds. They are expected to monotonically increase by one.
|
||||
// select the size large enough, so we never come into a situation where the reader cannot keep up with the producer
|
||||
static constexpr uint16_t m_numMarkers = 128;
|
||||
std::array<LatencyMarkers, m_numMarkers> m_markers = { };
|
||||
LatencyMarkersTimeline m_timeline;
|
||||
|
||||
};
|
||||
|
||||
|
||||
|
||||
// Positions the reader numEntries frames before the latest finished one.
// While too few frames have finished, the reader stays at index 0, which
// getNext treats as "nothing to read".
inline LatencyMarkersReader::LatencyMarkersReader( const LatencyMarkersStorage* storage, uint32_t numEntries )
: m_storage ( storage ),
  m_index   ( 0 ) {
  const auto& finished = m_storage->m_timeline.frameFinished;

  if (finished.load() > numEntries + DXGI_MAX_SWAP_CHAIN_BUFFERS + 2)
    m_index = finished.load() - numEntries;
}
|
||||
|
||||
|
||||
// Hands out the markers at the current index and advances by one frame.
// Returns false, leaving result untouched, if the reader was never
// positioned or has moved past the latest finished frame.
inline bool LatencyMarkersReader::getNext( const LatencyMarkers*& result ) {
  const bool exhausted = m_index == 0
    || m_index > m_storage->m_timeline.frameFinished.load();

  if (exhausted)
    return false;

  result = &m_storage->m_markers[m_index++ % m_storage->m_numMarkers];
  return true;
}
|
||||
|
||||
}
|
|
@ -120,6 +120,9 @@ dxvk_src = [
|
|||
'hud/dxvk_hud_font.cpp',
|
||||
'hud/dxvk_hud_item.cpp',
|
||||
'hud/dxvk_hud_renderer.cpp',
|
||||
|
||||
'framepacer/dxvk_framepacer.cpp',
|
||||
'framepacer/dxvk_framepacer_mode_low_latency.cpp',
|
||||
]
|
||||
|
||||
if platform == 'windows'
|
||||
|
|
Loading…
Add table
Reference in a new issue