From 559ee799e78278cde40e0c9bada921fac7fb3092 Mon Sep 17 00:00:00 2001
From: Philip Rebohle <philip.rebohle@tu-dortmund.de>
Date: Fri, 21 Feb 2025 13:48:03 +0100
Subject: [PATCH] [dxvk,d3d11] Refactor CS command data allocation

Allows us to allocate a (potentially growing) array of
arbitrary data structures for a CS command.
---
 src/d3d11/d3d11_cmd.h       |  18 ++---
 src/d3d11/d3d11_context.cpp |  83 +++++++++++-----------
 src/d3d11/d3d11_context.h   |  29 ++++----
 src/dxvk/dxvk_cs.h          | 135 ++++++++++++++++++++++++++++--------
 4 files changed, 169 insertions(+), 96 deletions(-)

diff --git a/src/d3d11/d3d11_cmd.h b/src/d3d11/d3d11_cmd.h
index 52f2f4dbc..635f63b72 100644
--- a/src/d3d11/d3d11_cmd.h
+++ b/src/d3d11/d3d11_cmd.h
@@ -10,23 +10,13 @@ namespace dxvk {
    * Used to identify the type of command
    * data most recently added to a CS chunk.
    */
-  enum class D3D11CmdType {
+  enum class D3D11CmdType : uint32_t {
+    None,
     DrawIndirect,
     DrawIndirectIndexed,
   };
 
 
-  /**
-   * \brief Command data header
-   * 
-   * Stores the command type. All command
-   * data structs must inherit this struct.
-   */
-  struct D3D11CmdData {
-    D3D11CmdType        type;
-  };
-
-
   /**
    * \brief Indirect draw command data
    * 
@@ -34,10 +24,10 @@ namespace dxvk {
    * the first draw, as well as the number of
    * draws to execute.
    */
-  struct D3D11CmdDrawIndirectData : public D3D11CmdData {
+  struct D3D11CmdDrawIndirectData {
     uint32_t            offset;
     uint32_t            count;
     uint32_t            stride;
   };
 
-}
\ No newline at end of file
+}
diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp
index ae2edcc49..c99c40178 100644
--- a/src/d3d11/d3d11_context.cpp
+++ b/src/d3d11/d3d11_context.cpp
@@ -19,8 +19,7 @@ namespace dxvk {
     m_flags     (ContextFlags),
     m_staging   (Device, StagingBufferSize),
     m_csFlags   (CsFlags),
-    m_csChunk   (AllocCsChunk()),
-    m_cmdData   (nullptr) {
+    m_csChunk   (AllocCsChunk()) {
     // Create local allocation cache with the same properties
     // that we will use for common dynamic buffer types
     uint32_t cachedDynamic = pParent->GetOptions()->cachedDynamicResources;
@@ -1125,28 +1124,28 @@ namespace dxvk {
     if (unlikely(HasDirtyGraphicsBindings()))
       ApplyDirtyGraphicsBindings();
 
-    // If possible, batch up multiple indirect draw calls of
-    // the same type into one single multiDrawIndirect call
-    auto cmdData = static_cast<D3D11CmdDrawIndirectData*>(m_cmdData);
-    auto stride = 0u;
+    // If possible, batch multiple indirect draw calls into one single multidraw call
+    if (m_csDataType == D3D11CmdType::DrawIndirectIndexed) {
+      auto cmdData = static_cast<D3D11CmdDrawIndirectData*>(m_csData->first());
+      auto stride = GetIndirectCommandStride(cmdData, AlignedByteOffsetForArgs, sizeof(VkDrawIndexedIndirectCommand));
 
-    if (cmdData && cmdData->type == D3D11CmdType::DrawIndirectIndexed)
-      stride = GetIndirectCommandStride(cmdData, AlignedByteOffsetForArgs, sizeof(VkDrawIndexedIndirectCommand));
-
-    if (stride) {
-      cmdData->count += 1;
-      cmdData->stride = stride;
-    } else {
-      cmdData = EmitCsCmd<D3D11CmdDrawIndirectData>(
-        [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data) {
-          ctx->drawIndexedIndirect(data->offset, data->count, data->stride, true);
-        });
-
-      cmdData->type   = D3D11CmdType::DrawIndirectIndexed;
-      cmdData->offset = AlignedByteOffsetForArgs;
-      cmdData->count  = 1;
-      cmdData->stride = 0;
+      if (stride) {
+        cmdData->count += 1;
+        cmdData->stride = stride;
+        return;
+      }
     }
+
+    // Need to start a new draw sequence
+    EmitCsCmd<D3D11CmdDrawIndirectData>(D3D11CmdType::DrawIndirectIndexed, 1u,
+      [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data, size_t) {
+        ctx->drawIndexedIndirect(data->offset, data->count, data->stride, true);
+      });
+
+    auto cmdData = new (m_csData->first()) D3D11CmdDrawIndirectData();
+    cmdData->offset = AlignedByteOffsetForArgs;
+    cmdData->count  = 1;
+    cmdData->stride = 0;
   }
 
 
@@ -1163,28 +1162,28 @@ namespace dxvk {
     if (unlikely(HasDirtyGraphicsBindings()))
       ApplyDirtyGraphicsBindings();
 
-    // If possible, batch up multiple indirect draw calls of
-    // the same type into one single multiDrawIndirect call
-    auto cmdData = static_cast<D3D11CmdDrawIndirectData*>(m_cmdData);
-    auto stride = 0u;
+    // If possible, batch multiple indirect draw calls into one single multidraw call
+    if (m_csDataType == D3D11CmdType::DrawIndirect) {
+      auto cmdData = static_cast<D3D11CmdDrawIndirectData*>(m_csData->first());
+      auto stride = GetIndirectCommandStride(cmdData, AlignedByteOffsetForArgs, sizeof(VkDrawIndirectCommand));
 
-    if (cmdData && cmdData->type == D3D11CmdType::DrawIndirect)
-      stride = GetIndirectCommandStride(cmdData, AlignedByteOffsetForArgs, sizeof(VkDrawIndirectCommand));
-
-    if (stride) {
-      cmdData->count += 1;
-      cmdData->stride = stride;
-    } else {
-      cmdData = EmitCsCmd<D3D11CmdDrawIndirectData>(
-        [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data) {
-          ctx->drawIndirect(data->offset, data->count, data->stride, true);
-        });
-
-      cmdData->type   = D3D11CmdType::DrawIndirect;
-      cmdData->offset = AlignedByteOffsetForArgs;
-      cmdData->count  = 1;
-      cmdData->stride = 0;
+      if (stride) {
+        cmdData->count += 1;
+        cmdData->stride = stride;
+        return;
+      }
     }
+
+    // Need to start a new draw sequence
+    EmitCsCmd<D3D11CmdDrawIndirectData>(D3D11CmdType::DrawIndirect, 1u,
+      [] (DxvkContext* ctx, const D3D11CmdDrawIndirectData* data, size_t) {
+        ctx->drawIndirect(data->offset, data->count, data->stride, true);
+      });
+
+    auto cmdData = new (m_csData->first()) D3D11CmdDrawIndirectData();
+    cmdData->offset = AlignedByteOffsetForArgs;
+    cmdData->count  = 1;
+    cmdData->stride = 0;
   }
 
 
diff --git a/src/d3d11/d3d11_context.h b/src/d3d11/d3d11_context.h
index d4b06e2d8..02dd816ab 100644
--- a/src/d3d11/d3d11_context.h
+++ b/src/d3d11/d3d11_context.h
@@ -793,9 +793,11 @@ namespace dxvk {
 
     DxvkStagingBuffer           m_staging;
 
+    D3D11CmdType                m_csDataType = D3D11CmdType::None;
+
     DxvkCsChunkFlags            m_csFlags;
     DxvkCsChunkRef              m_csChunk;
-    D3D11CmdData*               m_cmdData;
+    DxvkCsDataBlock*            m_csData = nullptr;
 
     DxvkLocalAllocationCache    m_allocationCache;
 
@@ -1152,7 +1154,10 @@ namespace dxvk {
 
     template<bool AllowFlush = true, typename Cmd>
     void EmitCs(Cmd&& command) {
-      m_cmdData = nullptr;
+      if (unlikely(m_csDataType != D3D11CmdType::None)) {
+        m_csData = nullptr;
+        m_csDataType = D3D11CmdType::None;
+      }
 
       if (unlikely(!m_csChunk->push(command))) {
         GetTypedContext()->EmitCsChunk(std::move(m_csChunk));
@@ -1165,12 +1170,12 @@ namespace dxvk {
       }
     }
 
-    template<typename M, bool AllowFlush = true, typename Cmd, typename... Args>
-    M* EmitCsCmd(Cmd&& command, Args&&... args) {
-      M* data = m_csChunk->pushCmd<M, Cmd, Args...>(
-        command, std::forward<Args>(args)...);
+    template<typename M, bool AllowFlush = true, typename Cmd>
+    void EmitCsCmd(D3D11CmdType type, size_t count, Cmd&& command) {
+      m_csDataType = type;
+      m_csData = m_csChunk->pushCmd<M, Cmd>(command, count);
 
-      if (unlikely(!data)) {
+      if (unlikely(!m_csData)) {
         GetTypedContext()->EmitCsChunk(std::move(m_csChunk));
         m_csChunk = AllocCsChunk();
 
@@ -1179,19 +1184,17 @@ namespace dxvk {
 
         // We must record this command after the potential
         // flush since the caller may still access the data
-        data = m_csChunk->pushCmd<M, Cmd, Args...>(
-          command, std::forward<Args>(args)...);
+        m_csData = m_csChunk->pushCmd<M, Cmd>(command, count);
       }
-
-      m_cmdData = data;
-      return data;
     }
 
     void FlushCsChunk() {
       if (likely(!m_csChunk->empty())) {
+        m_csData = nullptr;
+        m_csDataType = D3D11CmdType::None;
+
         GetTypedContext()->EmitCsChunk(std::move(m_csChunk));
         m_csChunk = AllocCsChunk();
-        m_cmdData = nullptr;
       }
     }
 
diff --git a/src/dxvk/dxvk_cs.h b/src/dxvk/dxvk_cs.h
index 48f12b623..1a102e1d0 100644
--- a/src/dxvk/dxvk_cs.h
+++ b/src/dxvk/dxvk_cs.h
@@ -11,7 +11,9 @@
 #include "dxvk_context.h"
 
 namespace dxvk {
-  
+
+  constexpr static size_t DxvkCsChunkSize = 16384;
+
   /**
    * \brief Command stream operation
    * 
@@ -86,6 +88,41 @@ namespace dxvk {
   };
 
 
+  /**
+   * \brief Command data block
+   *
+   * Header describing a potentially growing array of structures
+   * for a command to traverse. Its fields are set by \c DxvkCsChunk.
+   */
+  class DxvkCsDataBlock {
+    friend class DxvkCsChunk;
+  public:
+
+    /**
+     * \brief Queries number of structures in the block
+     * \returns Current structure count
+     */
+    size_t count() const {
+      return m_structCount;
+    }
+
+    /**
+     * \brief Retrieves pointer to first structure
+     * \returns Untyped pointer at \c m_dataOffset bytes past this object
+     */
+    void* first() {
+      return reinterpret_cast<char*>(this) + m_dataOffset;
+    }
+
+  private:
+
+    uint32_t m_dataOffset  = 0u;
+    uint16_t m_structSize  = 0u;
+    uint16_t m_structCount = 0u;
+
+  };
+
+
   /**
    * \brief Typed command with metadata
    * 
@@ -98,26 +135,33 @@ namespace dxvk {
 
   public:
 
-    template<typename... Args>
-    DxvkCsDataCmd(T&& cmd, Args&&... args)
-    : m_command (std::move(cmd)),
-      m_data    (std::forward<Args>(args)...) { }
-    
+    DxvkCsDataCmd(T&& cmd)
+    : m_command(std::move(cmd)) { }
+
+    ~DxvkCsDataCmd() {
+      auto data = reinterpret_cast<M*>(m_data.first());
+
+      for (size_t i = 0; i < m_data.count(); i++)
+        data[i].~M();
+    }
+
     DxvkCsDataCmd             (DxvkCsDataCmd&&) = delete;
     DxvkCsDataCmd& operator = (DxvkCsDataCmd&&) = delete;
 
     void exec(DxvkContext* ctx) {
-      m_command(ctx, &m_data);
+      // No const here so that the function can move objects efficiently
+      m_command(ctx, reinterpret_cast<M*>(m_data.first()), m_data.count());
     }
 
-    M* data() {
+    DxvkCsDataBlock* data() {
       return &m_data;
     }
 
   private:
 
-    T m_command;
-    M m_data;
+    alignas(M)
+    T               m_command;
+    DxvkCsDataBlock m_data;
 
   };
   
@@ -140,12 +184,12 @@ namespace dxvk {
    * Stores a list of commands.
    */
   class DxvkCsChunk : public RcObject {
-    constexpr static size_t MaxBlockSize = 16384;
+
   public:
-    
+
     DxvkCsChunk();
     ~DxvkCsChunk();
-    
+
     /**
      * \brief Checks whether the chunk is empty
      * \returns \c true if the chunk is empty
@@ -167,7 +211,7 @@ namespace dxvk {
     template<typename T>
     bool push(T& command) {
       using FuncType = DxvkCsTypedCmd<T>;
-      void* ptr = alloc<FuncType>();
+      void* ptr = alloc<FuncType>(0u);
 
       if (unlikely(!ptr))
         return false;
@@ -186,23 +230,60 @@ namespace dxvk {
      * \brief Adds a command with data to the chunk 
      * 
      * \param [in] command The command to add
-     * \param [in] args Constructor args for the data object
+     * \param [in] count Number of items to allocate. Should be at least
+     *    1 in order to avoid the possibility of an empty command. Note
+     *    that all allocated structures \e must be initialized before
+     *    handing off the command to the worker thread.
      * \returns Pointer to the data object, or \c nullptr
      */
-    template<typename M, typename T, typename... Args>
-    M* pushCmd(T& command, Args&&... args) {
+    template<typename M, typename T>
+    DxvkCsDataBlock* pushCmd(T& command, size_t count) {
+      size_t dataSize = count * sizeof(M);
+
+      // DxvkCsDataCmd is aligned to M
       using FuncType = DxvkCsDataCmd<T, M>;
-      void* ptr = alloc<FuncType>();
+      void* ptr = alloc<FuncType>(dataSize);
 
       if (unlikely(!ptr))
         return nullptr;
 
-      auto next = new (ptr) FuncType(std::move(command), std::forward<Args>(args)...);
+      // Command data is always packed tightly after the function object
+      auto next = new (ptr) FuncType(std::move(command));
       append(next);
 
-      return next->data();
+      // Do some cursed pointer math here so that the block can figure out
+      // where its data is stored based on its own address. This saves a
+      // decent amount of CS chunk memory compared to storing a pointer.
+      auto block = next->data();
+      block->m_dataOffset = reinterpret_cast<uintptr_t>(&m_data[m_commandOffset - dataSize])
+                          - reinterpret_cast<uintptr_t>(block);
+      block->m_structSize = sizeof(M);
+      block->m_structCount = count;
+      return block;
     }
-    
+
+    /**
+     * \brief Allocates more storage for a data block
+     *
+     * The data block \e must be owned by the last command added to
+     * the CS chunk, or this may overwrite subsequent command data.
+     * \param [in] block Data block to grow
+     * \param [in] count Number of additional structures to allocate
+     * \returns Pointer to first new structure, or \c nullptr if the chunk is full
+     */
+    void* pushData(DxvkCsDataBlock* block, uint32_t count) {
+      uint32_t dataSize = block->m_structSize * count;
+
+      if (unlikely(m_commandOffset + dataSize > DxvkCsChunkSize))
+        return nullptr;
+
+      void* ptr = &m_data[m_commandOffset];
+      m_commandOffset += dataSize;
+
+      block->m_structCount += count;
+      return ptr;
+    }
+
     /**
      * \brief Initializes chunk for recording
      * \param [in] flags Chunk flags
@@ -237,18 +318,18 @@ namespace dxvk {
     DxvkCsChunkFlags m_flags;
     
     alignas(64)
-    char m_data[MaxBlockSize];
+    char m_data[DxvkCsChunkSize];
 
     template<typename T>
-    void* alloc() {
+    void* alloc(size_t extra) {
       if (alignof(T) > alignof(DxvkCsCmd))
         m_commandOffset = dxvk::align(m_commandOffset, alignof(T));
 
-      if (unlikely(m_commandOffset + sizeof(T) > MaxBlockSize))
+      if (unlikely(m_commandOffset + sizeof(T) + extra > DxvkCsChunkSize))
         return nullptr;
 
       void* result = &m_data[m_commandOffset];
-      m_commandOffset += sizeof(T);
+      m_commandOffset += sizeof(T) + extra;
       return result;
     }
 
@@ -420,7 +501,7 @@ namespace dxvk {
    * commands on a DXVK context. 
    */
   class DxvkCsThread {
-    
+
   public:
 
     constexpr static uint64_t SynchronizeAll = ~0ull;
@@ -515,5 +596,5 @@ namespace dxvk {
     void threadFunc();
     
   };
-  
+
 }