diff --git a/src/d3d11/d3d11_context.cpp b/src/d3d11/d3d11_context.cpp index 9ba60418d..cad645607 100644 --- a/src/d3d11/d3d11_context.cpp +++ b/src/d3d11/d3d11_context.cpp @@ -5507,19 +5507,25 @@ namespace dxvk { if (Length <= MaxDirectUpdateSize && !((Offset | Length) & 0x3)) { // The backend has special code paths for small buffer updates, // however both offset and size must be aligned to four bytes. - std::array<char, MaxDirectUpdateSize> data; - std::memcpy(data.data(), pSrcData, Length); + // Write the data directly to the CS chunk. + uint32_t dwordCount = Length / sizeof(uint32_t); - EmitCs([ - cBufferData = data, + EmitCsCmd<uint32_t>(D3D11CmdType::None, dwordCount, [ cBufferSlice = std::move(bufferSlice) - ] (DxvkContext* ctx) { + ] (DxvkContext* ctx, const uint32_t* data, size_t) { ctx->updateBuffer( cBufferSlice.buffer(), cBufferSlice.offset(), - cBufferSlice.length(), - cBufferData.data()); + cBufferSlice.length(), data); }); + + // Compiler should be able to vectorize here, but GCC only does + // if we cast the destination pointer to the correct type first + auto src = reinterpret_cast<const uint32_t*>(pSrcData); + auto dst = reinterpret_cast<uint32_t*>(m_csData->first()); + + for (uint32_t i = 0; i < dwordCount; i++) + new (dst + i) uint32_t(src[i]); } else { // Write directly to a staging buffer and dispatch a copy DxvkBufferSlice stagingSlice = AllocStagingBuffer(Length);