diff --git a/src/GPU3D_Compute.cpp b/src/GPU3D_Compute.cpp
index 346a6a53..1c50483c 100644
--- a/src/GPU3D_Compute.cpp
+++ b/src/GPU3D_Compute.cpp
@@ -19,6 +19,9 @@
 #include "GPU3D_Compute.h"
 
 #include <assert.h>
+#include <algorithm>
+
+#include "Utils.h"
 
 #include "OpenGLSupport.h"
 
@@ -50,6 +53,14 @@ bool ComputeRenderer::CompileShader(GLuint& shader, const std::string& source, c
     shaderSource += std::to_string(ScreenHeight);
     shaderSource += "\n#define MaxWorkTiles ";
     shaderSource += std::to_string(MaxWorkTiles);
+    shaderSource += "\n#define TileSize ";
+    shaderSource += std::to_string(TileSize);
+    shaderSource += "\nconst int CoarseTileCountY = ";
+    shaderSource += std::to_string(CoarseTileCountY) + ";";
+    shaderSource += "\n#define CoarseTileArea ";
+    shaderSource += std::to_string(CoarseTileArea);
+    shaderSource += "\n#define ClearCoarseBinMaskLocalSize ";
+    shaderSource += std::to_string(ClearCoarseBinMaskLocalSize);
 
     shaderSource += ComputeRendererShaders::Common;
     shaderSource += source;
@@ -297,6 +308,8 @@ void ComputeRenderer::Reset(GPU& gpu)
 
 void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinates)
 {
+    u8 TileScale;
+
     CurGLCompositor.SetScaleFactor(scale);
 
     if (ScaleFactor != -1)
@@ -310,6 +323,22 @@ void ComputeRenderer::SetRenderSettings(int scale, bool highResolutionCoordinate
     ScreenWidth = 256 * ScaleFactor;
     ScreenHeight = 192 * ScaleFactor;
 
+    //Starting at 4.5x we want to double TileSize every time scale doubles
+    TileScale = 2 * ScaleFactor / 9;
+    TileScale = GetMSBit(TileScale);
+    TileScale <<= 1;
+    TileScale += TileScale == 0;
+
+    std::printf("Scale: %d\n", ScaleFactor);
+    std::printf("TileScale: %d\n", TileScale);
+
+    TileSize = std::min(8 * TileScale, 32);
+    CoarseTileCountY = TileSize < 32 ? 4 : 6;
+    ClearCoarseBinMaskLocalSize = TileSize < 32 ? 64 : 48;
+    CoarseTileArea = CoarseTileCountX * CoarseTileCountY;
+    CoarseTileW = CoarseTileCountX * TileSize;
+    CoarseTileH = CoarseTileCountY * TileSize;
+
     TilesPerLine = ScreenWidth/TileSize;
     TileLines = ScreenHeight/TileSize;
 
@@ -918,7 +947,7 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
     glBindBufferBase(GL_UNIFORM_BUFFER, 0, MetaUniformMemory);
 
     glUseProgram(ShaderClearCoarseBinMask);
-    glDispatchCompute(TilesPerLine*TileLines/32, 1, 1);
+    glDispatchCompute(TilesPerLine*TileLines/ClearCoarseBinMaskLocalSize, 1, 1);
 
     bool wbuffer = false;
     if (numYSpans > 0)
@@ -932,23 +961,23 @@ void ComputeRenderer::RenderFrame(GPU& gpu)
         glBindImageTexture(0, YSpanIndicesTexture, 0, GL_FALSE, 0, GL_READ_ONLY, GL_RGBA16UI);
         glUseProgram(ShaderInterpXSpans[wbuffer]);
         glDispatchCompute((numSetupIndices + 31) / 32, 1, 1);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // bin polygons
         glUseProgram(ShaderBinCombined);
         glDispatchCompute(((gpu.GPU3D.RenderNumPolygons + 31) / 32), ScreenWidth/CoarseTileW, ScreenHeight/CoarseTileH);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // calculate list offsets
         glUseProgram(ShaderCalculateWorkListOffset);
         glDispatchCompute((numVariants + 31) / 32, 1, 1);
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         // sort shader work
         glUseProgram(ShaderSortWork);
         glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, BinResultMemory);
         glDispatchComputeIndirect(offsetof(BinResultHeader, SortWorkWorkCount));
-        glMemoryBarrier(GL_SHADER_STORAGE_BUFFER);
+        glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         glActiveTexture(GL_TEXTURE0);
 
diff --git a/src/GPU3D_Compute.h b/src/GPU3D_Compute.h
index 751737b7..6b96a33f 100644
--- a/src/GPU3D_Compute.h
+++ b/src/GPU3D_Compute.h
@@ -163,11 +163,13 @@ private:
         float TextureLayer;
     };
 
-    static constexpr int TileSize = 8;
+    int TileSize;
     static constexpr int CoarseTileCountX = 8;
-    static constexpr int CoarseTileCountY = 4;
-    static constexpr int CoarseTileW = CoarseTileCountX * TileSize;
-    static constexpr int CoarseTileH = CoarseTileCountY * TileSize;
+    int CoarseTileCountY;
+    int CoarseTileArea;
+    int CoarseTileW;
+    int CoarseTileH;
+    int ClearCoarseBinMaskLocalSize;
 
     static constexpr int BinStride = 2048/32;
     static constexpr int CoarseBinStride = BinStride/32;
diff --git a/src/GPU3D_Compute_shaders.h b/src/GPU3D_Compute_shaders.h
index 26fb7bde..b1086ba0 100644
--- a/src/GPU3D_Compute_shaders.h
+++ b/src/GPU3D_Compute_shaders.h
@@ -339,9 +339,7 @@ const uint ResultAttrStart = ResultDepthStart+ScreenWidth*ScreenHeight*2;
 
 const char* Common = R"(
 
-#define TileSize 8
 const int CoarseTileCountX = 8;
-const int CoarseTileCountY = 4;
 const int CoarseTileW = (CoarseTileCountX * TileSize);
 const int CoarseTileH = (CoarseTileCountY * TileSize);
 
@@ -848,7 +846,7 @@ void main()
 const std::string ClearCoarseBinMask = 
     BinningBuffer +
     R"(
-layout (local_size_x = 32) in;
+layout (local_size_x = ClearCoarseBinMaskLocalSize) in;
 
 void main()
 {
@@ -864,7 +862,7 @@ const std::string BinCombined = 
     XSpanSetupBuffer +
     WorkDescBuffer +
     R"(
-layout (local_size_x = 32) in;
+layout (local_size_x = CoarseTileArea) in;
 
 bool BinPolygon(Polygon polygon, ivec2 topLeft, ivec2 botRight)
 {
diff --git a/src/Utils.h b/src/Utils.h
index eae9193a..395aa14b 100644
--- a/src/Utils.h
+++ b/src/Utils.h
@@ -38,6 +38,20 @@ std::pair<std::unique_ptr<u8[]>, u32> PadToPowerOf2(const u8* data, u32 len) noe
 
 std::unique_ptr<u8[]> CopyToUnique(const u8* data, u32 len) noexcept;
 
+template <typename T>
+T GetMSBit(T val)
+{
+    val |= (val >> 1);
+    val |= (val >> 2);
+    val |= (val >> 4);
+
+    if constexpr(sizeof(val) > 1) val |= (val >> 8);
+    if constexpr(sizeof(val) > 2) val |= (val >> 16);
+    if constexpr(sizeof(val) > 4) val |= (val >> 32);
+
+    return val - (val >> 1);
+}
+
 }
 
 #endif // MELONDS_UTILS_H