Prefix sum based translucency

2025-05-22 11:45:28 +10:00
parent 5c2024bab4
commit a314c26b89
5 changed files with 180 additions and 6 deletions
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/buildtranslucents.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/buildtranslucents.comp
@@ -0,0 +1,51 @@
+#version 450
+#extension GL_ARB_gpu_shader_int64 : enable
+
+layout(local_size_x = 128) in;
+
+#define DRAW_BUFFER_BINDING 1
+#define DRAW_COUNT_BUFFER_BINDING 2
+#define SECTION_METADATA_BUFFER_BINDING 3
+#define INDIRECT_SECTION_LOOKUP_BINDING 4
+
+#import <voxy:lod/gl46/bindings.glsl>
+#import <voxy:lod/section.glsl>
+
+/*
+    uint  count;
+    uint  instanceCount;
+    uint  firstIndex;
+    int  baseVertex;
+    uint  baseInstance;
+    */
+
+layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount { // uint buffer used two ways by main(): draw ids are read from [TRANSLUCENT_WRITE_BASE + i], and a per-distance slot at [dist] is atomically incremented
+    uint[] translucentCommandData;
+};
+
+//Fills one DrawCommand and stores it at cmdBuffer[idx]. idx: destination slot, instance: value stored as baseInstance, offset: quad start (scaled by 4 for baseVertex), quadCount: number of quads to draw. Note: if i want reverse indexing i need to use the index buffer offset to offset
+void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
+    DrawCommand cmd;
+    cmd.count = quadCount * 6; // six per quad; presumably indices for two triangles - confirm against the index buffer
+    cmd.instanceCount = 1;
+    cmd.firstIndex = 0;
+    cmd.baseVertex = int(offset)<<2; // offset * 4; presumably four vertices per quad - confirm
+    cmd.baseInstance = instance; // main() passes the draw id here
+    cmdBuffer[idx] = cmd;
+}
+
+void main() { // One invocation per recorded translucent draw: reads its draw id, recomputes its distance slot, claims an output index from that slot and writes the draw command
+    if (gl_GlobalInvocationID.x >= translucentDrawCount) { // invocations past the recorded draw count have nothing to do
+        return;
+    }
+    uint drawId = translucentCommandData[gl_GlobalInvocationID.x+TRANSLUCENT_WRITE_BASE]; // draw ids are read from the region starting at TRANSLUCENT_WRITE_BASE
+    SectionMeta meta = sectionData[indirectLookup[drawId]];
+    uint detail = extractDetail(meta);
+
+    uvec3 rel = abs(extractPosition(meta)-(baseSectionPos>>detail)); // per-axis absolute offset from baseSectionPos, taken at this section's detail level
+    uint dist = (rel.x+rel.y+rel.z)<<detail; // sum of the three axis offsets, shifted back up by detail
+    dist = TRANSLUCENT_WRITE_BASE-min(dist, TRANSLUCENT_WRITE_BASE); // clamp and invert: larger distances give smaller slot indices, result is in [0, TRANSLUCENT_WRITE_BASE]. NOTE(review): a distance of 0 gives slot TRANSLUCENT_WRITE_BASE, the same index the first draw id is read from above - confirm this overlap is intended
+
+    uint drawPtr = atomicAdd(translucentCommandData[dist],1)+TRANSLUCENT_OFFSET; // the slot's previous value becomes this draw's position; presumably the slots hold prefix-summed start offsets at this point - confirm against the dispatch order
+    writeCmd(drawPtr, drawId, extractQuadStart(meta), meta.cntA&0xFFFF); // low 16 bits of cntA are passed as the quad count
+}
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/cmdgen.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/cmdgen.comp
@@ -11,10 +11,8 @@ layout(local_size_x = 128) in;
 #define POSITION_SCRATCH_BINDING 6
 #define POSITION_SCRATCH_ACCESS writeonly

-#import <voxy:lod/quad_format.glsl>
 #import <voxy:lod/gl46/bindings.glsl>
 #import <voxy:lod/section.glsl>
-#line 11

 //https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_16bit_storage.txt
 // adds support for uint8_t which can use for compact visibility buffer
@@ -35,6 +33,9 @@ layout(binding = STATISTICS_BUFFER_BINDING, std430) restrict buffer statisticsBu
    uint  baseInstance;
    */

+layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount { // uint buffer used two ways by the translucency path in main(): draw ids are stored at [TRANSLUCENT_WRITE_BASE + n], and a per-distance counter at [distToCamera] is atomically incremented
+    uint[] translucentCommandData;
+};

 //Note: if i want reverse indexing i need to use the index buffer offset to offset
 void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
@@ -108,8 +109,12 @@ void main() {
        //Translucency
        count = meta.cntA&0xFFFF;
        if (count != 0) {
-            uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + TRANSLUCENT_OFFSET;//FIXME: dont hardcode this offset
-            writeCmd(translucentCommandPtr, drawId, ptr, count);
+            uint tp = atomicAdd(translucentDrawCount, 1)+TRANSLUCENT_WRITE_BASE;
+            translucentCommandData[tp] = drawId;
+            uvec3 absRel = abs(relative);
+            uint distToCamera = (absRel.x+absRel.y+absRel.z)<<detail;
+            distToCamera = TRANSLUCENT_WRITE_BASE-min(distToCamera, TRANSLUCENT_WRITE_BASE);
+            atomicAdd(translucentCommandData[distToCamera], 1);
            #ifdef HAS_STATISTICS
            totalQuads += count;
            #endif
--- a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
+++ b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
@@ -0,0 +1,57 @@
+#version 460
+
+#extension GL_KHR_shader_subgroup_arithmetic: require
+#extension GL_KHR_shader_subgroup_basic : require
+
+#define WORK_SIZE 256
+
+//Does initial parallel prefix sum on batches of WORK_SIZE
+layout(local_size_x=WORK_SIZE) in;
+
+layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer { // each invocation of main() reads one uvec4 (four counts) and overwrites it in place with the scanned values
+    uvec4[] ioCount;
+};
+
+shared uint warpPrefixSum[32];//Per-subgroup totals for the cross-subgroup pass in main(). Warps are 32, tricks require full warp
+
+void main() { // Exclusive prefix sum, within one workgroup, over the uint components of ioCount; results are written back in place
+    warpPrefixSum[gl_SubgroupInvocationID] = 0; // NOTE(review): indexes the 32-entry array by lane id, so this relies on gl_SubgroupSize <= 32
+    barrier();
+
+    //todo
+    //assert(gl_SubgroupSize == 32);
+    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
+
+    uint gid = gl_GlobalInvocationID.x;
+    uvec4 count = uvec4(0);
+    uint sum = 0;
+    { // exclusive scan of this invocation's own four values; sum ends up as their total
+        uvec4 dat = ioCount[gid];
+        count.yzw = dat.xyz;
+        count.z += count.y;
+        count.w += count.z;
+        sum = count.w + dat.w;
+    }
+
+    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
+
+    count += subgroupExclusiveAdd(sum); // adds the combined total of all lower lanes in this subgroup to every component
+
+    if ((gl_LocalInvocationID.x&31u)==31) { // every 32nd invocation; assumed to be the last lane of its subgroup (see the todo asserts above)
+        warpPrefixSum[gl_SubgroupID] = count.x+sum; // count.x is this lane's exclusive prefix, so adding sum gives the subgroup total
+    }
+
+    barrier();
+
+    if (gl_SubgroupID == 0) { // the first subgroup scans the per-subgroup totals
+        uint val = warpPrefixSum[gl_SubgroupInvocationID];
+        subgroupBarrier();
+        //Use warp to do entire add in 1 reduction
+        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
+    }
+
+    barrier();
+    //Add the computed sum across all threads and warps
+    count += warpPrefixSum[gl_SubgroupID]; // offset by the total of all earlier subgroups in this workgroup
+    ioCount[gid] = count;
+}