Prefix sum based translucency

This commit is contained in:
mcrcortex
2025-05-22 11:45:28 +10:00
parent 5c2024bab4
commit a314c26b89
5 changed files with 180 additions and 6 deletions

View File

@@ -0,0 +1,51 @@
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
layout(local_size_x = 128) in;
#define DRAW_BUFFER_BINDING 1
#define DRAW_COUNT_BUFFER_BINDING 2
#define SECTION_METADATA_BUFFER_BINDING 3
#define INDIRECT_SECTION_LOOKUP_BINDING 4
#import <voxy:lod/gl46/bindings.glsl>
#import <voxy:lod/section.glsl>
/*
uint count;
uint instanceCount;
uint firstIndex;
int baseVertex;
uint baseInstance;
*/
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
uint[] translucentCommandData;
};
//Note: if i want reverse indexing i need to use the index buffer offset to offset
// Fills in one indexed indirect draw command and stores it at cmdBuffer[idx].
//   idx       - destination slot in cmdBuffer
//   instance  - value written to baseInstance
//   offset    - quad offset; written to baseVertex as offset*4 (via <<2)
//               (presumably 4 vertices per quad - confirm against the vertex layout)
//   quadCount - number of quads; written to count as quadCount*6
//               (presumably 6 indices per quad - confirm against the index buffer)
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
    DrawCommand draw;
    draw.baseInstance  = instance;
    draw.baseVertex    = int(offset)<<2;
    draw.firstIndex    = 0;
    draw.instanceCount = 1;
    draw.count         = quadCount * 6;
    cmdBuffer[idx] = draw;
}
// One invocation per queued translucent draw: looks the draw id back up, recomputes its
// distance bucket, claims a slot from that bucket's counter and emits the draw command.
//
// Presumably the bucket counters in translucentCommandData[0..TRANSLUCENT_WRITE_BASE] have
// been turned into start offsets (prefix sum) before this shader runs - confirm against
// the dispatch order on the host side.
//
// NOTE(review): a distance of 0 maps to bucket index TRANSLUCENT_WRITE_BASE, which is also
// the location of the first queued draw id (index 0 + TRANSLUCENT_WRITE_BASE). The shader
// that fills the buckets uses the same formula, so any change has to be made in both
// places at once - confirm whether distance 0 can occur.
void main() {
    uint entry = gl_GlobalInvocationID.x;
    if (entry < translucentDrawCount) {
        //Draw ids are stored after the bucket region
        uint drawId = translucentCommandData[entry+TRANSLUCENT_WRITE_BASE];
        SectionMeta meta = sectionData[indirectLookup[drawId]];

        //Sum of absolute per-axis offsets from the base section at this detail level,
        // scaled back up by the detail level
        uint lod = extractDetail(meta);
        uvec3 delta = abs(extractPosition(meta)-(baseSectionPos>>lod));
        uint manhattan = (delta.x+delta.y+delta.z)<<lod;

        //Larger distances land in lower bucket indices; distances are clamped to the bucket range
        uint bucket = TRANSLUCENT_WRITE_BASE-min(manhattan, TRANSLUCENT_WRITE_BASE);

        //atomicAdd returns the value before the increment, giving each draw in a bucket a unique slot
        uint cmdSlot = atomicAdd(translucentCommandData[bucket],1)+TRANSLUCENT_OFFSET;
        writeCmd(cmdSlot, drawId, extractQuadStart(meta), meta.cntA&0xFFFF);
    }
}

View File

@@ -11,10 +11,8 @@ layout(local_size_x = 128) in;
#define POSITION_SCRATCH_BINDING 6
#define POSITION_SCRATCH_ACCESS writeonly
#import <voxy:lod/quad_format.glsl>
#import <voxy:lod/gl46/bindings.glsl>
#import <voxy:lod/section.glsl>
#line 11
//https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_16bit_storage.txt
// adds support for uint8_t which can use for compact visibility buffer
@@ -35,6 +33,9 @@ layout(binding = STATISTICS_BUFFER_BINDING, std430) restrict buffer statisticsBu
uint baseInstance;
*/
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
uint[] translucentCommandData;
};
//Note: if i want reverse indexing i need to use the index buffer offset to offset
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
@@ -108,8 +109,12 @@ void main() {
//Translucency
count = meta.cntA&0xFFFF;
if (count != 0) {
uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + TRANSLUCENT_OFFSET;//FIXME: dont hardcode this offset
writeCmd(translucentCommandPtr, drawId, ptr, count);
uint tp = atomicAdd(translucentDrawCount, 1)+TRANSLUCENT_WRITE_BASE;
translucentCommandData[tp] = drawId;
uvec3 absRel = abs(relative);
uint distToCamera = (absRel.x+absRel.y+absRel.z)<<detail;
distToCamera = TRANSLUCENT_WRITE_BASE-min(distToCamera, TRANSLUCENT_WRITE_BASE);
atomicAdd(translucentCommandData[distToCamera], 1);
#ifdef HAS_STATISTICS
totalQuads += count;
#endif

View File

@@ -0,0 +1,57 @@
#version 460
#extension GL_KHR_shader_subgroup_arithmetic: require
#extension GL_KHR_shader_subgroup_basic : require
#define WORK_SIZE 256
//Does initial parallel prefix sum on batches of WORK_SIZE
layout(local_size_x=WORK_SIZE) in;
layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer {
uvec4[] ioCount;
};
shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
// Workgroup-local exclusive prefix sum over ioCount, performed in place.
//
// Each invocation owns one uvec4 (four consecutive counters). On exit every component
// holds the sum of all components that precede it within this workgroup's range.
// The workgroup total is not written anywhere by this shader.
//
// Requirements / assumptions:
//  - gl_NumSubgroups <= gl_SubgroupSize, so a single subgroup can scan the per-subgroup
//    totals, and gl_NumSubgroups <= 32 (size of warpPrefixSum). With WORK_SIZE 256 this
//    holds for subgroup sizes of 16 and above.
//  - assumes gl_SubgroupInvocationID order within consecutive gl_SubgroupIDs follows
//    gl_LocalInvocationID.x order; otherwise the result is not in buffer order
//    - TODO confirm / assert on the target drivers.
//  - ioCount[gid] is accessed without a bounds check; presumably the buffer is padded to a
//    multiple of WORK_SIZE uvec4s - verify against the dispatch code.
void main() {
    uint gid = gl_GlobalInvocationID.x;

    //Exclusive scan of the 4 local components, plus their total
    uvec4 dat = ioCount[gid];
    uvec4 count = uvec4(0u, dat.x, dat.x + dat.y, dat.x + dat.y + dat.z);
    uint sum = count.w + dat.w;

    //Offset by the total of all lower lanes in this subgroup
    count += subgroupExclusiveAdd(sum);

    //Publish this subgroup's total. Using subgroupAdd + subgroupElect instead of a
    // hard-coded "lane 31" check keeps this valid for subgroup sizes other than 32
    // (the old check made 2 lanes race on the same slot with 64 wide subgroups)
    uint subgroupTotal = subgroupAdd(sum);
    if (subgroupElect()) {
        warpPrefixSum[gl_SubgroupID] = subgroupTotal;
    }
    barrier();

    //First subgroup turns the per-subgroup totals into exclusive offsets in 1 reduction.
    // Lanes without a matching subgroup contribute 0 and never touch shared memory, so no
    // zero initialisation of warpPrefixSum is needed and no lane indexes past the array
    if (gl_SubgroupID == 0u) {
        uint lane = gl_SubgroupInvocationID;
        bool hasSubgroup = lane < gl_NumSubgroups;
        uint val = hasSubgroup ? warpPrefixSum[lane] : 0u;
        uint scanned = subgroupExclusiveAdd(val);
        if (hasSubgroup) {
            warpPrefixSum[lane] = scanned;
        }
    }
    barrier();

    //Add the computed sum across all threads and warps
    count += warpPrefixSum[gl_SubgroupID];
    ioCount[gid] = count;
}