Prefix sum based translucency
This commit is contained in:
@@ -0,0 +1,51 @@
|
||||
#version 450
|
||||
#extension GL_ARB_gpu_shader_int64 : enable
|
||||
|
||||
layout(local_size_x = 128) in;
|
||||
|
||||
#define DRAW_BUFFER_BINDING 1
|
||||
#define DRAW_COUNT_BUFFER_BINDING 2
|
||||
#define SECTION_METADATA_BUFFER_BINDING 3
|
||||
#define INDIRECT_SECTION_LOOKUP_BINDING 4
|
||||
|
||||
#import <voxy:lod/gl46/bindings.glsl>
|
||||
#import <voxy:lod/section.glsl>
|
||||
|
||||
/*
|
||||
uint count;
|
||||
uint instanceCount;
|
||||
uint firstIndex;
|
||||
int baseVertex;
|
||||
uint baseInstance;
|
||||
*/
|
||||
|
||||
// Scratch buffer shared by the translucency passes.
// In this shader it is used in two ways:
//  - entries starting at TRANSLUCENT_WRITE_BASE are read as draw ids, one per translucent draw
//  - entries indexed by the computed distance value are bumped with atomicAdd to hand out command slots
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
    uint translucentCommandData[];
};
|
||||
|
||||
//Note: if i want reverse indexing i need to use the index buffer offset to offset
|
||||
// Builds one indexed indirect draw command and stores it in cmdBuffer[idx].
//  idx       - slot in cmdBuffer that receives the command
//  instance  - value stored as the command's baseInstance
//  offset    - quad offset; shifted left by 2 (x4) to form baseVertex
//  quadCount - number of quads to draw; 6 indices are issued per quad
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
    DrawCommand command;
    command.baseInstance = instance;
    command.baseVertex = int(offset)<<2;
    command.firstIndex = 0;
    command.instanceCount = 1;
    command.count = quadCount * 6;
    cmdBuffer[idx] = command;
}
|
||||
|
||||
// Scatter pass: one invocation per recorded translucent draw.
// Looks up the section for the draw, derives its distance index, takes the next
// command slot for that index with atomicAdd and writes the draw command there.
void main() {
    uint invocation = gl_GlobalInvocationID.x;
    //Invocations past the recorded translucent draw count have nothing to do
    if (invocation >= translucentDrawCount) {
        return;
    }

    //Draw ids are stored starting at TRANSLUCENT_WRITE_BASE
    uint drawId = translucentCommandData[invocation+TRANSLUCENT_WRITE_BASE];
    SectionMeta meta = sectionData[indirectLookup[drawId]];
    uint detail = extractDetail(meta);

    //Sum of per-axis absolute offsets from the base section (at this detail level), rescaled by the detail shift
    uvec3 delta = abs(extractPosition(meta)-(baseSectionPos>>detail));
    uint bucket = (delta.x+delta.y+delta.z)<<detail;
    //Clamp and flip, so that larger distances map to smaller indices
    //NOTE(review): a distance of 0 yields index TRANSLUCENT_WRITE_BASE, which is also the index the
    // first draw id is read from above - confirm this overlap is intended
    bucket = TRANSLUCENT_WRITE_BASE-min(bucket, TRANSLUCENT_WRITE_BASE);

    //atomicAdd returns the value before the increment; presumably the entries were prefix summed
    // beforehand so this yields a unique slot per draw - TODO confirm against the dispatch order
    uint commandSlot = atomicAdd(translucentCommandData[bucket],1)+TRANSLUCENT_OFFSET;
    //Low 16 bits of cntA are used as the quad count
    writeCmd(commandSlot, drawId, extractQuadStart(meta), meta.cntA&0xFFFF);
}
|
||||
@@ -11,10 +11,8 @@ layout(local_size_x = 128) in;
|
||||
#define POSITION_SCRATCH_BINDING 6
|
||||
#define POSITION_SCRATCH_ACCESS writeonly
|
||||
|
||||
#import <voxy:lod/quad_format.glsl>
|
||||
#import <voxy:lod/gl46/bindings.glsl>
|
||||
#import <voxy:lod/section.glsl>
|
||||
#line 11
|
||||
|
||||
//https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_16bit_storage.txt
|
||||
// adds support for uint8_t which can use for compact visibility buffer
|
||||
@@ -35,6 +33,9 @@ layout(binding = STATISTICS_BUFFER_BINDING, std430) restrict buffer statisticsBu
|
||||
uint baseInstance;
|
||||
*/
|
||||
|
||||
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
|
||||
uint[] translucentCommandData;
|
||||
};
|
||||
|
||||
//Note: if i want reverse indexing i need to use the index buffer offset to offset
|
||||
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
|
||||
@@ -108,8 +109,12 @@ void main() {
|
||||
//Translucency
|
||||
count = meta.cntA&0xFFFF;
|
||||
if (count != 0) {
|
||||
uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + TRANSLUCENT_OFFSET;//FIXME: dont hardcode this offset
|
||||
writeCmd(translucentCommandPtr, drawId, ptr, count);
|
||||
uint tp = atomicAdd(translucentDrawCount, 1)+TRANSLUCENT_WRITE_BASE;
|
||||
translucentCommandData[tp] = drawId;
|
||||
uvec3 absRel = abs(relative);
|
||||
uint distToCamera = (absRel.x+absRel.y+absRel.z)<<detail;
|
||||
distToCamera = TRANSLUCENT_WRITE_BASE-min(distToCamera, TRANSLUCENT_WRITE_BASE);
|
||||
atomicAdd(translucentCommandData[distToCamera], 1);
|
||||
#ifdef HAS_STATISTICS
|
||||
totalQuads += count;
|
||||
#endif
|
||||
|
||||
@@ -0,0 +1,57 @@
|
||||
#version 460
|
||||
|
||||
#extension GL_KHR_shader_subgroup_arithmetic: require
|
||||
#extension GL_KHR_shader_subgroup_basic : require
|
||||
|
||||
#define WORK_SIZE 256
|
||||
|
||||
//Does initial parallel prefix sum on batches of WORK_SIZE
|
||||
layout(local_size_x=WORK_SIZE) in;
|
||||
|
||||
// Values to scan, packed four uints per element.
// main() reads one uvec4 per invocation and writes the scanned result back to the same element.
layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer {
    uvec4 ioCount[];
};
|
||||
|
||||
shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
|
||||
|
||||
// Exclusive prefix sum over the uints covered by one workgroup (WORK_SIZE uvec4 elements of ioCount).
// Each invocation scans its own uvec4, the subgroup scans the per-invocation totals, and
// subgroup 0 then scans the per-subgroup totals held in warpPrefixSum.
// The result is written back in place; sums do not carry across workgroups.
//
// Requirements that still hold after this change:
//  - gl_NumSubgroups must not exceed gl_SubgroupSize or warpPrefixSum.length(), since a single
//    subgroup scans all per-subgroup totals in one step
//  - assumes subgroup k covers consecutive local invocations in order - TODO confirm on target drivers
void main() {
    uint slotCount = uint(warpPrefixSum.length());

    //Zero every per-subgroup slot exactly once, independent of the subgroup width
    // (indexing by gl_SubgroupInvocationID would run past the array on subgroups wider than the array)
    if (gl_LocalInvocationID.x < slotCount) {
        warpPrefixSum[gl_LocalInvocationID.x] = 0;
    }
    barrier();

    //todo
    //assert(gl_NumSubgroups <= gl_SubgroupSize);
    //assert(gl_NumSubgroups <= warpPrefixSum.length());

    uint gid = gl_GlobalInvocationID.x;
    uvec4 count = uvec4(0);
    uint sum = 0;
    {
        //Exclusive scan of the 4 components: (0, x, x+y, x+y+z), sum holds x+y+z+w
        uvec4 dat = ioCount[gid];
        count.yzw = dat.xyz;
        count.z += count.y;
        count.w += count.z;
        sum = count.w + dat.w;
    }

    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer

    //Offset by the totals of all lower lanes in this subgroup
    count += subgroupExclusiveAdd(sum);

    //The highest lane holds the subgroup total (its exclusive prefix plus its own sum).
    // Select it by subgroup lane id, which is the ordering subgroupExclusiveAdd uses,
    // rather than assuming a width of 32 from the local invocation id
    if (gl_SubgroupInvocationID == gl_SubgroupSize - 1u) {
        warpPrefixSum[gl_SubgroupID] = count.x+sum;
    }

    barrier();

    if (gl_SubgroupID == 0) {
        //Lanes beyond the array contribute 0 and must not touch shared memory
        bool hasSlot = gl_SubgroupInvocationID < slotCount;
        uint val = 0;
        if (hasSlot) {
            val = warpPrefixSum[gl_SubgroupInvocationID];
        }
        subgroupBarrier();
        //Use warp to do entire add in 1 reduction
        uint scanned = subgroupExclusiveAdd(val);
        if (hasSlot) {
            warpPrefixSum[gl_SubgroupInvocationID] = scanned;
        }
    }

    barrier();
    //Add the computed sum across all threads and warps
    count += warpPrefixSum[gl_SubgroupID];
    ioCount[gid] = count;
}
|
||||
Reference in New Issue
Block a user