diff --git a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
index 6d0a8fa1..a815f7e9 100644
--- a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
+++ b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
@@ -12,92 +12,62 @@ layout(binding = IO_BUFFER, std430) buffer InputBuffer {
     uvec4[] ioCount;
 };
 
-shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
+shared uint warpPrefixSum[8];//One slot per subgroup, 8 are used with 32 wide subgroups and 4 with 64 wide ones
 
 void main() {
-    /*
-    uint subgroupId = gl_LocalInvocationID.x>>5;
-    warpPrefixSum[gl_SubgroupInvocationID] = 0;
-    memoryBarrierShared();
-
-    //todo
-    //assert(gl_SubgroupSize == 32);
-    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
-
-    uint gid = gl_GlobalInvocationID.x;
-    uvec4 count = uvec4(0);
-    uint sum = 0;
-    {
-        uvec4 dat = ioCount[gid];
-        count.yzw = dat.xyz;
-        count.z += count.y;
-        count.w += count.z;
-        sum = count.w + dat.w;
-    }
-
-    barrier();
-    count += subgroupExclusiveAdd(sum);
-
-    if (gl_SubgroupInvocationID==31) {
-        warpPrefixSum[subgroupId] = count.x+sum;
-    }
-    memoryBarrierShared();
-    barrier();
-    uint val = warpPrefixSum[gl_SubgroupInvocationID];
-    barrier();
-    if (subgroupId == 0) {
-        //Use warp to do entire add in 1 reduction
-        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
-    }
-    memoryBarrierShared();
-    barrier();
-    count += warpPrefixSum[subgroupId];
-    ioCount[gid] = count;
-    */
-
-
-    #ifdef IS_INTEL
-    uint subgroupId = gl_LocalInvocationID.x>>5;
-    #else
-    uint subgroupId = gl_SubgroupID;
-    #endif
-
-    //todo
-    //assert(gl_SubgroupSize == 32);
-    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
-
-    uint gid = gl_GlobalInvocationID.x;
-    uvec4 count = uvec4(0);
-    uint sum = 0;
-    {
-        uvec4 dat = ioCount[gid];
-        count.yzw = dat.xyz;
-        count.z += count.y;
-        count.w += count.z;
-        sum = count.w + dat.w;
-    }
-    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
-
-    count += subgroupExclusiveAdd(sum);
-
-    if (gl_SubgroupInvocationID==31) {
-        warpPrefixSum[subgroupId] = count.x+sum;
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    if (subgroupId == 0) {
-        uint val = warpPrefixSum[gl_SubgroupInvocationID];
-        subgroupBarrier();
-        //Use warp to do entire add in 1 reduction
-        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    //Add the computed sum across all threads and warps
-    count += warpPrefixSum[subgroupId];
-    ioCount[gid] = count;
+    //Only the subgroup widths 32 and 64 are handled, any width other than 32 is treated as 64.
+    //All width dependent constants are derived here so both widths share a single code path
+    bool narrow = gl_SubgroupSize == 32u;
+    uint laneShift = narrow ? 5u : 6u;//log2 of the assumed subgroup width
+    uint lastLane = narrow ? 31u : 63u;//Highest gl_SubgroupInvocationID of a full subgroup
+    uint subgroupCount = narrow ? 8u : 4u;//Number of warpPrefixSum slots that get scanned
+
+    #ifdef IS_INTEL
+    //IS_INTEL builds compute the subgroup index from the local invocation index
+    uint subgroupId = gl_LocalInvocationID.x>>laneShift;
+    #else
+    uint subgroupId = gl_SubgroupID;
+    #endif
+
+    //todo
+    //assert(gl_SubgroupSize == 32 || gl_SubgroupSize == 64);
+    //assert(gl_NumSubgroups == subgroupCount);
+
+    uint gid = gl_GlobalInvocationID.x;
+    uvec4 count = uvec4(0);
+    uint sum = 0;
+    {
+        //Exclusive prefix sum over the 4 components of this invocation's element
+        uvec4 dat = ioCount[gid];
+        count.yzw = dat.xyz;
+        count.z += count.y;
+        count.w += count.z;
+        sum = count.w + dat.w;//Total of all 4 components
+    }
+    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
+
+    //Shift by the total of every lower invocation in this subgroup
+    count += subgroupExclusiveAdd(sum);
+
+    //The last invocation of each subgroup publishes that subgroup's total
+    if (gl_SubgroupInvocationID == lastLane) {
+        warpPrefixSum[subgroupId] = count.x+sum;
+    }
+
+    memoryBarrierShared();
+    barrier();
+
+    //Turn the per subgroup totals into exclusive offsets
+    if (gl_LocalInvocationID.x < subgroupCount) {
+        uint val = warpPrefixSum[gl_SubgroupInvocationID];
+        subgroupBarrier();
+        //Use warp to do entire add in 1 reduction
+        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
+    }
+
+    memoryBarrierShared();
+    barrier();
+
+    //Add the computed sum across all threads and warps
+    count += warpPrefixSum[subgroupId];
+    ioCount[gid] = count;
 }
\ No newline at end of file