Prefix sum based translucency

This commit is contained in:
mcrcortex
2025-05-22 11:45:28 +10:00
parent 5c2024bab4
commit a314c26b89
5 changed files with 180 additions and 6 deletions

View File

@@ -47,6 +47,11 @@ public class GlBuffer extends TrackedObject {
return this; return this;
} }
/**
 * Zero-fills a sub-range of this buffer on the GPU.
 *
 * @param offset start of the range, passed straight to GL as the byte offset into the buffer
 * @param size   length of the range, passed straight to GL as the byte count
 * @return this buffer, so calls can be chained
 */
public GlBuffer zeroRange(long offset, long size) {
    // A NULL (0) data pointer tells GL to fill the range with zeros
    final long nullData = 0;
    nglClearNamedBufferSubData(this.id, GL_R8UI, offset, size, GL_RED_INTEGER, GL_UNSIGNED_BYTE, nullData);
    return this;
}
public GlBuffer fill(int data) { public GlBuffer fill(int data) {
//Clear unpack values //Clear unpack values
//Fixed in mesa commit a5c3c452 //Fixed in mesa commit a5c3c452

View File

@@ -32,12 +32,13 @@ import static org.lwjgl.opengl.GL33.glBindSampler;
import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER; import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER;
import static org.lwjgl.opengl.GL43.*; import static org.lwjgl.opengl.GL43.*;
import static org.lwjgl.opengl.GL45.glBindTextureUnit; import static org.lwjgl.opengl.GL45.glBindTextureUnit;
import static org.lwjgl.opengl.GL45.glClearNamedBufferData;
//Uses MDIC to render the sections //Uses MDIC to render the sections
public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, BasicSectionGeometryData> { public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, BasicSectionGeometryData> {
private static final int TRANSLUCENT_OFFSET = 400_000;//in draw calls private static final int TRANSLUCENT_OFFSET = 400_000;//in draw calls
private static final int TEMPORAL_OFFSET = 500_000;//in draw calls private static final int TEMPORAL_OFFSET = 500_000;//in draw calls
private static final int STATISTICS_BUFFER_BINDING = 7; private static final int STATISTICS_BUFFER_BINDING = 8;
private final Shader terrainShader = Shader.make() private final Shader terrainShader = Shader.make()
.defineIf("DEBUG_RENDER", false) .defineIf("DEBUG_RENDER", false)
.add(ShaderType.VERTEX, "voxy:lod/gl46/quads2.vert") .add(ShaderType.VERTEX, "voxy:lod/gl46/quads2.vert")
@@ -45,9 +46,11 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
.compile(); .compile();
private final Shader commandGenShader = Shader.make() private final Shader commandGenShader = Shader.make()
.define("TRANSLUCENT_OFFSET", TRANSLUCENT_OFFSET) .define("TRANSLUCENT_WRITE_BASE", 1024)
.define("TEMPORAL_OFFSET", TEMPORAL_OFFSET) .define("TEMPORAL_OFFSET", TEMPORAL_OFFSET)
.define("TRANSLUCENT_DISTANCE_BUFFER_BINDING", 7)
.defineIf("HAS_STATISTICS", RenderStatistics.enabled) .defineIf("HAS_STATISTICS", RenderStatistics.enabled)
.defineIf("STATISTICS_BUFFER_BINDING", RenderStatistics.enabled, STATISTICS_BUFFER_BINDING) .defineIf("STATISTICS_BUFFER_BINDING", RenderStatistics.enabled, STATISTICS_BUFFER_BINDING)
@@ -63,9 +66,23 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
.add(ShaderType.FRAGMENT, "voxy:lod/gl46/cull/raster.frag") .add(ShaderType.FRAGMENT, "voxy:lod/gl46/cull/raster.frag")
.compile(); .compile();
//Compute shader used for the prefix sum step of translucency sorting.
// IO_BUFFER is defined as 0; render code binds distanceCountBuffer to SSBO binding 0 before dispatching it
private final Shader prefixSumShader = Shader.make()
.add(ShaderType.COMPUTE, "voxy:util/prefixsum/inital3.comp")
.define("IO_BUFFER", 0)
.compile();
//Compute shader dispatched after the prefix sum to emit the translucent draw commands
private final Shader translucentGenShader = Shader.make()
.add(ShaderType.COMPUTE, "voxy:lod/gl46/buildtranslucents.comp")
.define("TRANSLUCENT_WRITE_BASE", 1024)//The size of the prefix sum array; commandGenShader is given the same value
.define("TRANSLUCENT_DISTANCE_BUFFER_BINDING", 5)//distanceCountBuffer is bound to SSBO binding 5 for this shader
.define("TRANSLUCENT_OFFSET", TRANSLUCENT_OFFSET)//Offset (in draw calls) added to every translucent command index
.compile();
private final GlBuffer uniform = new GlBuffer(1024).zero(); private final GlBuffer uniform = new GlBuffer(1024).zero();
//TODO: needs to be in the viewport, since it contains the compute indirect call/values //TODO: needs to be in the viewport, since it contains the compute indirect call/values
//Scratch buffer for translucency sorting: 1024 4-byte counters (the range zeroed each frame with zeroRange(0, 1024*4))
// followed by space for 100_000 4-byte entries. NOTE(review): assumes the GlBuffer size is in bytes - confirm
private final GlBuffer distanceCountBuffer = new GlBuffer(1024*4+100_000*4).zero();
private final GlBuffer drawCountCallBuffer = new GlBuffer(1024).zero(); private final GlBuffer drawCountCallBuffer = new GlBuffer(1024).zero();
private final GlBuffer drawCallBuffer = new GlBuffer(5*4*(400_000+100_000+100_000)).zero();//400k draw calls private final GlBuffer drawCallBuffer = new GlBuffer(5*4*(400_000+100_000+100_000)).zero();//400k draw calls
private final GlBuffer positionScratchBuffer = new GlBuffer(8*400000).zero();//400k positions private final GlBuffer positionScratchBuffer = new GlBuffer(8*400000).zero();//400k positions
@@ -205,6 +222,7 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
{//Generate the commands {//Generate the commands
this.distanceCountBuffer.zeroRange(0, 1024*4);
this.commandGenShader.bind(); this.commandGenShader.bind();
glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id); glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this.drawCallBuffer.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this.drawCallBuffer.id);
@@ -213,6 +231,7 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, viewport.visibilityBuffer.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, viewport.visibilityBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, viewport.indirectLookupBuffer.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, viewport.indirectLookupBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.positionScratchBuffer.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.positionScratchBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 7, this.distanceCountBuffer.id);
if (RenderStatistics.enabled) { if (RenderStatistics.enabled) {
this.statisticsBuffer.zero(); this.statisticsBuffer.zero();
@@ -238,6 +257,40 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
} }
} }
{//Do translucency sorting
//Pass 1: run the prefix sum shader over distanceCountBuffer, bound at SSBO binding 0
// (prefixSumShader is compiled with IO_BUFFER defined as 0)
this.prefixSumShader.bind();
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 0, this.distanceCountBuffer.id);
//NOTE(review): the command generation pass writes this buffer through SSBO binding 7, so a storage barrier
// is required somewhere between that dispatch and this one; whether an earlier barrier already covers it is
// not visible from this block
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);//Am unsure if is needed
//A single work group is dispatched
glDispatchCompute(1,1,1);
//Make the prefix sum results visible to the next compute pass
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
//glFinish();
//Disabled debug code: downloads the first 1024 ints and prints every adjacent pair that is not non-decreasing
/*
DownloadStream.INSTANCE.download(this.distanceCountBuffer, 0, 1024*4, (ptr,size)->{
int[] a = new int[1024];
for (int i = 0; i < 1024; i++) {
a[i] = MemoryUtil.memGetInt(ptr+4*i);
}
for (int i = 0; i < 1023; i++){
if (a[i+1]<a[i]) {
System.out.println(a[i]+","+a[i+1]);
}
}
});
*/
//Pass 2: build the translucent draw commands. The same distanceCountBuffer is now bound at binding 5,
// matching the TRANSLUCENT_DISTANCE_BUFFER_BINDING define given to translucentGenShader
this.translucentGenShader.bind();
glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this.drawCallBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.drawCountCallBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, this.geometryManager.getMetadataBuffer().id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, viewport.indirectLookupBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, this.distanceCountBuffer.id);
//The work group counts are read from offset 0 of drawCountCallBuffer instead of being supplied by the CPU
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, this.drawCountCallBuffer.id);//This isn't great but it's a nice trick to bound it, even if it's inefficient ;-;
glDispatchComputeIndirect(0);
//Covers both indirect command reads and storage buffer reads of what this pass wrote
// (presumably for the draw calls issued later - confirm against the render path)
glMemoryBarrier(GL_COMMAND_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
}
} }
@Override @Override
@@ -261,10 +314,13 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
@Override @Override
public void free() { public void free() {
this.uniform.free(); this.uniform.free();
this.distanceCountBuffer.free();
this.terrainShader.free(); this.terrainShader.free();
this.commandGenShader.free(); this.commandGenShader.free();
this.cullShader.free(); this.cullShader.free();
this.prepShader.free(); this.prepShader.free();
this.translucentGenShader.free();
this.prefixSumShader.free();
this.drawCallBuffer.free(); this.drawCallBuffer.free();
this.drawCountCallBuffer.free(); this.drawCountCallBuffer.free();
this.positionScratchBuffer.free(); this.positionScratchBuffer.free();

View File

@@ -0,0 +1,51 @@
#version 450
#extension GL_ARB_gpu_shader_int64 : enable
layout(local_size_x = 128) in;
#define DRAW_BUFFER_BINDING 1
#define DRAW_COUNT_BUFFER_BINDING 2
#define SECTION_METADATA_BUFFER_BINDING 3
#define INDIRECT_SECTION_LOOKUP_BINDING 4
#import <voxy:lod/gl46/bindings.glsl>
#import <voxy:lod/section.glsl>
/*
uint count;
uint instanceCount;
uint firstIndex;
int baseVertex;
uint baseInstance;
*/
//Scratch buffer shared by the translucency passes. main() uses it in two ways:
// - entries indexed by the distance bucket are counters, bumped with atomicAdd to hand out draw slots
// - entries from TRANSLUCENT_WRITE_BASE onward are read as draw ids, one per translucent draw
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
uint[] translucentCommandData;
};
//Note: if i want reverse indexing i need to use the index buffer offset to offset
//Fills in one indirect draw command and stores it at cmdBuffer[idx].
// idx       - slot in cmdBuffer to overwrite
// instance  - value stored as baseInstance
// offset    - quad offset; stored as baseVertex after scaling by 4 (<<2)
// quadCount - number of quads; 6 indices are emitted per quad
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
    DrawCommand draw;
    draw.baseInstance  = instance;
    draw.baseVertex    = int(offset)<<2;
    draw.firstIndex    = 0;
    draw.instanceCount = 1;
    draw.count         = quadCount * 6;
    cmdBuffer[idx] = draw;
}
//One invocation per translucent draw: looks up the draw id recorded for this invocation, recomputes the
// section's distance bucket, claims a slot inside that bucket and writes the draw command there.
// Buckets are laid out far-to-near (large distance -> low bucket index).
void main() {
    uint slot = gl_GlobalInvocationID.x;
    //The dispatch may cover more invocations than there are translucent draws
    if (slot >= translucentDrawCount) {
        return;
    }

    //Draw ids are stored after the TRANSLUCENT_WRITE_BASE counter entries
    uint drawId = translucentCommandData[slot+TRANSLUCENT_WRITE_BASE];
    SectionMeta meta = sectionData[indirectLookup[drawId]];
    uint detail = extractDetail(meta);

    //Manhattan distance from the base section, scaled by the detail level
    uvec3 rel = abs(extractPosition(meta)-(baseSectionPos>>detail));
    uint dist = (rel.x+rel.y+rel.z)<<detail;

    //Invert so that far sections land in low buckets. The raw result lies in [0, TRANSLUCENT_WRITE_BASE],
    // but only indices below TRANSLUCENT_WRITE_BASE are counters; index TRANSLUCENT_WRITE_BASE is the first
    // draw id entry. Clamp dist == 0 into the last counter bucket so a draw id is never used as a counter.
    // Every other bucket is unchanged, so this stays in step with the pass that counted the draws.
    uint bucket = TRANSLUCENT_WRITE_BASE-min(dist, TRANSLUCENT_WRITE_BASE);
    bucket = min(bucket, uint(TRANSLUCENT_WRITE_BASE)-1u);

    //The counter returns this draw's position within the translucent command range
    uint drawPtr = atomicAdd(translucentCommandData[bucket],1)+TRANSLUCENT_OFFSET;
    writeCmd(drawPtr, drawId, extractQuadStart(meta), meta.cntA&0xFFFF);
}

View File

@@ -11,10 +11,8 @@ layout(local_size_x = 128) in;
#define POSITION_SCRATCH_BINDING 6 #define POSITION_SCRATCH_BINDING 6
#define POSITION_SCRATCH_ACCESS writeonly #define POSITION_SCRATCH_ACCESS writeonly
#import <voxy:lod/quad_format.glsl>
#import <voxy:lod/gl46/bindings.glsl> #import <voxy:lod/gl46/bindings.glsl>
#import <voxy:lod/section.glsl> #import <voxy:lod/section.glsl>
#line 11
//https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_16bit_storage.txt //https://github.com/KhronosGroup/GLSL/blob/master/extensions/ext/GL_EXT_shader_16bit_storage.txt
// adds support for uint8_t which can use for compact visibility buffer // adds support for uint8_t which can use for compact visibility buffer
@@ -35,6 +33,9 @@ layout(binding = STATISTICS_BUFFER_BINDING, std430) restrict buffer statisticsBu
uint baseInstance; uint baseInstance;
*/ */
layout(binding = TRANSLUCENT_DISTANCE_BUFFER_BINDING, std430) restrict buffer TranslucentCommandCount {
uint[] translucentCommandData;
};
//Note: if i want reverse indexing i need to use the index buffer offset to offset //Note: if i want reverse indexing i need to use the index buffer offset to offset
void writeCmd(uint idx, uint instance, uint offset, uint quadCount) { void writeCmd(uint idx, uint instance, uint offset, uint quadCount) {
@@ -108,8 +109,12 @@ void main() {
//Translucency //Translucency
count = meta.cntA&0xFFFF; count = meta.cntA&0xFFFF;
if (count != 0) { if (count != 0) {
uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + TRANSLUCENT_OFFSET;//FIXME: dont hardcode this offset uint tp = atomicAdd(translucentDrawCount, 1)+TRANSLUCENT_WRITE_BASE;
writeCmd(translucentCommandPtr, drawId, ptr, count); translucentCommandData[tp] = drawId;
uvec3 absRel = abs(relative);
uint distToCamera = (absRel.x+absRel.y+absRel.z)<<detail;
distToCamera = TRANSLUCENT_WRITE_BASE-min(distToCamera, TRANSLUCENT_WRITE_BASE);
atomicAdd(translucentCommandData[distToCamera], 1);
#ifdef HAS_STATISTICS #ifdef HAS_STATISTICS
totalQuads += count; totalQuads += count;
#endif #endif

View File

@@ -0,0 +1,57 @@
#version 460
#extension GL_KHR_shader_subgroup_arithmetic: require
#extension GL_KHR_shader_subgroup_basic : require
#define WORK_SIZE 256
//Does initial parallel prefix sum on batches of WORK_SIZE
layout(local_size_x=WORK_SIZE) in;
//Buffer that is scanned in place: main() reads ioCount[gid] and writes the result back to the same element.
// Each invocation handles one uvec4, i.e. four consecutive uint values
layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer {
uvec4[] ioCount;
};
shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
//Exclusive prefix sum over WORK_SIZE uvec4 elements (4*WORK_SIZE uints), performed in place on ioCount.
// Steps: scan the 4 values of each invocation, scan the per-invocation totals inside each subgroup,
// scan the per-subgroup totals in subgroup 0, then add everything together.
// Assumes invocations are packed into subgroups in local-invocation order and that the number of
// subgroups does not exceed min(gl_SubgroupSize, 32), e.g. subgroup sizes 16, 32, 64 with WORK_SIZE 256.
void main() {
    //Clear the per-subgroup totals. Indexed by local invocation rather than subgroup lane so the write
    // stays inside the 32 entry array even when the subgroup is wider than 32
    if (gl_LocalInvocationID.x < 32u) {
        warpPrefixSum[gl_LocalInvocationID.x] = 0u;
    }
    barrier();

    uint gid = gl_GlobalInvocationID.x;
    uvec4 count = uvec4(0);
    uint sum = 0;
    {
        //Exclusive scan of the 4 local values, sum becomes their total
        uvec4 dat = ioCount[gid];
        count.yzw = dat.xyz;
        count.z += count.y;
        count.w += count.z;
        sum = count.w + dat.w;
    }

    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
    //Offset by the total of all lower lanes in this subgroup
    count += subgroupExclusiveAdd(sum);

    //The last lane holds the inclusive total of the whole subgroup. Use the real subgroup size
    // instead of assuming 32 lanes so that exactly one lane per subgroup publishes it
    if (gl_SubgroupInvocationID == gl_SubgroupSize - 1u) {
        warpPrefixSum[gl_SubgroupID] = count.x+sum;
    }
    barrier();

    if (gl_SubgroupID == 0) {
        //Lanes beyond the array contribute 0 and do not write back
        bool inRange = gl_SubgroupInvocationID < 32u;
        uint val = inRange ? warpPrefixSum[gl_SubgroupInvocationID] : 0u;
        subgroupBarrier();
        //Use one subgroup to scan all subgroup totals in a single reduction
        uint scanned = subgroupExclusiveAdd(val);
        if (inRange) {
            warpPrefixSum[gl_SubgroupInvocationID] = scanned;
        }
    }
    barrier();

    //Add the total of all lower subgroups and store the result
    count += warpPrefixSum[gl_SubgroupID];
    ioCount[gid] = count;
}