From b9a3d18b561f0fa0cdfab1861f4c710818c48817 Mon Sep 17 00:00:00 2001 From: mcrcortex <18544518+MCRcortex@users.noreply.github.com> Date: Sun, 15 Sep 2024 12:10:32 +1000 Subject: [PATCH] Incremental traversal system works --- src/main/java/me/cortex/voxy/client/Voxy.java | 5 + .../client/core/gl/shader/PrintfInjector.java | 4 +- .../voxy/client/core/gl/shader/Shader.java | 8 ++ .../core/rendering/PrintfDebugUtil.java | 9 +- .../client/core/rendering/RenderService.java | 3 +- .../HierarchicalOcclusionTraverser.java | 105 ++++++++++++++++-- .../me/cortex/voxy/commonImpl/VoxyCommon.java | 1 - .../voxy/shaders/lod/hierarchical/queue.glsl | 50 +++++++++ .../shaders/lod/hierarchical/traversal.comp | 41 ++++--- .../lod/hierarchical/traversal_dev.comp | 16 +++ 10 files changed, 207 insertions(+), 35 deletions(-) create mode 100644 src/main/resources/assets/voxy/shaders/lod/hierarchical/queue.glsl create mode 100644 src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp diff --git a/src/main/java/me/cortex/voxy/client/Voxy.java b/src/main/java/me/cortex/voxy/client/Voxy.java index ac88d6a8..651d7ba4 100644 --- a/src/main/java/me/cortex/voxy/client/Voxy.java +++ b/src/main/java/me/cortex/voxy/client/Voxy.java @@ -16,6 +16,11 @@ import net.minecraft.client.world.ClientWorld; import java.util.Arrays; public class Voxy implements ClientModInitializer { + public static final boolean SHADER_DEBUG; + static { + SHADER_DEBUG = System.getProperty("voxy.shaderDebug", "false").equals("true"); + } + @Override public void onInitializeClient() { ClientCommandRegistrationCallback.EVENT.register((dispatcher, registryAccess) -> { diff --git a/src/main/java/me/cortex/voxy/client/core/gl/shader/PrintfInjector.java b/src/main/java/me/cortex/voxy/client/core/gl/shader/PrintfInjector.java index ece46b3b..ff57be8d 100644 --- a/src/main/java/me/cortex/voxy/client/core/gl/shader/PrintfInjector.java +++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/PrintfInjector.java @@ 
-163,7 +163,7 @@ public class PrintfInjector implements IShaderProcessor { for (int i = 0; i < types.size(); i++) { subCode.append("printfOutputStruct.stream[printfWriteIndex+").append(i+1).append("]="); - if (types.get(i) == 'd' || types.get(i) == 'i') { + if (types.get(i) == 'd') { subCode.append("uint(").append(argVals.get(i)).append(")"); } else if (types.get(i) == 'f') { subCode.append("floatBitsToUint(").append(argVals.get(i)).append(")"); @@ -207,7 +207,7 @@ public class PrintfInjector implements IShaderProcessor { parsePrintfTypes(fmt, types); Object[] args = new Object[types.size()]; for (int i = 0; i < types.size(); i++) { - if (types.get(i) == 'd' || types.get(i) == 'i') { + if (types.get(i) == 'd') { args[i] = MemoryUtil.memGetInt(ptr); ptr += 4; cnt++; diff --git a/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java b/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java index 5e9a069d..64da2c33 100644 --- a/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java +++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java @@ -56,6 +56,14 @@ public class Shader extends TrackedObject { return this; } + //Useful for inline setting (such as debug) + public Builder defineIf(String name, boolean condition) { + if (condition) { + this.defines.put(name, ""); + } + return this; + } + public Builder define(String name, int value) { this.defines.put(name, Integer.toString(value)); return this; diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/PrintfDebugUtil.java b/src/main/java/me/cortex/voxy/client/core/rendering/PrintfDebugUtil.java index 83afe345..38303ae8 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/PrintfDebugUtil.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/PrintfDebugUtil.java @@ -1,5 +1,6 @@ package me.cortex.voxy.client.core.rendering; +import me.cortex.voxy.client.Voxy; import me.cortex.voxy.client.core.gl.shader.IShaderProcessor; import 
me.cortex.voxy.client.core.gl.shader.PrintfInjector; @@ -7,7 +8,7 @@ import java.util.ArrayList; import java.util.List; public final class PrintfDebugUtil { - public static final boolean ENABLE_PRINTF_DEBUGGING = System.getProperty("voxy.enableShaderDebugPrintf", "false").equals("true"); + public static final boolean ENABLE_PRINTF_DEBUGGING = System.getProperty("voxy.enableShaderDebugPrintf", "false").equals("true") || Voxy.SHADER_DEBUG; private static final List printfQueue2 = new ArrayList<>(); private static final List printfQueue = new ArrayList<>(); @@ -46,4 +47,10 @@ public final class PrintfDebugUtil { out.addAll(printfQueue2); } } + + public static void bind() { + if (ENABLE_PRINTF_DEBUGGING) { + PRINTF_object.bind(); + } + } } diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java index 459431c1..6122d254 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java @@ -72,6 +72,7 @@ public class RenderService, J extends Vi Arrays.stream(world.getMapper().getBiomeEntries()).forEach(this.modelService::addBiome); world.getMapper().setBiomeCallback(this.modelService::addBiome); + /* final int H_WIDTH = 1; for (int x = -H_WIDTH; x <= H_WIDTH; x++) { for (int y = -1; y <= 0; y++) { @@ -80,7 +81,7 @@ public class RenderService, J extends Vi } } } - + */ } public void setup(Camera camera) { diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical2/HierarchicalOcclusionTraverser.java b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical2/HierarchicalOcclusionTraverser.java index 035c2405..c0e7ea8c 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical2/HierarchicalOcclusionTraverser.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical2/HierarchicalOcclusionTraverser.java @@ -1,8 +1,10 @@ package 
me.cortex.voxy.client.core.rendering.hierachical2; +import me.cortex.voxy.client.Voxy; import me.cortex.voxy.client.core.gl.GlBuffer; import me.cortex.voxy.client.core.gl.shader.Shader; import me.cortex.voxy.client.core.gl.shader.ShaderType; +import me.cortex.voxy.client.core.rendering.PrintfDebugUtil; import me.cortex.voxy.client.core.rendering.util.HiZBuffer; import me.cortex.voxy.client.core.rendering.Viewport; import me.cortex.voxy.client.core.rendering.util.DownloadStream; @@ -10,13 +12,16 @@ import me.cortex.voxy.client.core.rendering.util.UploadStream; import org.lwjgl.system.MemoryUtil; import static me.cortex.voxy.client.core.rendering.PrintfDebugUtil.PRINTF_object; -import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT; -import static org.lwjgl.opengl.GL30.GL_R32UI; +import static org.lwjgl.opengl.GL11.*; +import static org.lwjgl.opengl.GL12.GL_UNPACK_IMAGE_HEIGHT; +import static org.lwjgl.opengl.GL12.GL_UNPACK_SKIP_IMAGES; +import static org.lwjgl.opengl.GL30.*; import static org.lwjgl.opengl.GL30C.GL_RED_INTEGER; import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BARRIER_BIT; -import static org.lwjgl.opengl.GL45.nglClearNamedBufferSubData; +import static org.lwjgl.opengl.GL45.*; +// TODO: swap to persistent gpu threads instead of dispatching MAX_ITERATIONS of compute layers public class HierarchicalOcclusionTraverser { private final HierarchicalNodeManager nodeManager; @@ -27,17 +32,31 @@ public class HierarchicalOcclusionTraverser { private final GlBuffer uniformBuffer = new GlBuffer(1024).zero(); private final GlBuffer renderList = new GlBuffer(100_000 * 4 + 4).zero();//100k sections max to render, TODO: Maybe move to render service or somewhere else - private final GlBuffer scratchBuffer = new GlBuffer(1024).zero();//Scratch utility buffer for small things to get the ordering right and memory overall - //Scratch queues for node traversal + private final GlBuffer queueMetaBuffer = new 
GlBuffer(4*4*5).zero(); private final GlBuffer scratchQueueA = new GlBuffer(10_000*4).zero(); private final GlBuffer scratchQueueB = new GlBuffer(10_000*4).zero(); + private static final int LOCAL_WORK_SIZE_BITS = 5; + private static final int MAX_ITERATIONS = 5; + private static final int NODE_QUEUE_INDEX_BINDING = 1; + private static final int NODE_QUEUE_META_BINDING = 2; + private static final int NODE_QUEUE_SOURCE_BINDING = 3; + private static final int NODE_QUEUE_SINK_BINDING = 4; private final HiZBuffer hiZBuffer = new HiZBuffer(); private final Shader traversal = Shader.make(PRINTF_object) - .add(ShaderType.COMPUTE, "voxy:lod/hierarchical/traversal.comp") + .defineIf("DEBUG", Voxy.SHADER_DEBUG) + .define("MAX_ITERATIONS", MAX_ITERATIONS) + .define("LOCAL_SIZE_BITS", LOCAL_WORK_SIZE_BITS) + + .define("NODE_QUEUE_INDEX_BINDING", NODE_QUEUE_INDEX_BINDING) + .define("NODE_QUEUE_META_BINDING", NODE_QUEUE_META_BINDING) + .define("NODE_QUEUE_SOURCE_BINDING", NODE_QUEUE_SOURCE_BINDING) + .define("NODE_QUEUE_SINK_BINDING", NODE_QUEUE_SINK_BINDING) + + .add(ShaderType.COMPUTE, "voxy:lod/hierarchical/traversal_dev.comp") .compile(); @@ -53,7 +72,8 @@ public class HierarchicalOcclusionTraverser { } private void bindings() { - + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, NODE_QUEUE_META_BINDING, this.queueMetaBuffer.id); + glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, this.queueMetaBuffer.id); } public void doTraversal(Viewport viewport, int depthBuffer) { @@ -61,19 +81,78 @@ public class HierarchicalOcclusionTraverser { this.hiZBuffer.buildMipChain(depthBuffer, viewport.width, viewport.height); this.uploadUniform(viewport); - UploadStream.INSTANCE.commit(); + //UploadStream.INSTANCE.commit(); //Done inside traversal this.traversal.bind(); this.bindings(); + PrintfDebugUtil.bind(); - //Use a chain of glDispatchComputeIndirect (5 times) with alternating read/write buffers - // TODO: swap to persistent gpu thread instead - + this.traverseInternal(1); 
this.downloadResetRequestQueue(); } + private void traverseInternal(int initialQueueSize) { + { + //Fix mesa bug + glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); + glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, 0); + glPixelStorei(GL_UNPACK_SKIP_PIXELS, 0); + glPixelStorei(GL_UNPACK_SKIP_ROWS, 0); + glPixelStorei(GL_UNPACK_SKIP_IMAGES, 0); + } + + int firstDispatchSize = (initialQueueSize+(1<<LOCAL_WORK_SIZE_BITS)-1)>>LOCAL_WORK_SIZE_BITS; + /* + //prime the queue Todo: maybe move after the traversal? cause then it is more efficient work since it doesnt need to wait for this before starting? + glClearNamedBufferData(this.queueMetaBuffer.id, GL_RGBA32UI, GL_RGBA, GL_UNSIGNED_INT, new int[]{0,1,1,0});//Prime the metadata buffer, which also contains + + //Set the first entry + glClearNamedBufferSubData(this.queueMetaBuffer.id, GL_RGBA32UI, 0, 16, GL_RGBA, GL_UNSIGNED_INT, new int[]{firstDispatchSize,1,1,initialQueueSize}); + */ + { + long ptr = UploadStream.INSTANCE.upload(this.queueMetaBuffer, 0, 16*5); + MemoryUtil.memPutInt(ptr + 0, firstDispatchSize); + MemoryUtil.memPutInt(ptr + 4, 1); + MemoryUtil.memPutInt(ptr + 8, 1); + MemoryUtil.memPutInt(ptr + 12, initialQueueSize); + for (int i = 1; i < 5; i++) { + MemoryUtil.memPutInt(ptr + (i*16)+ 0, 0); + MemoryUtil.memPutInt(ptr + (i*16)+ 4, 1); + MemoryUtil.memPutInt(ptr + (i*16)+ 8, 1); + MemoryUtil.memPutInt(ptr + (i*16)+12, 0); + } + + UploadStream.INSTANCE.commit(); + } + + glUniform1ui(NODE_QUEUE_INDEX_BINDING, 0); + + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, NODE_QUEUE_SOURCE_BINDING, this.scratchQueueA.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, NODE_QUEUE_SINK_BINDING, this.scratchQueueB.id); + + //Dont need to use indirect to dispatch the first iteration + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT|GL_COMMAND_BARRIER_BIT); + glDispatchCompute(firstDispatchSize, 1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT|GL_COMMAND_BARRIER_BIT); + + //Dispatch max iterations + for (int iter = 1; iter < MAX_ITERATIONS; iter++) { +
glUniform1ui(NODE_QUEUE_INDEX_BINDING, iter); + + //Flipflop buffers + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, NODE_QUEUE_SOURCE_BINDING, ((iter & 1) == 0 ? this.scratchQueueA : this.scratchQueueB).id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, NODE_QUEUE_SINK_BINDING, ((iter & 1) == 0 ? this.scratchQueueB : this.scratchQueueA).id); + + //Dispatch and barrier + glDispatchComputeIndirect(iter * 4 * 4); + + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT | GL_COMMAND_BARRIER_BIT); + } + } + + private void downloadResetRequestQueue() { glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); DownloadStream.INSTANCE.download(this.requestBuffer, this::forwardDownloadResult); @@ -108,6 +187,8 @@ public class HierarchicalOcclusionTraverser { this.nodeBuffer.free(); this.uniformBuffer.free(); this.renderList.free(); - this.scratchBuffer.free(); + this.queueMetaBuffer.free(); + this.scratchQueueA.free(); + this.scratchQueueB.free(); } } diff --git a/src/main/java/me/cortex/voxy/commonImpl/VoxyCommon.java b/src/main/java/me/cortex/voxy/commonImpl/VoxyCommon.java index 5b257373..533f0784 100644 --- a/src/main/java/me/cortex/voxy/commonImpl/VoxyCommon.java +++ b/src/main/java/me/cortex/voxy/commonImpl/VoxyCommon.java @@ -18,7 +18,6 @@ public class VoxyCommon implements ModInitializer { var commit = mod.getMetadata().getCustomValue("commit").getAsString(); MOD_VERSION = version+"-"+commit; IS_DEDICATED_SERVER = FabricLoader.getInstance().getEnvironmentType() == EnvType.SERVER; - Serialization.init(); } diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/queue.glsl b/src/main/resources/assets/voxy/shaders/lod/hierarchical/queue.glsl new file mode 100644 index 00000000..e7317e08 --- /dev/null +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/queue.glsl @@ -0,0 +1,50 @@ +#define SENTINAL_OUT_OF_BOUNDS uint(-1) + +layout(location = NODE_QUEUE_INDEX_BINDING) uniform uint queueIdx; + +layout(binding = NODE_QUEUE_META_BINDING, std430) restrict buffer 
NodeQueueMeta { + uvec4 nodeQueueMetadata[MAX_ITERATIONS]; +}; + +layout(binding = NODE_QUEUE_SOURCE_BINDING, std430) restrict readonly buffer NodeQueueSource { + uint[] nodeQueueSource; +}; + +layout(binding = NODE_QUEUE_SINK_BINDING, std430) restrict writeonly buffer NodeQueueSink { + uint[] nodeQueueSink; +}; + +uint getCurrentNode() { + if (nodeQueueMetadata[queueIdx].w <= gl_GlobalInvocationID.x) { + return SENTINAL_OUT_OF_BOUNDS; + } + return nodeQueueSource[gl_GlobalInvocationID.x]; +} + +uint nodePushIndex = -1; +void pushNodesInit(uint nodeCount) { + //Debug + #ifdef DEBUG + if (queueIdx >= (MAX_ITERATIONS-1)) { + printf("LOG: Traversal tried inserting a node into next iteration, which is outside max iteration bounds. GID: %d, count: %d", gl_GlobalInvocationID.x, nodeCount); + nodePushIndex = -1; + return; + } + #endif + + uint index = atomicAdd(nodeQueueMetadata[queueIdx+1].w, nodeCount); + //Increment first metadata value if it changes threash hold + uint inc = ((index+LOCAL_SIZE)>>LOCAL_SIZE_BITS)-(index>>LOCAL_SIZE_BITS); + atomicAdd(nodeQueueMetadata[queueIdx+1].x, inc);//TODO: see if making this conditional on inc != 0 is faster + nodePushIndex = index; +} + +void pushNode(uint nodeId) { + #ifdef DEBUG + if (nodePushIndex == -1) { + printf("LOG: Tried pushing node when push node wasnt successful. 
GID: %d, pushing: %d", gl_GlobalInvocationID.x, nodeId); + return; + } + #endif + nodeQueueSink[nodePushIndex++] = nodeId; +} diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp index 1f4350a2..c1644289 100644 --- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp @@ -63,23 +63,6 @@ layout(binding = DEBUG_RENDER_NODE_INDEX, std430) restrict buffer DebugRenderNod //Contains all the screenspace computation #import -//If a request is successfully added to the RequestQueue, must update NodeData to mark that the node has been put into the request queue -// to prevent it from being requested every frame and blocking the queue - - -//Once a suitable render section is found, it is put into the RenderQueue, or if its not availbe its put into the RequestQueue -// and its children are rendered instead if it has them avalible - -//NOTE: EXPERIMENT: INSTEAD OF PERSISTENT THREADS -//TODO: since we know the tree depth is worst case 5, we can just do an indirect dispatch 5 times one for each layer -// issues with this approach, barriers and waiting for one to finish before the otehr can be executed -// advantages, MUCH SIMPLER, no shader barriers needed really , issue is need a flipflip queue but thats ok, -// also ensures the gpu is full of work capacity -// this might be what i do to start with since its much easier to do -// not sure - - - void addRequest(inout UnpackedNode node) { if (!hasRequested(node)) { //printf("Request %d %d %d %d", node.nodeId, node.flags, node.meshPtr, node.childPtr); @@ -172,10 +155,32 @@ void main() { /* +Persistent threading + //Thread 0 grabs a batch when empty void main() { while (true) { //Each thread processes an entry on the queue and pushes all children to the queue if it is determined the children need to be added } } -*/ \ No newline at end 
of file +*/ + + + + +//If a request is successfully added to the RequestQueue, must update NodeData to mark that the node has been put into the request queue +// to prevent it from being requested every frame and blocking the queue + + +//Once a suitable render section is found, it is put into the RenderQueue, or if its not availbe its put into the RequestQueue +// and its children are rendered instead if it has them avalible + +//NOTE: EXPERIMENT: INSTEAD OF PERSISTENT THREADS +//TODO: since we know the tree depth is worst case 5, we can just do an indirect dispatch 5 times one for each layer +// issues with this approach, barriers and waiting for one to finish before the otehr can be executed +// advantages, MUCH SIMPLER, no shader barriers needed really , issue is need a flipflip queue but thats ok, +// also ensures the gpu is full of work capacity +// this might be what i do to start with since its much easier to do +// not sure + + diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp new file mode 100644 index 00000000..a1f68274 --- /dev/null +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp @@ -0,0 +1,16 @@ +#version 460 core + +//TODO: increase local size +#define LOCAL_SIZE_MSK ((1<<LOCAL_SIZE_BITS)-1) +#define LOCAL_SIZE (1<<LOCAL_SIZE_BITS) +layout(local_size_x=LOCAL_SIZE) in; +#import <voxy:lod/hierarchical/queue.glsl> + +void main() { + uint node = getCurrentNode(); + if (node != SENTINAL_OUT_OF_BOUNDS) { + printf("GID:%d, NODE %d, %d, AA, %d, %d, %d, %d", gl_GlobalInvocationID.x, node, queueIdx, nodeQueueMetadata[queueIdx].x, nodeQueueMetadata[queueIdx].y, nodeQueueMetadata[queueIdx].z, nodeQueueMetadata[queueIdx].w); + pushNodesInit(1); + pushNode(node); + } +} \ No newline at end of file