From 72e35557a420675b8e1e7bf93ef6ba426792e768 Mon Sep 17 00:00:00 2001 From: mcrcortex <{ID}+{username}@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:14:53 +1000 Subject: [PATCH] it works! --- build.gradle | 2 +- .../rendering/Gl46HierarchicalRenderer.java | 21 ++++++- .../HierarchicalOcclusionRenderer.java | 58 ++++++++++++++++--- .../rendering/hierarchical/NodeManager.java | 37 +++++++++--- .../voxy/common/util/HierarchicalBitSet.java | 48 ++++++++++++++- .../lod/hierarchical/binding_points.glsl | 1 + .../voxy/shaders/lod/hierarchical/node.glsl | 4 ++ .../shaders/lod/hierarchical/screenspace.glsl | 4 +- .../shaders/lod/hierarchical/traversal.comp | 28 +++++++-- 9 files changed, 175 insertions(+), 28 deletions(-) diff --git a/build.gradle b/build.gradle index 1a5a6c6b..ea17aea3 100644 --- a/build.gradle +++ b/build.gradle @@ -74,7 +74,7 @@ dependencies { modRuntimeOnly("maven.modrinth:spark:1.10.73-fabric") modRuntimeOnly("maven.modrinth:fabric-permissions-api:0.3.1") - modRuntimeOnly("maven.modrinth:nsight-loader:1.2.0") + //modRuntimeOnly("maven.modrinth:nsight-loader:1.2.0") modImplementation('io.github.douira:glsl-transformer:2.0.1') } diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/Gl46HierarchicalRenderer.java b/src/main/java/me/cortex/voxy/client/core/rendering/Gl46HierarchicalRenderer.java index e834036a..79c9d31a 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/Gl46HierarchicalRenderer.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/Gl46HierarchicalRenderer.java @@ -12,6 +12,7 @@ import me.cortex.voxy.client.core.rendering.building.RenderGenerationService; import me.cortex.voxy.client.core.rendering.hierarchical.HierarchicalOcclusionRenderer; import me.cortex.voxy.client.core.rendering.hierarchical.INodeInteractor; import me.cortex.voxy.client.core.rendering.hierarchical.MeshManager; +import me.cortex.voxy.client.core.rendering.util.DownloadStream; import me.cortex.voxy.client.core.rendering.util.UploadStream; import me.cortex.voxy.client.mixin.joml.AccessFrustumIntersection; import me.cortex.voxy.common.world.WorldEngine; @@ -27,6 +28,7 @@ import org.joml.Matrix4f; import org.joml.Vector3f; import org.lwjgl.system.MemoryUtil; +import java.util.ArrayList; import java.util.List; import java.util.concurrent.ConcurrentLinkedDeque; import java.util.function.Consumer; @@ -52,7 +54,10 @@ import static org.lwjgl.opengl.GL45.nglClearNamedBufferSubData; public class Gl46HierarchicalRenderer implements IRenderInterface, AbstractRenderWorldInteractor { private final HierarchicalOcclusionRenderer sectionSelector; private final MeshManager meshManager = new MeshManager(); - private final PrintfInjector printf = new PrintfInjector(100000, 10, System.out::println); + + private final List printfQueue = new ArrayList<>(); + private final PrintfInjector printf = new PrintfInjector(100000, 10, this.printfQueue::add); + private final GlBuffer renderSections = new GlBuffer(100_000 * 4 + 4).zero(); @@ -99,6 +104,11 @@ public class Gl46HierarchicalRenderer implements IRenderInterface debug) { - + debug.add("Printf Queue: "); + debug.addAll(this.printfQueue); + for (String a : this.printfQueue) { + if (a.startsWith("LOG")) { + System.err.println(a); + } + } + this.printfQueue.clear(); } diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/HierarchicalOcclusionRenderer.java b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/HierarchicalOcclusionRenderer.java index 2bac563d..9fc93fa5 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/HierarchicalOcclusionRenderer.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/HierarchicalOcclusionRenderer.java @@ -13,12 +13,13 @@ import org.joml.Matrix4f; import org.joml.Vector3f; import org.lwjgl.system.MemoryUtil; +import static org.lwjgl.opengl.ARBDirectStateAccess.nglClearNamedBufferSubData; +import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT; +import static org.lwjgl.opengl.GL30.GL_R32UI; import static org.lwjgl.opengl.GL30.glBindBufferBase; import static org.lwjgl.opengl.GL33.glBindSampler; import static org.lwjgl.opengl.GL33.glGenSamplers; -import static org.lwjgl.opengl.GL42C.*; -import static org.lwjgl.opengl.GL43.GL_SHADER_STORAGE_BUFFER; -import static org.lwjgl.opengl.GL43.glDispatchCompute; +import static org.lwjgl.opengl.GL43.*; import static org.lwjgl.opengl.GL45.glBindTextureUnit; public class HierarchicalOcclusionRenderer { @@ -30,12 +31,14 @@ public class HierarchicalOcclusionRenderer { private final Shader hierarchicalTraversal; private final PrintfInjector printf; - private final GlBuffer nodeQueue; + private final GlBuffer nodeQueueA; + private final GlBuffer nodeQueueB; private final GlBuffer uniformBuffer; public HierarchicalOcclusionRenderer(INodeInteractor interactor, MeshManager mesh, PrintfInjector printf) { this.nodeManager = new NodeManager(interactor, mesh); - this.nodeQueue = new GlBuffer(1000000*4+4).zero(); + this.nodeQueueA = new GlBuffer(1000000*4+4).zero(); + this.nodeQueueB = new GlBuffer(1000000*4+4).zero(); this.uniformBuffer = new GlBuffer(1024).zero(); this.printf = printf; this.hierarchicalTraversal = Shader.make(printf) @@ -63,11 +66,22 @@ public class HierarchicalOcclusionRenderer { MemoryUtil.memPutInt(ptr, NodeManager.REQUEST_QUEUE_SIZE); ptr += 4; MemoryUtil.memPutInt(ptr, 1000000); ptr += 4; + + //decendSSS (decend screen space size) + MemoryUtil.memPutFloat(ptr, 128*128); ptr += 4; } public void doHierarchicalTraversalSelection(Gl46HierarchicalViewport viewport, int depthBuffer, GlBuffer renderSelectionResult) { this.uploadUniform(viewport); this.nodeManager.upload(); + + { + long ptr = UploadStream.INSTANCE.upload(this.nodeQueueA, 0, 8); + MemoryUtil.memPutInt(ptr, 1); ptr += 4; + MemoryUtil.memPutInt(ptr, 0); + } + + UploadStream.INSTANCE.commit(); //Make hiz @@ -78,9 +92,10 @@ public class HierarchicalOcclusionRenderer { { glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniformBuffer.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, this.nodeManager.nodeBuffer.id); - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueue.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueA.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 3, this.nodeManager.requestQueue.id); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 4, renderSelectionResult.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueB.id); //Bind the hiz buffer glBindSampler(0, this.hizSampler); @@ -89,7 +104,35 @@ public class HierarchicalOcclusionRenderer { this.printf.bind(); { //Dispatch hierarchies + nglClearNamedBufferSubData(this.nodeQueueB.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueA.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueB.id); glDispatchCompute(1,1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + nglClearNamedBufferSubData(this.nodeQueueA.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueB.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueA.id); + glDispatchCompute(8,1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + nglClearNamedBufferSubData(this.nodeQueueB.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueA.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueB.id); + glDispatchCompute(16,1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + nglClearNamedBufferSubData(this.nodeQueueA.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueB.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueA.id); + glDispatchCompute(32,1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); + + nglClearNamedBufferSubData(this.nodeQueueB.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, this.nodeQueueA.id); + glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 6, this.nodeQueueB.id); + glDispatchCompute(64,1,1); + glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT); } glBindSampler(0, 0); @@ -98,7 +141,8 @@ public class HierarchicalOcclusionRenderer { } public void free() { - this.nodeQueue.free(); + this.nodeQueueA.free(); + this.nodeQueueB.free(); this.hiz.free(); this.nodeManager.free(); glDeleteSamplers(this.hizSampler); diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java index 6d2aa958..84b063c4 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java @@ -128,8 +128,8 @@ public class NodeManager { this.requestQueue = new GlBuffer(REQUEST_QUEUE_SIZE*4+4); Arrays.fill(this.localNodeData, 0); - - this.setNodePosition(0, WorldEngine.getWorldSectionId(2, 0,0,0)); + this.nodeAllocations.allocateNext(); + this.setNodePosition(0, WorldEngine.getWorldSectionId(4, 0,0,0)); this.setChildPtr(0, NODE_MSK, 0); this.setMeshId(0, MESH_MSK); this.pushNode(0); @@ -148,7 +148,7 @@ public class NodeManager { //Returns the mesh offset/id for the given node or -1 if it doesnt exist private int getNodeMesh(int node) { - return (int) (this.localNodeData[node*3+1]&((1<<24)-1)); + return (int) (this.localNodeData[node*3+1]&MESH_MSK); } private int getNodeChildPtr(int node) { @@ -224,6 +224,7 @@ public class NodeManager { for (int i = 0; i < count; i++) { int requestOp = MemoryUtil.memGetInt(ptr + i*4L); int node = requestOp&NODE_MSK; + System.out.println("Got request for node: " + node); if (this.isLeafNode(node)) { //If its a leaf node and it has a request, it must need the children @@ -255,7 +256,7 @@ public class NodeManager { } else { //If its not a leaf node, it must be missing the inner mesh so request it - if (this.getNodeMesh(node) != -1) { + if (this.getNodeMesh(node) != MESH_MSK) { //Node already has a mesh, ignore it, but might be a sign that an error has occured System.err.println("Requested a mesh for node, however the node already has a mesh"); @@ -312,6 +313,7 @@ public class NodeManager { //TODO: FIXME!! if we get a node that has an update and is watched but no id for it, it could be an update state from // an empty node to non empty node, this means we need to invalidate all the childrens positions and move them! // then also update the parent pointer + //TODO: Also need a way to remove sections, requires shuffling stuff around if (id == NO_NODE) { //The built mesh section is no longer needed, discard it // TODO: could probably?? cache the mesh in ram that way if its requested? it can be immediatly fetched while a newer mesh is built?? @@ -360,7 +362,7 @@ public class NodeManager { if (request.isSatisfied()) { //If request is now satisfied update the internal nodes, create the children and reset + release the request set - this.completeRequest(request); + this.completeLeafRequest(request); //Reset + release request.clear(); @@ -392,20 +394,36 @@ public class NodeManager { } - private void completeRequest(LeafRequest request) { + private void completeLeafRequest(LeafRequest request) { //TODO: need to actually update all of the pos2meshId of the children to point to there new nodes int msk = Byte.toUnsignedInt(request.nonAirMask()); int baseIdx = this.nodeAllocations.allocateNextConsecutiveCounted(Integer.bitCount(msk)); + int cnt = 0; for (int i = 0; i < 8; i++) { if ((msk&(1<>8)&0xFF); - + int a = this.getNodeMesh(id)|((flags&0xFF)<<24); + int b = this.getNodeChildPtr(id)|(((flags>>8)&0xFF)<<24); + System.out.println("Setting mesh " + this.getNodeMesh(id) + " for node " + id); MemoryUtil.memPutInt(dst, a); dst += 4; MemoryUtil.memPutInt(dst, b); dst += 4; } @@ -445,6 +463,7 @@ public class NodeManager { } public void download() { + //this.pushNode(0); //Download the request queue then clear the counter (first 4 bytes) DownloadStream.INSTANCE.download(this.requestQueue, this::processRequestQueue); DownloadStream.INSTANCE.commit(); diff --git a/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java b/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java index 4a44202a..d45dac6b 100644 --- a/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java +++ b/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java @@ -33,9 +33,10 @@ public class HierarchicalBitSet { idx = Long.numberOfTrailingZeros(~cp) + 64*idx; long dp = this.D[idx]; idx = Long.numberOfTrailingZeros(~dp) + 64*idx; + int ret = idx; + dp |= 1L<<(idx&0x3f); this.D[idx>>6] = dp; - int ret = idx; if (dp==-1) { idx >>= 6; cp |= 1L<<(idx&0x3f); @@ -50,9 +51,26 @@ public class HierarchicalBitSet { } } this.cnt++; + return ret; } + private void set(int idx) { + long dp = this.D[idx>>6] |= 1L<<(idx&0x3f); + if (dp==-1) { + idx >>= 6; + long cp = (this.C[idx>>6] |= 1L<<(idx&0x3f)); + if (cp==-1) { + idx >>= 6; + long bp = this.B[idx>>6] |= 1L<<(idx&0x3f); + if (bp==-1) { + this.A |= 1L<<(idx&0x3f); + } + } + } + this.cnt++; + } + //Returns the next free index from idx private int findNextFree(int idx) { int pos = Long.numberOfTrailingZeros((~this.A)|((1L<<(idx>>18))-1)); @@ -66,9 +84,33 @@ public class HierarchicalBitSet { if (this.cnt+count>this.limit) { return -2;//Limit reached } - //At a minimum maybe just do a while loop for testing + //TODO:FIXME DONT DO THIS, do a faster search - return 0; + int i = 0; + while (true) { + boolean isFree = true; + for (int j = 0; j < count; j++) { + if (this.isSet(i+j)) { + isFree = false; + break; + } + } + + if (isFree) { + for (int j = 0; j < count; j++) { + this.set(j + i); + } + return i; + } else { + i++;//THIS IS SLOW BUT WORKS + /* TODO: FIX AND FINISH OPTIMIZATION + i += + while (this.D[i>>6] == -1) { + i++; + } + */ + } + } } diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl b/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl index 390ebac7..32a329de 100644 --- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl @@ -4,6 +4,7 @@ #define REQUEST_QUEUE_INDEX 3 #define RENDER_QUEUE_INDEX 4 #define TRANSFORM_ARRAY_INDEX 5 +#define NEXT_NODE_QUEUE_INDEX 6 //Samplers #define HIZ_BINDING_INDEX 0 diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/node.glsl b/src/main/resources/assets/voxy/shaders/lod/hierarchical/node.glsl index 54c408cf..e4eb2647 100644 --- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/node.glsl +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/node.glsl @@ -76,6 +76,10 @@ uint getChildCount(in UnpackedNode node) { return ((node.flags >> 2)&7U)+1; } +uint getChildPtr(in UnpackedNode node) { + return node.childPtr; +} + uint getTransformIndex(in UnpackedNode node) { return (node.flags >> 5)&31u; } diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/screenspace.glsl b/src/main/resources/assets/voxy/shaders/lod/hierarchical/screenspace.glsl index 7aec7c4b..674dc8a6 100644 --- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/screenspace.glsl +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/screenspace.glsl @@ -47,7 +47,7 @@ void setupScreenspace(in UnpackedNode node) { for (int i = 1; i < 8; i++) { //NOTE!: cant this be precomputed and put in an array?? in the scene uniform?? - vec4 pPoint = (VP*vec4(vec3((i&1)!=0,(i&2)!=0,(i&4)!=0)*32,1));//Size of section is 32x32x32 (need to change it to a bounding box in the future) + vec4 pPoint = (VP*vec4(vec3((i&1)!=0,(i&2)!=0,(i&4)!=0),1))*(32< (64*64F); + return (size.x*size.y*screenW*screenH) > decendSSS; } \ No newline at end of file diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp index 20509603..f661a08d 100644 --- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp +++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp @@ -23,6 +23,7 @@ layout(binding = SCENE_UNIFORM_INDEX, std140) uniform SceneUniform { uint screenH; uint requestQueueMaxSize; uint renderQueueMaxSize; + float decendSSS; }; layout(binding = NODE_QUEUE_INDEX, std430) restrict buffer NodeQueue { @@ -40,6 +41,11 @@ layout(binding = RENDER_QUEUE_INDEX, std430) restrict buffer RenderQueue { uint[] renderQueue; }; +layout(binding = NEXT_NODE_QUEUE_INDEX, std430) restrict buffer NextNodeQueue { + uint nextNodeQueueIndex; + uint[] nextNodeQueue; +}; + /* layout(binding = 2, std430) restrict buffer QueueData { @@ -78,7 +84,7 @@ layout(binding = 2, std430) restrict buffer QueueData { void addRequest(inout UnpackedNode node) { if (!hasRequested(node)) { - printf("requested"); + printf("LOG: Request %d %d %d %d", node.nodeId, node.flags, node.meshPtr, node.childPtr); //TODO: maybe try using only 1 variable and it being <0 being bad if (requestQueueIndex < requestQueueMaxSize) { //Mark node as having a request submitted to prevent duplicate submissions @@ -90,10 +96,16 @@ void addRequest(inout UnpackedNode node) { void enqueueChildren(in UnpackedNode node) { //printf("children"); + uint children = getChildCount(node); + uint ptr = getChildPtr(node); + uint widx = atomicAdd(nextNodeQueueIndex, children); + for (int i = 0; i < children; i++) { + nextNodeQueue[widx+i] = ptr+i; + } } void enqueueSelfForRender(in UnpackedNode node) { - //printf("render"); + printf("render %d@[%d,%d,%d]", node.lodLevel, node.pos.x, node.pos.y, node.pos.z); if (renderQueueIndex < renderQueueMaxSize) { renderQueue[atomicAdd(renderQueueIndex, 1)] = getMesh(node); } @@ -101,15 +113,18 @@ void enqueueSelfForRender(in UnpackedNode node) { //TODO: need to add an empty mesh, as a parent node might not have anything to render but the children do?? void main() { - UnpackedNode node; + if (gl_GlobalInvocationID.x>=nodeQueueSize) { + return; + } + UnpackedNode node; //Setup/unpack the node unpackNode(node, nodeQueue[gl_GlobalInvocationID.x]); - //TODO: check the node is OK first??? maybe? //Compute screenspace setupScreenspace(node); + //printf("Node %d@[%d,%d,%d] - %d - %f", node.lodLevel, node.pos.x, node.pos.y, node.pos.z, node.flags, (size.x*size.y*screenW*screenH)); //debugDumpNode(node); @@ -117,14 +132,17 @@ void main() { //printf("HizCulled"); //We are done here, dont do any more, the issue is the shader barriers maybe // its culled, maybe just mark it as culled? + printf("Cull"); } else { //It is visible, TODO: maybe do a more detailed hiz test? (or make it so that ) //Only decend if not a root node if (node.lodLevel!=0 && shouldDecend()) { if (hasChildren(node)) { + //printf("A"); enqueueChildren(node); } else { + //printf("B"); addRequest(node); //TODO: use self mesh (is error state if it doesnt have one since all leaf nodes should have a mesh) // Basicly guarenteed to have a mesh, if it doesnt it is very very bad and incorect since its a violation of the graph properties @@ -133,8 +151,10 @@ void main() { } } else { if (hasMesh(node)) { + //printf("C"); enqueueSelfForRender(node); } else { + //printf("D"); //!! not ideal, we want to render this mesh but dont have it. If we havent sent a request // then send a request for a mesh for this node. addRequest(node);