basic translucency

2025-04-05 16:55:47 +10:00
parent 9e5e5e654d
commit 7cc92a533d
12 changed files with 188 additions and 267 deletions
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/cmdgen.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/cmdgen.comp
@@ -60,8 +60,8 @@ void main() {

    //Note! its not with respect to the sectionId
    //
-    //Check the occlusion data from last frame
-    bool shouldRender = visibilityData[gl_GlobalInvocationID.x] == frameId - 1;
+    //Check the occlusion data from the current frame
+    bool shouldRender = visibilityData[gl_GlobalInvocationID.x] == frameId;

    //Clear the occlusion data (not strictly? needed? i think???)
    //visibilityData[gl_GlobalInvocationID.x] = 0;
@@ -100,8 +100,8 @@ void main() {
        //Translucency
        count = meta.cntA&0xFFFF;
        if (count != 0) {
-            //uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + 400000;//FIXME: dont hardcode this offset
-            //writeCmd(translucentCommandPtr, drawId, ptr, count);
+            uint translucentCommandPtr = atomicAdd(translucentDrawCount, 1) + TRANSLUCENT_OFFSET;//FIXME: dont hardcode this offset
+            writeCmd(translucentCommandPtr, drawId, ptr, count);
        }
        ptr += count;

--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/binding_points.glsl
@@ -1,15 +0,0 @@
-#define SCENE_UNIFORM_INDEX 0
-#define NODE_DATA_INDEX 1
-#define NODE_QUEUE_INDEX 2
-#define REQUEST_QUEUE_INDEX 3
-#define RENDER_QUEUE_INDEX 4
-#define TRANSFORM_ARRAY_INDEX 5
-#define NEXT_NODE_QUEUE_INDEX 6
-
-#ifdef IS_DEBUG
-#define DEBUG_RENDER_NODE_INDEX 7
-#endif
-
-//Samplers
-#define HIZ_BINDING_INDEX 0
-
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal.comp
@@ -1,186 +0,0 @@
-#version 460 core
-
-//TODO: increase local size
-#define LOCAL_SIZE_BITS 5
-#define LOCAL_SIZE_MSK ((1<<LOCAL_SIZE_BITS)-1)
-#define LOCAL_SIZE (1<<LOCAL_SIZE_BITS)
-layout(local_size_x=LOCAL_SIZE) in;//, local_size_y=1
-
-#import <voxy:lod/hierarchical/binding_points.glsl>
-#line 7
-
-//The queue contains 3 atomics
-// end (the current processing pointer)
-// head (the current point that is ok to read from)
-// top (An atomic that is only used for writing to)
-//The way it works when enqueuing
-// top is incremented by x,
-//   write the data getting enqueued at the starting point specified by the `top` incrmenet
-// then increment head strictly _AFTER_ writing to the queue, this ensures that the data is always written and avaible in the queue
-
-layout(binding = SCENE_UNIFORM_INDEX, std140) uniform SceneUniform {
-    mat4 VP;
-    ivec3 camSecPos;
-    uint screenW;
-    vec3 camSubSecPos;
-    uint screenH;
-    uint requestQueueMaxSize;
-    uint renderQueueMaxSize;
-    float decendSSS;
-};
-
-layout(binding = REQUEST_QUEUE_INDEX, std430) restrict buffer RequestQueue {
-    uint requestQueueIndex;
-    uint[] requestQueue;
-};
-
-layout(binding = RENDER_QUEUE_INDEX, std430) restrict buffer RenderQueue {
-    uint renderQueueIndex;
-    uint[] renderQueue;
-};
-
-layout(binding = NODE_QUEUE_INDEX, std430) restrict buffer NodeQueue {
-    uint  nodeQueueSize;
-    uint[] nodeQueue;
-};
-
-layout(binding = NEXT_NODE_QUEUE_INDEX, std430) restrict buffer NextNodeQueue {
-    uint  nextNodeQueueIndex;
-    uint[] nextNodeQueue;
-};
-
-#ifdef IS_DEBUG
-layout(binding = DEBUG_RENDER_NODE_INDEX, std430) restrict buffer DebugRenderNodeQueue {
-    uint debugRenderNodeQueueIndex;
-    uint[] debugRenderNodeQueue;
-};
-#endif
-
-#import <voxy:lod/hierarchical/transform.glsl>
-
-#import <voxy:lod/hierarchical/node.glsl>
-
-//Contains all the screenspace computation
-#import <voxy:lod/hierarchical/screenspace.glsl>
-
-void addRequest(inout UnpackedNode node) {
-    if (!hasRequested(node)) {
-        //printf("Request %d %d %d %d", node.nodeId, node.flags, node.meshPtr, node.childPtr);
-        //TODO: maybe try using only 1 variable and it being <0 being bad
-        if (requestQueueIndex < requestQueueMaxSize) {
-            //Mark node as having a request submitted to prevent duplicate submissions
-            requestQueue[atomicAdd(requestQueueIndex, 1)] = getId(node);
-            markRequested(node);
-        }
-    }
-}
-
-void enqueueChildren(in UnpackedNode node) {
-    //printf("children");
-    uint children = getChildCount(node);
-    uint ptr = getChildPtr(node);
-    uint widx = atomicAdd(nextNodeQueueIndex, children);
-
-    for (int i = 0; i < children; i++) {
-        nextNodeQueue[widx+i] = ptr+i;
-    }
-}
-
-void enqueueSelfForRender(in UnpackedNode node) {
-    //printf("render %d@[%d,%d,%d]", node.lodLevel, node.pos.x, node.pos.y, node.pos.z);
-    if ((!isEmptyMesh(node)) && renderQueueIndex < renderQueueMaxSize) {
-        renderQueue[atomicAdd(renderQueueIndex, 1)] = getMesh(node);
-        #ifdef IS_DEBUG
-        debugRenderNodeQueue[atomicAdd(debugRenderNodeQueueIndex, 1)] = node.nodeId;
-        #endif
-    }
-}
-
-//TODO: need to add an empty mesh, as a parent node might not have anything to render but the children do??
-void main() {
-    if (gl_GlobalInvocationID.x>=nodeQueueSize) {
-        return;
-    }
-
-    UnpackedNode node;
-    //Setup/unpack the node
-    unpackNode(node, nodeQueue[gl_GlobalInvocationID.x]);
-    //TODO: check the node is OK first??? maybe?
-
-    //Compute screenspace
-    setupScreenspace(node);
-    //printf("Node %d@[%d,%d,%d] - %d - %f", node.lodLevel, node.pos.x, node.pos.y, node.pos.z, node.flags, (size.x*size.y*screenW*screenH));
-
-    //debugDumpNode(node);
-
-    if (outsideFrustum() || isCulledByHiz()) {
-        //printf("HizCulled");
-        //We are done here, dont do any more, the issue is the shader barriers maybe
-        // its culled, maybe just mark it as culled?
-
-
-        //printf("Cull");
-    } else {
-        //It is visible, TODO: maybe do a more detailed hiz test? (or make it so that )
-
-        //Only decend if not a root node
-        if (node.lodLevel!=0 && shouldDecend()) {
-            if (hasChildren(node)) {
-                //printf("A");
-                enqueueChildren(node);
-            } else {
-                //printf("B");
-                addRequest(node);
-                //TODO: use self mesh (is error state if it doesnt have one since all leaf nodes should have a mesh)
-                // Basicly guarenteed to have a mesh, if it doesnt it is very very bad and incorect since its a violation of the graph properties
-                // that all leaf nodes must contain a mesh
-                enqueueSelfForRender(node);
-            }
-        } else {
-            if (hasMesh(node)) {
-                //printf("C");
-                enqueueSelfForRender(node);
-            } else {
-                //printf("D");
-                //!! not ideal, we want to render this mesh but dont have it. If we havent sent a request
-                // then send a request for a mesh for this node.
-                addRequest(node);
-
-                //TODO: Decend into children? maybe add a bitflag saying is bad if the immediate children dont have meshes
-                enqueueChildren(node);
-            }
-        }
-    }
-}
-
-
-/*
-Persistent threading
-
-//Thread 0 grabs a batch when empty
-void main() {
-    while (true) {
-        //Each thread processes an entry on the queue and pushes all children to the queue if it is determined the children need to be added
-    }
-}
-*/
-
-
-
-
-//If a request is successfully added to the RequestQueue, must update NodeData to mark that the node has been put into the request queue
-// to prevent it from being requested every frame and blocking the queue
-
-
-//Once a suitable render section is found, it is put into the RenderQueue, or if its not availbe its put into the RequestQueue
-// and its children are rendered instead if it has them avalible
-
-//NOTE: EXPERIMENT: INSTEAD OF PERSISTENT THREADS
-//TODO: since we know the tree depth is worst case 5, we can just do an indirect dispatch 5 times one for each layer
-// issues with this approach, barriers and waiting for one to finish before the otehr can be executed
-// advantages, MUCH SIMPLER, no shader barriers needed really , issue is need a flipflip queue but thats ok,
-// also ensures the gpu is full of work capacity
-// this might be what i do to start with since its much easier to do
-// not sure
-
-
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/traversal_dev.comp
@@ -36,6 +36,13 @@ layout(binding = RENDER_TRACKER_BINDING, std430) restrict writeonly buffer rende
    uint[] lastRenderFrame;
 };

+#ifdef HAS_STATISTICS
+layout(binding = STATISTICS_BUFFER_BINDING, std430) restrict buffer statisticsBuffer {
+    uint traversalCounts[5];
+    uint renderCounts[5];
+};
+#endif
+
 void addRequest(inout UnpackedNode node) {
    //printf("Put node decend request");
    if (!hasRequested(node)) {
@@ -72,12 +79,20 @@ void enqueueSelfForRender(in UnpackedNode node) {
            #ifdef IS_DEBUG
            debugRenderNodeQueue[atomicAdd(debugRenderNodeQueueIndex, 1)] = node.nodeId;
            #endif
+
+            #ifdef HAS_STATISTICS
+            atomicAdd(renderCounts[node.lodLevel], 1);
+            #endif
        }
    }
 }


 void traverse(in UnpackedNode node) {
+    #ifdef HAS_STATISTICS
+    atomicAdd(traversalCounts[node.lodLevel], 1);
+    #endif
+
    //Compute screenspace
    setupScreenspace(node);
    //debugDumpNode(node);