Incremental traversal system works

This commit is contained in:
mcrcortex
2024-09-15 12:10:32 +10:00
parent 76aaf3824d
commit b9a3d18b56
10 changed files with 207 additions and 35 deletions

View File

@@ -0,0 +1,50 @@
//Value returned by getCurrentNode() when this invocation's index is past the end of the current queue
#define SENTINAL_OUT_OF_BOUNDS uint(-1)
//Index into nodeQueueMetadata of the queue being consumed this pass; pushes go to entry queueIdx+1
layout(location = NODE_QUEUE_INDEX_BINDING) uniform uint queueIdx;
//One uvec4 per iteration. Within this file, .w is used as the number of nodes queued for that iteration
// and .x is incremented next to it by pushNodesInit; .y and .z are not written here
// NOTE(review): .xyz look like indirect dispatch group counts - confirm against the host side
layout(binding = NODE_QUEUE_META_BINDING, std430) restrict buffer NodeQueueMeta {
uvec4 nodeQueueMetadata[MAX_ITERATIONS];
};
//Node ids consumed by this pass (read only), indexed by gl_GlobalInvocationID.x in getCurrentNode
layout(binding = NODE_QUEUE_SOURCE_BINDING, std430) restrict readonly buffer NodeQueueSource {
uint[] nodeQueueSource;
};
//Node ids produced by this pass (write only), filled through pushNodesInit/pushNode
layout(binding = NODE_QUEUE_SINK_BINDING, std430) restrict writeonly buffer NodeQueueSink {
uint[] nodeQueueSink;
};
//Fetches the node id assigned to this invocation from the source queue.
// Returns SENTINAL_OUT_OF_BOUNDS when the invocation index is at or beyond the
// node count stored in nodeQueueMetadata[queueIdx].w
uint getCurrentNode() {
    uint slot = gl_GlobalInvocationID.x;
    uint queuedCount = nodeQueueMetadata[queueIdx].w;
    if (slot < queuedCount) {
        return nodeQueueSource[slot];
    }
    //Surplus invocation, there is no node for it to process
    return SENTINAL_OUT_OF_BOUNDS;
}
//Next slot in nodeQueueSink this invocation will write to; assigned by pushNodesInit and advanced by pushNode.
// -1 (compared against as-is in pushNode's DEBUG check) marks that no slots are currently reserved
uint nodePushIndex = -1;
//Reserves nodeCount consecutive slots in the next iteration's queue for this invocation.
// nodeCount: number of pushNode() calls that will follow.
// On success nodePushIndex is set to the first reserved slot. In DEBUG builds, if there is
// no next iteration (queueIdx is the last metadata entry), nothing is reserved and
// nodePushIndex is set to -1.
void pushNodesInit(uint nodeCount) {
    //Debug
#ifdef DEBUG
    if (queueIdx >= (MAX_ITERATIONS-1)) {
        printf("LOG: Traversal tried inserting a node into next iteration, which is outside max iteration bounds. GID: %d, count: %d", gl_GlobalInvocationID.x, nodeCount);
        nodePushIndex = -1;
        return;
    }
#endif
    //atomicAdd returns the previous count, i.e. the first slot of the range [index, index+nodeCount)
    uint index = atomicAdd(nodeQueueMetadata[queueIdx+1].w, nodeCount);
    //Increment first metadata value only by the number of LOCAL_SIZE thresholds the reserved range crosses.
    // The previous expression used index+LOCAL_SIZE, which always evaluates to exactly 1 and ignored nodeCount.
    // Rounding up on both ends makes the increments over all reservations sum to ceil(total / LOCAL_SIZE).
    // NOTE(review): assumes .x is meant to track the group count needed to cover .w nodes - confirm with the host side reset value
    uint roundUp = uint(LOCAL_SIZE - 1);
    uint groupsAfter = (index + nodeCount + roundUp) >> LOCAL_SIZE_BITS;
    uint groupsBefore = (index + roundUp) >> LOCAL_SIZE_BITS;
    uint inc = groupsAfter - groupsBefore;
    atomicAdd(nodeQueueMetadata[queueIdx+1].x, inc);//TODO: see if making this conditional on inc != 0 is faster
    nodePushIndex = index;
}
//Writes nodeId into the next slot reserved by pushNodesInit and advances nodePushIndex by one.
// In DEBUG builds, a push while nodePushIndex is -1 is logged and dropped.
void pushNode(uint nodeId) {
#ifdef DEBUG
    if (nodePushIndex == -1) {
        printf("LOG: Tried pushing node when push node wasnt successful. GID: %d, pushing: %d", gl_GlobalInvocationID.x, nodeId);
        return;
    }
#endif
    uint slot = nodePushIndex;
    nodePushIndex = slot + 1u;
    nodeQueueSink[slot] = nodeId;
}

View File

@@ -63,23 +63,6 @@ layout(binding = DEBUG_RENDER_NODE_INDEX, std430) restrict buffer DebugRenderNod
//Contains all the screenspace computation
#import <voxy:lod/hierarchical/screenspace.glsl>
//If a request is successfully added to the RequestQueue, must update NodeData to mark that the node has been put into the request queue
// to prevent it from being requested every frame and blocking the queue
//Once a suitable render section is found, it is put into the RenderQueue, or if it's not available it's put into the RequestQueue
// and its children are rendered instead if it has them available
//NOTE: EXPERIMENT: INSTEAD OF PERSISTENT THREADS
//TODO: since we know the tree depth is worst case 5, we can just do an indirect dispatch 5 times, one for each layer
// issues with this approach: barriers, and waiting for one to finish before the other can be executed
// advantages: MUCH SIMPLER, no shader barriers needed really; the issue is that it needs a flip-flop queue but that's ok,
// also ensures the gpu is full of work capacity
// this might be what i do to start with since its much easier to do
// not sure
void addRequest(inout UnpackedNode node) {
if (!hasRequested(node)) {
//printf("Request %d %d %d %d", node.nodeId, node.flags, node.meshPtr, node.childPtr);
@@ -172,10 +155,32 @@ void main() {
/*
Persistent threading
//Thread 0 grabs a batch when empty
void main() {
while (true) {
//Each thread processes an entry on the queue and pushes all children to the queue if it is determined the children need to be added
}
}
*/
*/
//If a request is successfully added to the RequestQueue, must update NodeData to mark that the node has been put into the request queue
// to prevent it from being requested every frame and blocking the queue
//Once a suitable render section is found, it is put into the RenderQueue, or if it's not available it's put into the RequestQueue
// and its children are rendered instead if it has them available
//NOTE: EXPERIMENT: INSTEAD OF PERSISTENT THREADS
//TODO: since we know the tree depth is worst case 5, we can just do an indirect dispatch 5 times, one for each layer
// issues with this approach: barriers, and waiting for one to finish before the other can be executed
// advantages: MUCH SIMPLER, no shader barriers needed really; the issue is that it needs a flip-flop queue but that's ok,
// also ensures the gpu is full of work capacity
// this might be what i do to start with since its much easier to do
// not sure

View File

@@ -0,0 +1,16 @@
#version 460 core
//TODO: increase local size
//Mask selecting the low LOCAL_SIZE_BITS bits (equals LOCAL_SIZE-1); LOCAL_SIZE_BITS is not defined in this file
#define LOCAL_SIZE_MSK ((1<<LOCAL_SIZE_BITS)-1)
//Number of invocations per workgroup along x
#define LOCAL_SIZE (1<<LOCAL_SIZE_BITS)
layout(local_size_x=LOCAL_SIZE) in;//, local_size_y=1
#import <voxy:lod/hierarchical/queue.glsl>
//Entry point: takes this invocation's node from the current queue, logs it together with the
// current queue metadata, and forwards the same node unchanged into the next iteration's queue.
void main() {
    uint node = getCurrentNode();
    if (node == SENTINAL_OUT_OF_BOUNDS) {
        //Invocation is beyond the end of the queue, nothing to do
        return;
    }
    printf("GID:%d, NODE %d, %d, AA, %d, %d, %d, %d", gl_GlobalInvocationID.x, node, queueIdx, nodeQueueMetadata[queueIdx].x, nodeQueueMetadata[queueIdx].y, nodeQueueMetadata[queueIdx].z, nodeQueueMetadata[queueIdx].w);
    //Reserve exactly one slot, then fill it
    pushNodesInit(1);
    pushNode(node);
}