diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
index 1a88bf09..8303e56b 100644
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
@@ -35,6 +35,7 @@ public class NodeCleaner {
 
 
     private static final int SORTING_WORKER_SIZE = 64;
+    private static final int WORK_PER_THREAD = 8;//Node ids handled per shader invocation (passed to the sorter as ELEMS_PER_THREAD)
     private static final int OUTPUT_COUNT = 256;
 
 
@@ -43,6 +44,7 @@ public class NodeCleaner {
 
     private final AutoBindingShader sorter = Shader.makeAuto(PrintfDebugUtil.PRINTF_processor)
             .define("WORK_SIZE", SORTING_WORKER_SIZE)
+            .define("ELEMS_PER_THREAD", WORK_PER_THREAD)
             .define("OUTPUT_SIZE", OUTPUT_COUNT)
             .define("VISIBILITY_BUFFER_BINDING", 1)
             .define("OUTPUT_BUFFER_BINDING", 2)
@@ -134,7 +136,8 @@ public class NodeCleaner {
 
             //TODO: choose whether this is in nodeSpace or section/geometryId space
             //
-            glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + SORTING_WORKER_SIZE - 1) / SORTING_WORKER_SIZE, 1, 1);
+            //Each workgroup has SORTING_WORKER_SIZE invocations that each process WORK_PER_THREAD node ids, so one group covers their product (not their sum)
+            glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + (SORTING_WORKER_SIZE*WORK_PER_THREAD) - 1) / (SORTING_WORKER_SIZE*WORK_PER_THREAD), 1, 1);
             glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
             this.resultTransformer.bind();
diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp b/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
index 932f3acf..a892d7bd 100644
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
@@ -5,12 +5,12 @@
 //#define OUTPUT_SIZE 128
 layout(local_size_x=WORK_SIZE, local_size_y=1) in;
 
-//256 workgroup
+#define OPS_PER_THREAD (OUTPUT_SIZE/WORK_SIZE)
 
 #import
 
 layout(binding = VISIBILITY_BUFFER_BINDING, std430) restrict readonly buffer VisibilityDataBuffer {
-    uint[] visiblity;
+    uint[] visibility;
 };
 
 layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer MinimumVisibilityBuffer {//TODO: might need to be volatile
@@ -18,12 +18,15 @@ layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer Minimum
 };
 
 //Returns the id of the max value
-uint atomicDerefMaxExchange(uint atId, uint id) {
-    const uint value = visiblity[id];
+uint atomicDerefMaxExchangeGlobal(uint atId, uint id) {
+    const uint value = visibility[id];
     while (true) {
-        const uint existingId = minVisIds[atId];
+        const uint existingId = minVisIds[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
         //Check if the value is less than the dereferenced value, if its not, return our own id
-        if (visiblity[existingId] <= value) {
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
             return id;
         }
         //Attempt to swap, since we know we are less than the existingId
@@ -39,37 +42,76 @@ uint atomicDerefMaxExchange(uint atId, uint id) {
 }
 
 //TODO: optimize
-void bubbleSort(uint start, uint id) {
+void bubbleSortGlobal(uint start, uint id) {
     for (uint i = start; i < OUTPUT_SIZE; i++) {
-        id = atomicDerefMaxExchange(i, id);
+        id = atomicDerefMaxExchangeGlobal(i, id);
+        if (id == uint(-1)) {
+            break;
+        }
     }
 }
 
 
-void main() {
-    //if (gl_GlobalInvocationID.x <64) {
-    //    minVisIds[gl_GlobalInvocationID.x] = visiblity[gl_GlobalInvocationID.x];
-    //}
-    //First do a min sort/set of min OUTPUT_SIZE values of the set
-    uint vis = visiblity[gl_GlobalInvocationID.x];
-    if (vis == uint(-1)) {
-        return;
+//TODO: maybe also have a shared "cache" of the visibility data
+// meaning that the shader doesn't need to access global memory as much
+//Entries with bit 31 set were copied in from minVisIds (external) and are skipped when merging back into minVisIds
+shared uint initalSort[OUTPUT_SIZE];
+
+
+//Returns the id of the max value
+uint atomicDerefMaxExchangeLocal(uint atId, uint id) {
+    const uint value = visibility[id&((1u<<31)-1)];//id may be a displaced entry that still carries the external flag bit, strip it before indexing
+    while (true) {
+        const uint existingId = initalSort[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
+        //Check if the value is less than the dereferenced value, if it's not, return our own id
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
+            return id;
+        }
+        //Attempt to swap, since we know we are less than the existingId
+        const uint c = atomicCompSwap(initalSort[atId], existingId, id);
+        //Check if we did swap, else if we failed (or got reswapped else where) recheck
+
+        //We did swap, (since the original mem contents was the existing id)
+        // which means existingId is now the max of the ptr
+        if (c == existingId) {
+            return existingId;
+        }
     }
-    if (visiblity[minVisIds[OUTPUT_SIZE-1]] <= vis) {
-        return;
+}
+//Inserts id (whose visibility value is vis) into the workgroup shared list, pushing displaced entries towards the end
+void bubbleSortInital(uint vis, uint id) {
+    uint start = 0;
+    //Fast path cut out half the ops
+    if (visibility[initalSort[(OUTPUT_SIZE-1)>>1]&((1u<<31)-1)] <= vis) {//Check if we are more than half way (flag bit removed before indexing)
+        start = (OUTPUT_SIZE-1)>>1;
     }
+
+    for (uint i = start; i < OUTPUT_SIZE; i++) {
+        id = atomicDerefMaxExchangeLocal(i, id);
+        if (id == uint(-1)) {//Already present in the list, nothing left to push down
+            break;
+        }
+    }
+}
+
+//Returns true if node id may be sorted: it must be allocated, have a non empty mesh and not be a top level (lodLevel 4) node
+bool shouldSortId(uint id) {
     UnpackedNode node;
-    if (unpackNode(node, gl_GlobalInvocationID.x)==uvec4(-1)) {
-        return;//Unallocated node
+    if (unpackNode(node, id)==uvec4(-1)) {
+        return false;//Unallocated node
     }
 
     if (isEmptyMesh(node) || (!hasMesh(node))) {//|| (!hasChildren(node))
-        return;
+        return false;
     }
     //TODO: FIXME: DONT HARDCODE TOP LEVEL LOD LEVEL
     if (node.lodLevel == 4) {// (!hasChildren(node)) -> Assume leaf node
-        return;//Cannot remove geometry from top level node
+        return false;//Cannot remove geometry from top level node
     }
+
     /*THIS IS COMPLETLY WRONG, we need to check if all the children of the parent of the child are leaf nodes
     // not this node
 
@@ -87,7 +129,57 @@ void main() {
     }
     */
 
+    return true;
+}
 
+//Per workgroup: seed the shared list from minVisIds, insert this group's candidate ids into it, then merge the newly inserted ids back into minVisIds
+void main() {
+    //Cheeky trick, copy the _global buffer_ into the local buffer
+    // this means that insertion into the local buffer can be accelerated W.R.T global
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        //Copy in with warp size batch fetch
+        uint id = gl_LocalInvocationID.x + (i*WORK_SIZE);
+        initalSort[id] = minVisIds[id]|(1u<<31);//Flag the id as being external
+    }
+    barrier();
+    //Do insertion and sort into local shared buffer
+    for (uint i = 0; i < ELEMS_PER_THREAD; i++) {
+        uint id = gl_GlobalInvocationID.x*ELEMS_PER_THREAD+i;
+        uint vis = visibility[id];
+        if (vis == uint(-1)) {
+            continue;
+        }
+        //Quick exit if this element is already bigger than global output
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        if (!shouldSortId(id)) {
+            continue;
+        }
+        bubbleSortInital(vis, id);
+    }
+    barrier();
+    //Do insertion into global visibility array
+    // this is done front to back
 
-    bubbleSort(0, gl_GlobalInvocationID.x);
+    //Work size batching
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        barrier();
+        uint id = gl_LocalInvocationID.x+(i*WORK_SIZE);
+        uint sid = initalSort[id];
+        if ((sid&(1u<<31)) != 0) {
+            //The flag being external was set, meaning we should NOT insert this element
+            continue;
+        }
+        uint vis = visibility[sid];
+        //If output is already smaller than self
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        uint start = id;
+        if (visibility[minVisIds[(id+OUTPUT_SIZE)>>1]] <= vis) {//Try to skip
+            start = (id+OUTPUT_SIZE)>>1;
+        }
+        bubbleSortGlobal(start, sid);//Insert into global
+    }
 }
\ No newline at end of file