diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
index 1a88bf09..8303e56b 100644
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierachical/NodeCleaner.java
@@ -35,6 +35,7 @@ public class NodeCleaner {
 
 
     private static final int SORTING_WORKER_SIZE = 64;
+    private static final int WORK_PER_THREAD = 8;
     private static final int OUTPUT_COUNT = 256;
 
 
@@ -43,6 +44,7 @@ public class NodeCleaner {
 
     private final AutoBindingShader sorter = Shader.makeAuto(PrintfDebugUtil.PRINTF_processor)
             .define("WORK_SIZE", SORTING_WORKER_SIZE)
+            .define("ELEMS_PER_THREAD", WORK_PER_THREAD)
             .define("OUTPUT_SIZE", OUTPUT_COUNT)
             .define("VISIBILITY_BUFFER_BINDING", 1)
             .define("OUTPUT_BUFFER_BINDING", 2)
@@ -134,7 +136,7 @@ public class NodeCleaner {
 
                 //TODO: choose whether this is in nodeSpace or section/geometryId space
                 //
-                glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + SORTING_WORKER_SIZE - 1) / SORTING_WORKER_SIZE, 1, 1);
+                glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + (SORTING_WORKER_SIZE*WORK_PER_THREAD) - 1) / (SORTING_WORKER_SIZE*WORK_PER_THREAD), 1, 1);//Each workgroup covers WORK_SIZE*ELEMS_PER_THREAD node ids, so round up by that product
                 glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
                 this.resultTransformer.bind();
diff --git a/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp b/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
index 932f3acf..a892d7bd 100644
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/cleaner/sort_visibility.comp
@@ -5,12 +5,12 @@
 //#define OUTPUT_SIZE 128
 
 layout(local_size_x=WORK_SIZE, local_size_y=1) in;
-//256 workgroup
+#define OPS_PER_THREAD (OUTPUT_SIZE/WORK_SIZE)
 
 #import <voxy:lod/hierarchical/node.glsl>
 
 layout(binding = VISIBILITY_BUFFER_BINDING, std430) restrict readonly buffer VisibilityDataBuffer {
-    uint[] visiblity;
+    uint[] visibility;
 };
 
 layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer MinimumVisibilityBuffer {//TODO: might need to be volatile
@@ -18,12 +18,15 @@ layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer Minimum
 };
 
 //Returns the id of the max value
-uint atomicDerefMaxExchange(uint atId, uint id) {
-    const uint value = visiblity[id];
+uint atomicDerefMaxExchangeGlobal(uint atId, uint id) {
+    const uint value = visibility[id];
     while (true) {
-        const uint existingId = minVisIds[atId];
+        const uint existingId = minVisIds[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
         //Check if the value is less than the dereferenced value, if its not, return our own id
-        if (visiblity[existingId] <= value) {
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
             return id;
         }
         //Attempt to swap, since we know we are less than the existingId
@@ -39,37 +42,70 @@ uint atomicDerefMaxExchange(uint atId, uint id) {
 }
 
 //TODO: optimize
-void bubbleSort(uint start, uint id) {
+void bubbleSortGlobal(uint start, uint id) {//Walks slots start..OUTPUT_SIZE-1 of the global output, handing each slot the id returned by the previous exchange
     for (uint i = start; i < OUTPUT_SIZE; i++) {
-        id = atomicDerefMaxExchange(i, id);
+        id = atomicDerefMaxExchangeGlobal(i, id);
+        if (id == uint(-1)) {//The exchange found id already stored in this slot, nothing is left to carry forward
+            break;
+        }
     }
 }
 
-void main() {
-    //if (gl_GlobalInvocationID.x <64) {
-    //    minVisIds[gl_GlobalInvocationID.x] = visiblity[gl_GlobalInvocationID.x];
-    //}
-    //First do a min sort/set of min OUTPUT_SIZE values of the set
-    uint vis = visiblity[gl_GlobalInvocationID.x];
-    if (vis == uint(-1)) {
-        return;
+//TODO: maybe also have a shared "cache" of the visibility data,
+// so that the shader doesn't need to access global memory as much
+shared uint initalSort[OUTPUT_SIZE];
+
+
+//Tries to place id into initalSort[atId]; returns the id to carry to the next slot (the displaced entry, or id itself if it does not belong here), or uint(-1) if id is already stored
+uint atomicDerefMaxExchangeLocal(uint atId, uint id) {
+    const uint value = visibility[id&((1u<<31)-1)];//id can be a displaced entry that still carries the external flag bit, strip it before indexing
+    while (true) {
+        const uint existingId = initalSort[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
+        //Check if the value is less than the dereferenced value, if its not, return our own id
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
+            return id;
+        }
+        //Attempt to swap, since we know we are less than the existingId
+        const uint c = atomicCompSwap(initalSort[atId], existingId, id);
+        //Check if we did swap, else if we failed (or got reswapped else where) recheck
+
+        //We did swap, (since the original mem contents was the existing id)
+        // which means existingId is now the max of the ptr
+        if (c == existingId) {
+            return existingId;
+        }
     }
-    if (visiblity[minVisIds[OUTPUT_SIZE-1]] <= vis) {
-        return;
+}
+void bubbleSortInital(uint vis, uint id) {//Inserts id (whose visibility value is vis) into the shared initalSort buffer
+    uint start = 0;
+    //Fast path cut out half the ops
+    if (visibility[initalSort[(OUTPUT_SIZE-1)>>1]&((1u<<31)-1)] <= vis) {//Check if we are more than half way, entries may carry the external flag bit so remove it before indexing
+        start = (OUTPUT_SIZE-1)>>1;
     }
+
+    for (uint i = start; i < OUTPUT_SIZE && id != uint(-1); i++) {//Stop once the exchange reports the id is already stored (uint(-1)), same as bubbleSortGlobal
+        id = atomicDerefMaxExchangeLocal(i, id);
+    }
+}
+
+bool shouldSortId(uint id) {//Returns true when node id passes every filter below and may be entered into the sort
     UnpackedNode node;
-    if (unpackNode(node, gl_GlobalInvocationID.x)==uvec4(-1)) {
+    if (unpackNode(node, id)==uvec4(-1)) {//Unpack the node under test, not the invocation index (each invocation handles ELEMS_PER_THREAD ids)
-        return;//Unallocated node
+        return false;//Unallocated node
     }
 
     if (isEmptyMesh(node) || (!hasMesh(node))) {//|| (!hasChildren(node))
-        return;
+        return false;
     }
     //TODO: FIXME: DONT HARDCODE TOP LEVEL LOD LEVEL
     if (node.lodLevel == 4) {// (!hasChildren(node)) -> Assume leaf node
-        return;//Cannot remove geometry from top level node
+        return false;//Cannot remove geometry from top level node
     }
 
+
     /*THIS IS COMPLETLY WRONG, we need to check if all the children of the parent of the child are leaf nodes
     // not this node
 
@@ -87,7 +123,56 @@ void main() {
     }
     */
 
+    return true;
+}
+void main() {
+    //Cheeky trick, copy the _global buffer_ into the local buffer
+    // this means that insertion into the local buffer can be accelerated W.R.T global
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        //Copy in strided by WORK_SIZE, so neighbouring invocations fetch neighbouring slots
+        uint id = gl_LocalInvocationID.x + (i*WORK_SIZE);
+        initalSort[id] = minVisIds[id]|(1u<<31);//Flag the id as being external
+    }
+    barrier();//Every slot of initalSort must be filled before any invocation starts inserting
+    //Do insertion and sort into local shared buffer
+    for (uint i = 0; i < ELEMS_PER_THREAD; i++) {
+        uint id = gl_GlobalInvocationID.x*ELEMS_PER_THREAD+i;//Each invocation handles ELEMS_PER_THREAD consecutive ids
+        uint vis = visibility[id];
+        if (vis == uint(-1)) {//NOTE(review): presumably uint(-1) marks an id without visibility data, confirm against the writer of this buffer
+            continue;
+        }
+        //Quick exit if this element is already bigger than global output
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        if (!shouldSortId(id)) {
+            continue;
+        }
 
+        bubbleSortInital(vis, id);
+    }
+    barrier();//All local insertions must be finished before the shared buffer is read back
+    //Do insertion into the global minVisIds array
+    // this is done front to back
 
-    bubbleSort(0, gl_GlobalInvocationID.x);
+    //Work size batching, OPS_PER_THREAD slots of the shared buffer per invocation
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        barrier();
+        uint id = gl_LocalInvocationID.x+(i*WORK_SIZE);//Slot index in initalSort, not a node id
+        uint sid = initalSort[id];//The id that ended up in this slot after the local sort
+        if ((sid&(1u<<31)) != 0) {
+            //The flag being external was set, meaning we should NOT insert this element
+            continue;
+        }
+        uint vis = visibility[sid];
+        //If the last global output entry is already smaller than (or equal to) self, skip
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        uint start = id;
+        if (visibility[minVisIds[(id+OUTPUT_SIZE)>>1]] <= vis) {//Try to skip to the midpoint between our slot and the end
+            start = (id+OUTPUT_SIZE)>>1;
+        }
+        bubbleSortGlobal(start, sid);//Insert into global
+    }
 }
\ No newline at end of file