diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java
index c5002ff4..ab0c9dde 100644
--- a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java
@@ -43,6 +43,10 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
 
     private static long getGeometryBufferSize() {
         long geometryCapacity = Math.min((1L<<(64-Long.numberOfLeadingZeros(Capabilities.INSTANCE.ssboMaxSize-1)))<<1, 1L<<32)-1024/*(1L<<32)-1024*/;
+        if (Capabilities.INSTANCE.isIntel) {
+            geometryCapacity = Math.max(geometryCapacity, 1L<<30);//Intel workaround: raise the computed capacity to at least 1 GiB (1L<<30)
+        }
+
         //Limit to available dedicated memory if possible
         if (Capabilities.INSTANCE.canQueryGpuMemory) {
             //512mb less than avalible,
diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java b/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java
index 4665f4fe..d3bcda89 100644
--- a/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java
@@ -30,6 +30,7 @@ import static org.lwjgl.opengl.GL30.glBindVertexArray;
 import static org.lwjgl.opengl.GL31.GL_UNIFORM_BUFFER;
 import static org.lwjgl.opengl.GL33.glBindSampler;
 import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER;
+import static org.lwjgl.opengl.GL42.glMemoryBarrier;
 import static org.lwjgl.opengl.GL43.*;
 import static org.lwjgl.opengl.GL45.glBindTextureUnit;
 import static org.lwjgl.opengl.GL45.glClearNamedBufferData;
@@ -263,20 +264,6 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
             glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);//Am unsure if is needed
             glDispatchCompute(1,1,1);
             glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
-            //glFinish();
-            /*
-            DownloadStream.INSTANCE.download(this.distanceCountBuffer, 0, 1024*4, (ptr,size)->{
-                int[] a = new int[1024];
-                for (int i = 0; i < 1024; i++) {
-                    a[i] = MemoryUtil.memGetInt(ptr+4*i);
-                }
-                for (int i = 0; i < 1023; i++){
-                    if (a[i+1]<a[i]) {
-                        System.out.println(a[i]+","+a[i+1]);
-                    }
-                }
-            });
-            */
 
             this.translucentGenShader.bind();
             glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id);
@@ -287,6 +274,7 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
             glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, this.distanceCountBuffer.id);
 
             glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, this.drawCountCallBuffer.id);//This isnt great but its a nice trick to bound it, even if its inefficent ;-;
+            glMemoryBarrier(GL_ALL_BARRIER_BITS);//Barrier on every bit (0xFFFFFFFF) before the indirect dispatch consumes drawCountCallBuffer
             glDispatchComputeIndirect(0);
             glMemoryBarrier(GL_COMMAND_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
         }
diff --git a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
index 7d12751b..eedd234b 100644
--- a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
+++ b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
@@ -8,15 +8,17 @@
 //Does inital parralel prefix sum on batches of WORK_SIZE
 layout(local_size_x=WORK_SIZE) in;
 
-layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer {
+layout(binding = IO_BUFFER, std430) buffer InputBuffer {
     uvec4[] ioCount;
 };
 
 shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
 
 void main() {
+    /*
+    uint subgroupId = gl_LocalInvocationID.x>>5;
     warpPrefixSum[gl_SubgroupInvocationID] = 0;
-    barrier();
+    memoryBarrierShared();
 
     //todo
     //assert(gl_SubgroupSize == 32);
@@ -33,33 +35,69 @@ void main() {
         sum = count.w + dat.w;
     }
 
+    barrier();
+    count += subgroupExclusiveAdd(sum);
+
+    if (gl_SubgroupInvocationID==31) {
+        warpPrefixSum[subgroupId] = count.x+sum;
+    }
+    memoryBarrierShared();
+    barrier();
+    uint val = warpPrefixSum[gl_SubgroupInvocationID];
+    barrier();
+    if (subgroupId == 0) {
+        //Use warp to do entire add in 1 reduction
+        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
+    }
+    memoryBarrierShared();
+    barrier();
+    count += warpPrefixSum[subgroupId];
+    ioCount[gid] = count;
+    */
+
+
+    #ifdef IS_INTEL
+    uint subgroupId = gl_LocalInvocationID.x>>5;
+    #else
+    uint subgroupId = gl_SubgroupID;
+    #endif
+
+    //todo
+    //assert(gl_SubgroupSize == 32);
+    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
+
+    uint gid = gl_GlobalInvocationID.x;
+    uvec4 count = uvec4(0);
+    uint sum = 0;
+    {
+        uvec4 dat = ioCount[gid];
+        count.yzw = dat.xyz;
+        count.z += count.y;
+        count.w += count.z;
+        sum = count.w + dat.w;
+    }
     subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
 
     count += subgroupExclusiveAdd(sum);
 
-    if ((gl_LocalInvocationID.x&31u)==31) {
-        warpPrefixSum[gl_SubgroupID] = count.x+sum;
+    if (gl_SubgroupInvocationID==31) {
+        warpPrefixSum[subgroupId] = count.x+sum;
     }
 
+    memoryBarrierShared();
     barrier();
 
-    #ifdef IS_INTEL
-    uint val = subgroupExclusiveAdd(warpPrefixSum[gl_SubgroupInvocationID]);
-    barrier();
-    if (gl_SubgroupID == 0) {
-        warpPrefixSum[gl_SubgroupInvocationID] = val;
-    }
-    #else
     if (gl_SubgroupID == 0) {
         uint val = warpPrefixSum[gl_SubgroupInvocationID];
         subgroupBarrier();
         //Use warp to do entire add in 1 reduction
         warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
     }
-    #endif
 
+    memoryBarrierShared();
     barrier();
+
     //Add the computed sum across all threads and warps
-    count += warpPrefixSum[gl_SubgroupID];
+    count += warpPrefixSum[subgroupId];
     ioCount[gid] = count;
 }
\ No newline at end of file