diff --git a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
index 6d0a8fa1..a815f7e9 100644
--- a/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
+++ b/src/main/resources/assets/voxy/shaders/util/prefixsum/inital3.comp
@@ -12,92 +12,98 @@ layout(binding = IO_BUFFER, std430) buffer InputBuffer {
     uvec4[] ioCount;
 };
 
-shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
+shared uint warpPrefixSum[8];//One total per subgroup, indexed by subgroupId; 8 slots are scanned in the 32-wide path, 4 in the other path
 
 void main() {
-    /*
-    uint subgroupId = gl_LocalInvocationID.x>>5;
-    warpPrefixSum[gl_SubgroupInvocationID] = 0;
-    memoryBarrierShared();
+    if (gl_SubgroupSize == 32) {//32-wide path: lane 31 is the last lane of each subgroup
+        #ifdef IS_INTEL
+        uint subgroupId = gl_LocalInvocationID.x>>5;
+        #else
+        uint subgroupId = gl_SubgroupID;
+        #endif
 
-    //todo
-    //assert(gl_SubgroupSize == 32);
-    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
+        //todo: enforce the assumptions this path relies on
+        //assert(gl_SubgroupSize == 32);
+        //assert(gl_NumSubgroups == (WORK_SIZE>>5));
 
-    uint gid = gl_GlobalInvocationID.x;
-    uvec4 count = uvec4(0);
-    uint sum = 0;
-    {
-        uvec4 dat = ioCount[gid];
-        count.yzw = dat.xyz;
-        count.z += count.y;
-        count.w += count.z;
-        sum = count.w + dat.w;
+        uint gid = gl_GlobalInvocationID.x;
+        uvec4 count = uvec4(0);
+        uint sum = 0;
+        {
+            uvec4 dat = ioCount[gid];
+            count.yzw = dat.xyz;//Build the exclusive prefix inside the vector: (0, x, x+y, x+y+z)
+            count.z += count.y;
+            count.w += count.z;
+            sum = count.w + dat.w;//Total of all four components of this invocation's element
+        }
+        subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
+
+        count += subgroupExclusiveAdd(sum);//Offset every component by the totals of the lower lanes in this subgroup
+
+        if (gl_SubgroupInvocationID==31) {//Last lane: exclusive offset + own sum is the whole subgroup's total
+            warpPrefixSum[subgroupId] = count.x+sum;
+        }
+
+        memoryBarrierShared();
+        barrier();
+
+        if (gl_LocalInvocationID.x<8) {//NOTE(review): assumes 8 subgroups per workgroup (WORK_SIZE == 256?) - confirm
+            uint val = warpPrefixSum[gl_SubgroupInvocationID];
+            subgroupBarrier();
+            //Exclusive scan of the per-subgroup totals with a single subgroup operation
+            warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
+        }
+
+        memoryBarrierShared();
+        barrier();
+
+        //Add the combined total of all preceding subgroups, then write the result back in place
+        count += warpPrefixSum[subgroupId];
+        ioCount[gid] = count;
+    } else {//NOTE(review): taken for every subgroup size other than 32, but the code below hard-codes 64
+        #ifdef IS_INTEL
+        uint subgroupId = gl_LocalInvocationID.x>>6;
+        #else
+        uint subgroupId = gl_SubgroupID;
+        #endif
+
+        //todo: enforce the assumptions this path relies on
+        //assert(gl_SubgroupSize == 64);
+        //assert(gl_NumSubgroups == (WORK_SIZE>>6));
+
+        uint gid = gl_GlobalInvocationID.x;
+        uvec4 count = uvec4(0);
+        uint sum = 0;
+        {
+            uvec4 dat = ioCount[gid];
+            count.yzw = dat.xyz;//Build the exclusive prefix inside the vector: (0, x, x+y, x+y+z)
+            count.z += count.y;
+            count.w += count.z;
+            sum = count.w + dat.w;//Total of all four components of this invocation's element
+        }
+        subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
+
+        count += subgroupExclusiveAdd(sum);//Offset every component by the totals of the lower lanes in this subgroup
+
+        if (gl_SubgroupInvocationID==63) {//Last lane of a 64-wide subgroup: exclusive offset + own sum is the subgroup's total
+            warpPrefixSum[subgroupId] = count.x+sum;
+        }
+
+        memoryBarrierShared();
+        barrier();
+
+        if (gl_LocalInvocationID.x<4) {//NOTE(review): assumes 4 subgroups per workgroup (WORK_SIZE == 256?) - confirm
+            uint val = warpPrefixSum[gl_SubgroupInvocationID];
+            subgroupBarrier();
+            //Exclusive scan of the per-subgroup totals with a single subgroup operation
+            warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
+        }
+
+        memoryBarrierShared();
+        barrier();
+
+        //Add the combined total of all preceding subgroups, then write the result back in place
+        count += warpPrefixSum[subgroupId];
+        ioCount[gid] = count;
     }
-
-    barrier();
-    count += subgroupExclusiveAdd(sum);
-
-    if (gl_SubgroupInvocationID==31) {
-        warpPrefixSum[subgroupId] = count.x+sum;
-    }
-    memoryBarrierShared();
-    barrier();
-    uint val = warpPrefixSum[gl_SubgroupInvocationID];
-    barrier();
-    if (subgroupId == 0) {
-        //Use warp to do entire add in 1 reduction
-        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
-    }
-    memoryBarrierShared();
-    barrier();
-    count += warpPrefixSum[subgroupId];
-    ioCount[gid] = count;
-    */
-
-
-    #ifdef IS_INTEL
-    uint subgroupId = gl_LocalInvocationID.x>>5;
-    #else
-    uint subgroupId = gl_SubgroupID;
-    #endif
-
-    //todo
-    //assert(gl_SubgroupSize == 32);
-    //assert(gl_NumSubgroups == (WORK_SIZE>>5));
-
-    uint gid = gl_GlobalInvocationID.x;
-    uvec4 count = uvec4(0);
-    uint sum = 0;
-    {
-        uvec4 dat = ioCount[gid];
-        count.yzw = dat.xyz;
-        count.z += count.y;
-        count.w += count.z;
-        sum = count.w + dat.w;
-    }
-    subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
-
-    count += subgroupExclusiveAdd(sum);
-
-    if (gl_SubgroupInvocationID==31) {
-        warpPrefixSum[subgroupId] = count.x+sum;
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    if (subgroupId == 0) {
-        uint val = warpPrefixSum[gl_SubgroupInvocationID];
-        subgroupBarrier();
-        //Use warp to do entire add in 1 reduction
-        warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
-    }
-
-    memoryBarrierShared();
-    barrier();
-
-    //Add the computed sum across all threads and warps
-    count += warpPrefixSum[subgroupId];
-    ioCount[gid] = count;
 }
\ No newline at end of file