diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java index c5002ff4..ab0c9dde 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/RenderService.java @@ -43,6 +43,10 @@ public class RenderService, J extends Vi private static long getGeometryBufferSize() { long geometryCapacity = Math.min((1L<<(64-Long.numberOfLeadingZeros(Capabilities.INSTANCE.ssboMaxSize-1)))<<1, 1L<<32)-1024/*(1L<<32)-1024*/; + if (Capabilities.INSTANCE.isIntel) { + geometryCapacity = Math.max(geometryCapacity, 1L<<30);//intel moment, force min 1gb + } + //Limit to available dedicated memory if possible if (Capabilities.INSTANCE.canQueryGpuMemory) { //512mb less than avalible, diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java b/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java index 4665f4fe..d3bcda89 100644 --- a/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java +++ b/src/main/java/me/cortex/voxy/client/core/rendering/section/MDICSectionRenderer.java @@ -30,6 +30,7 @@ import static org.lwjgl.opengl.GL30.glBindVertexArray; import static org.lwjgl.opengl.GL31.GL_UNIFORM_BUFFER; import static org.lwjgl.opengl.GL33.glBindSampler; import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER; +import static org.lwjgl.opengl.GL42.glMemoryBarrier; import static org.lwjgl.opengl.GL43.*; import static org.lwjgl.opengl.GL45.glBindTextureUnit; import static org.lwjgl.opengl.GL45.glClearNamedBufferData; @@ -263,20 +264,6 @@ public class MDICSectionRenderer extends AbstractSectionRenderer{ - int[] a = new int[1024]; - for (int i = 0; i < 1024; i++) { - a[i] = MemoryUtil.memGetInt(ptr+4*i); - } - for (int i = 0; i < 1023; i++){ - if (a[i+1]>5; warpPrefixSum[gl_SubgroupInvocationID] = 0; - barrier(); + memoryBarrierShared(); //todo //assert(gl_SubgroupSize == 32); @@ -33,33 +35,69 @@ void main() { sum = count.w + dat.w; } + barrier(); + count += subgroupExclusiveAdd(sum); + + if (gl_SubgroupInvocationID==31) { + warpPrefixSum[subgroupId] = count.x+sum; + } + memoryBarrierShared(); + barrier(); + uint val = warpPrefixSum[gl_SubgroupInvocationID]; + barrier(); + if (subgroupId == 0) { + //Use warp to do entire add in 1 reduction + warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val); + } + memoryBarrierShared(); + barrier(); + count += warpPrefixSum[subgroupId]; + ioCount[gid] = count; + */ + + + #ifdef IS_INTEL + uint subgroupId = gl_LocalInvocationID.x>>5; + #else + uint subgroupId = gl_SubgroupID; + #endif + + //todo + //assert(gl_SubgroupSize == 32); + //assert(gl_NumSubgroups == (WORK_SIZE>>5)); + + uint gid = gl_GlobalInvocationID.x; + uvec4 count = uvec4(0); + uint sum = 0; + { + uvec4 dat = ioCount[gid]; + count.yzw = dat.xyz; + count.z += count.y; + count.w += count.z; + sum = count.w + dat.w; + } subgroupBarrier();//Wait for all threads in the subgroup to get the buffer count += subgroupExclusiveAdd(sum); - if ((gl_LocalInvocationID.x&31u)==31) { - warpPrefixSum[gl_SubgroupID] = count.x+sum; + if (gl_SubgroupInvocationID==31) { + warpPrefixSum[subgroupId] = count.x+sum; } + memoryBarrierShared(); barrier(); - #ifdef IS_INTEL - uint val = subgroupExclusiveAdd(warpPrefixSum[gl_SubgroupInvocationID]); - barrier(); - if (gl_SubgroupID == 0) { - warpPrefixSum[gl_SubgroupInvocationID] = val; - } - #else if (gl_SubgroupID == 0) { uint val = warpPrefixSum[gl_SubgroupInvocationID]; subgroupBarrier(); //Use warp to do entire add in 1 reduction warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val); } - #endif + memoryBarrierShared(); barrier(); + //Add the computed sum across all threads and warps - count += warpPrefixSum[gl_SubgroupID]; + count += warpPrefixSum[subgroupId]; ioCount[gid] = count; } \ No newline at end of file