Fix intel
This commit is contained in:
@@ -43,6 +43,10 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
|
|||||||
|
|
||||||
private static long getGeometryBufferSize() {
|
private static long getGeometryBufferSize() {
|
||||||
long geometryCapacity = Math.min((1L<<(64-Long.numberOfLeadingZeros(Capabilities.INSTANCE.ssboMaxSize-1)))<<1, 1L<<32)-1024/*(1L<<32)-1024*/;
|
long geometryCapacity = Math.min((1L<<(64-Long.numberOfLeadingZeros(Capabilities.INSTANCE.ssboMaxSize-1)))<<1, 1L<<32)-1024/*(1L<<32)-1024*/;
|
||||||
|
if (Capabilities.INSTANCE.isIntel) {
|
||||||
|
geometryCapacity = Math.max(geometryCapacity, 1L<<30);//intel moment, force min 1gb
|
||||||
|
}
|
||||||
|
|
||||||
//Limit to available dedicated memory if possible
|
//Limit to available dedicated memory if possible
|
||||||
if (Capabilities.INSTANCE.canQueryGpuMemory) {
|
if (Capabilities.INSTANCE.canQueryGpuMemory) {
|
||||||
//512mb less than available,
|
//512mb less than available,
|
||||||
|
|||||||
@@ -30,6 +30,7 @@ import static org.lwjgl.opengl.GL30.glBindVertexArray;
|
|||||||
import static org.lwjgl.opengl.GL31.GL_UNIFORM_BUFFER;
|
import static org.lwjgl.opengl.GL31.GL_UNIFORM_BUFFER;
|
||||||
import static org.lwjgl.opengl.GL33.glBindSampler;
|
import static org.lwjgl.opengl.GL33.glBindSampler;
|
||||||
import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER;
|
import static org.lwjgl.opengl.GL40C.GL_DRAW_INDIRECT_BUFFER;
|
||||||
|
import static org.lwjgl.opengl.GL42.glMemoryBarrier;
|
||||||
import static org.lwjgl.opengl.GL43.*;
|
import static org.lwjgl.opengl.GL43.*;
|
||||||
import static org.lwjgl.opengl.GL45.glBindTextureUnit;
|
import static org.lwjgl.opengl.GL45.glBindTextureUnit;
|
||||||
import static org.lwjgl.opengl.GL45.glClearNamedBufferData;
|
import static org.lwjgl.opengl.GL45.glClearNamedBufferData;
|
||||||
@@ -263,20 +264,6 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
|
|||||||
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);//Am unsure if this is needed
|
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);//Am unsure if this is needed
|
||||||
glDispatchCompute(1,1,1);
|
glDispatchCompute(1,1,1);
|
||||||
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
|
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
|
||||||
//glFinish();
|
|
||||||
/*
|
|
||||||
DownloadStream.INSTANCE.download(this.distanceCountBuffer, 0, 1024*4, (ptr,size)->{
|
|
||||||
int[] a = new int[1024];
|
|
||||||
for (int i = 0; i < 1024; i++) {
|
|
||||||
a[i] = MemoryUtil.memGetInt(ptr+4*i);
|
|
||||||
}
|
|
||||||
for (int i = 0; i < 1023; i++){
|
|
||||||
if (a[i+1]<a[i]) {
|
|
||||||
System.out.println(a[i]+","+a[i+1]);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
});
|
|
||||||
*/
|
|
||||||
|
|
||||||
this.translucentGenShader.bind();
|
this.translucentGenShader.bind();
|
||||||
glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id);
|
glBindBufferBase(GL_UNIFORM_BUFFER, 0, this.uniform.id);
|
||||||
@@ -287,6 +274,7 @@ public class MDICSectionRenderer extends AbstractSectionRenderer<MDICViewport, B
|
|||||||
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, this.distanceCountBuffer.id);
|
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 5, this.distanceCountBuffer.id);
|
||||||
|
|
||||||
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, this.drawCountCallBuffer.id);//This isn't great but it's a nice trick to bound it, even if it's inefficient ;-;
|
glBindBuffer(GL_DISPATCH_INDIRECT_BUFFER, this.drawCountCallBuffer.id);//This isn't great but it's a nice trick to bound it, even if it's inefficient ;-;
|
||||||
|
glMemoryBarrier(-1);
|
||||||
glDispatchComputeIndirect(0);
|
glDispatchComputeIndirect(0);
|
||||||
glMemoryBarrier(GL_COMMAND_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
|
glMemoryBarrier(GL_COMMAND_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -8,15 +8,17 @@
|
|||||||
//Does initial parallel prefix sum on batches of WORK_SIZE
|
//Does initial parallel prefix sum on batches of WORK_SIZE
|
||||||
layout(local_size_x=WORK_SIZE) in;
|
layout(local_size_x=WORK_SIZE) in;
|
||||||
|
|
||||||
layout(binding = IO_BUFFER, std430) restrict buffer InputBuffer {
|
layout(binding = IO_BUFFER, std430) buffer InputBuffer {
|
||||||
uvec4[] ioCount;
|
uvec4[] ioCount;
|
||||||
};
|
};
|
||||||
|
|
||||||
shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
|
shared uint warpPrefixSum[32];//Warps are 32, tricks require full warp
|
||||||
|
|
||||||
void main() {
|
void main() {
|
||||||
|
/*
|
||||||
|
uint subgroupId = gl_LocalInvocationID.x>>5;
|
||||||
warpPrefixSum[gl_SubgroupInvocationID] = 0;
|
warpPrefixSum[gl_SubgroupInvocationID] = 0;
|
||||||
barrier();
|
memoryBarrierShared();
|
||||||
|
|
||||||
//todo
|
//todo
|
||||||
//assert(gl_SubgroupSize == 32);
|
//assert(gl_SubgroupSize == 32);
|
||||||
@@ -33,33 +35,69 @@ void main() {
|
|||||||
sum = count.w + dat.w;
|
sum = count.w + dat.w;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
barrier();
|
||||||
|
count += subgroupExclusiveAdd(sum);
|
||||||
|
|
||||||
|
if (gl_SubgroupInvocationID==31) {
|
||||||
|
warpPrefixSum[subgroupId] = count.x+sum;
|
||||||
|
}
|
||||||
|
memoryBarrierShared();
|
||||||
|
barrier();
|
||||||
|
uint val = warpPrefixSum[gl_SubgroupInvocationID];
|
||||||
|
barrier();
|
||||||
|
if (subgroupId == 0) {
|
||||||
|
//Use warp to do entire add in 1 reduction
|
||||||
|
warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
|
||||||
|
}
|
||||||
|
memoryBarrierShared();
|
||||||
|
barrier();
|
||||||
|
count += warpPrefixSum[subgroupId];
|
||||||
|
ioCount[gid] = count;
|
||||||
|
*/
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef IS_INTEL
|
||||||
|
uint subgroupId = gl_LocalInvocationID.x>>5;
|
||||||
|
#else
|
||||||
|
uint subgroupId = gl_SubgroupID;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//todo
|
||||||
|
//assert(gl_SubgroupSize == 32);
|
||||||
|
//assert(gl_NumSubgroups == (WORK_SIZE>>5));
|
||||||
|
|
||||||
|
uint gid = gl_GlobalInvocationID.x;
|
||||||
|
uvec4 count = uvec4(0);
|
||||||
|
uint sum = 0;
|
||||||
|
{
|
||||||
|
uvec4 dat = ioCount[gid];
|
||||||
|
count.yzw = dat.xyz;
|
||||||
|
count.z += count.y;
|
||||||
|
count.w += count.z;
|
||||||
|
sum = count.w + dat.w;
|
||||||
|
}
|
||||||
subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
|
subgroupBarrier();//Wait for all threads in the subgroup to get the buffer
|
||||||
|
|
||||||
count += subgroupExclusiveAdd(sum);
|
count += subgroupExclusiveAdd(sum);
|
||||||
|
|
||||||
if ((gl_LocalInvocationID.x&31u)==31) {
|
if (gl_SubgroupInvocationID==31) {
|
||||||
warpPrefixSum[gl_SubgroupID] = count.x+sum;
|
warpPrefixSum[subgroupId] = count.x+sum;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
memoryBarrierShared();
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
#ifdef IS_INTEL
|
|
||||||
uint val = subgroupExclusiveAdd(warpPrefixSum[gl_SubgroupInvocationID]);
|
|
||||||
barrier();
|
|
||||||
if (gl_SubgroupID == 0) {
|
|
||||||
warpPrefixSum[gl_SubgroupInvocationID] = val;
|
|
||||||
}
|
|
||||||
#else
|
|
||||||
if (gl_SubgroupID == 0) {
|
if (gl_SubgroupID == 0) {
|
||||||
uint val = warpPrefixSum[gl_SubgroupInvocationID];
|
uint val = warpPrefixSum[gl_SubgroupInvocationID];
|
||||||
subgroupBarrier();
|
subgroupBarrier();
|
||||||
//Use warp to do entire add in 1 reduction
|
//Use warp to do entire add in 1 reduction
|
||||||
warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
|
warpPrefixSum[gl_SubgroupInvocationID] = subgroupExclusiveAdd(val);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
|
memoryBarrierShared();
|
||||||
barrier();
|
barrier();
|
||||||
|
|
||||||
//Add the computed sum across all threads and warps
|
//Add the computed sum across all threads and warps
|
||||||
count += warpPrefixSum[gl_SubgroupID];
|
count += warpPrefixSum[subgroupId];
|
||||||
ioCount[gid] = count;
|
ioCount[gid] = count;
|
||||||
}
|
}
|
||||||
Reference in New Issue
Block a user