Greatly accelerated geometry cleaner gpu code
@@ -35,6 +35,7 @@ public class NodeCleaner {
 
 
     private static final int SORTING_WORKER_SIZE = 64;
+    private static final int WORK_PER_THREAD = 8;
     private static final int OUTPUT_COUNT = 256;
 
 
@@ -43,6 +44,7 @@ public class NodeCleaner {
 
     private final AutoBindingShader sorter = Shader.makeAuto(PrintfDebugUtil.PRINTF_processor)
             .define("WORK_SIZE", SORTING_WORKER_SIZE)
+            .define("ELEMS_PER_THREAD", WORK_PER_THREAD)
             .define("OUTPUT_SIZE", OUTPUT_COUNT)
            .define("VISIBILITY_BUFFER_BINDING", 1)
            .define("OUTPUT_BUFFER_BINDING", 2)
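With the constants above, each 64-thread workgroup of the sorter now scans WORK_SIZE * ELEMS_PER_THREAD = 64 * 8 = 512 candidate nodes, and the OPS_PER_THREAD batching introduced in the shader below works out to OUTPUT_SIZE / WORK_SIZE = 256 / 64 = 4 output slots per thread.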
@@ -134,7 +136,7 @@ public class NodeCleaner {
 
         //TODO: choose whether this is in nodeSpace or section/geometryId space
         //
-        glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + SORTING_WORKER_SIZE - 1) / SORTING_WORKER_SIZE, 1, 1);
+        glDispatchCompute((this.nodeManager.getCurrentMaxNodeId() + (SORTING_WORKER_SIZE+WORK_PER_THREAD) - 1) / (SORTING_WORKER_SIZE+WORK_PER_THREAD), 1, 1);
         glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
 
         this.resultTransformer.bind();
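For reference, a minimal CPU-side sketch of the round-up division used to size the dispatch above; the helper class and the sample node count are hypothetical, not part of the commit. The committed divisor is (SORTING_WORKER_SIZE + WORK_PER_THREAD) = 72, while a workgroup of WORK_SIZE threads each looping over ELEMS_PER_THREAD nodes covers WORK_SIZE * ELEMS_PER_THREAD = 512 entries, so the divisor is left as a parameter here:

    // Hypothetical sketch, not part of the commit: the classic round-up division
    // used to size a 1D glDispatchCompute call.
    final class DispatchMath {
        static int groupCount(int elementCount, int elementsPerGroup) {
            // (n + d - 1) / d rounds up so a trailing partial group is still dispatched.
            return (elementCount + elementsPerGroup - 1) / elementsPerGroup;
        }

        public static void main(String[] args) {
            int maxNodeId = 100_000; // hypothetical node count
            System.out.println(groupCount(maxNodeId, 64 + 8)); // divisor as written in the diff
            System.out.println(groupCount(maxNodeId, 64 * 8)); // divisor if each group covers WORK_SIZE * ELEMS_PER_THREAD nodes
        }
    }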
@@ -5,12 +5,12 @@
 //#define OUTPUT_SIZE 128
 
 layout(local_size_x=WORK_SIZE, local_size_y=1) in;
-//256 workgroup
+#define OPS_PER_THREAD (OUTPUT_SIZE/WORK_SIZE)
 
 #import <voxy:lod/hierarchical/node.glsl>
 
 layout(binding = VISIBILITY_BUFFER_BINDING, std430) restrict readonly buffer VisibilityDataBuffer {
-    uint[] visiblity;
+    uint[] visibility;
 };
 
 layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer MinimumVisibilityBuffer {//TODO: might need to be volatile
@@ -18,12 +18,15 @@ layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict volatile buffer Minimum
 };
 
 //Returns the id of the max value
-uint atomicDerefMaxExchange(uint atId, uint id) {
-    const uint value = visiblity[id];
+uint atomicDerefMaxExchangeGlobal(uint atId, uint id) {
+    const uint value = visibility[id];
     while (true) {
-        const uint existingId = minVisIds[atId];
+        const uint existingId = minVisIds[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
         //Check if the value is less than the dereferenced value, if its not, return our own id
-        if (visiblity[existingId] <= value) {
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
             return id;
         }
         //Attempt to swap, since we know we are less than the existingId
@@ -39,37 +42,70 @@ uint atomicDerefMaxExchange(uint atId, uint id) {
 }
 
 //TODO: optimize
-void bubbleSort(uint start, uint id) {
+void bubbleSortGlobal(uint start, uint id) {
     for (uint i = start; i < OUTPUT_SIZE; i++) {
-        id = atomicDerefMaxExchange(i, id);
+        id = atomicDerefMaxExchangeGlobal(i, id);
+        if (id == uint(-1)) {
+            break;
+        }
     }
 }
 
-void main() {
-    //if (gl_GlobalInvocationID.x <64) {
-    //    minVisIds[gl_GlobalInvocationID.x] = visiblity[gl_GlobalInvocationID.x];
-    //}
-    //First do a min sort/set of min OUTPUT_SIZE values of the set
-    uint vis = visiblity[gl_GlobalInvocationID.x];
-    if (vis == uint(-1)) {
-        return;
+//TODO: maybe also have a shared "cache" of the visibility data
+// meaning that the shader doesnt need to access global memory as much
+shared uint initalSort[OUTPUT_SIZE];
+
+//Returns the id of the max value
+uint atomicDerefMaxExchangeLocal(uint atId, uint id) {
+    const uint value = visibility[id];
+    while (true) {
+        const uint existingId = initalSort[atId];//TODO: check that this does what we want, and that it obtains the value of atValue, at this point
+        if (existingId == id) {//If we are trying to insert self, return -1
+            return uint(-1);
+        }
+        //Check if the value is less than the dereferenced value, if its not, return our own id
+        if (visibility[existingId&((1u<<31)-1)] <= value) {//Remove the flag bit
+            return id;
+        }
+        //Attempt to swap, since we know we are less than the existingId
+        const uint c = atomicCompSwap(initalSort[atId], existingId, id);
+        //Check if we did swap, else if we failed (or got reswapped else where) recheck
+
+        //We did swap, (since the original mem contents was the existing id)
+        // which means existingId is now the max of the ptr
+        if (c == existingId) {
+            return existingId;
+        }
     }
-    if (visiblity[minVisIds[OUTPUT_SIZE-1]] <= vis) {
-        return;
+}
+void bubbleSortInital(uint vis, uint id) {
+    uint start = 0;
+    //Fast path cut out half the ops
+    if (visibility[initalSort[(OUTPUT_SIZE-1)>>1]] <= vis) {//Check if we are more than half way
+        start = (OUTPUT_SIZE-1)>>1;
     }
 
+    for (uint i = start; i < OUTPUT_SIZE; i++) {
+        id = atomicDerefMaxExchangeLocal(i, id);
+    }
+}
+
+bool shouldSortId(uint id) {
     UnpackedNode node;
     if (unpackNode(node, gl_GlobalInvocationID.x)==uvec4(-1)) {
-        return;//Unallocated node
+        return false;//Unallocated node
     }
 
     if (isEmptyMesh(node) || (!hasMesh(node))) {//|| (!hasChildren(node))
-        return;
+        return false;
     }
     //TODO: FIXME: DONT HARDCODE TOP LEVEL LOD LEVEL
     if (node.lodLevel == 4) {// (!hasChildren(node)) -> Assume leaf node
-        return;//Cannot remove geometry from top level node
+        return false;//Cannot remove geometry from top level node
     }
 
 
     /*THIS IS COMPLETLY WRONG, we need to check if all the children of the parent of the child are leaf nodes
     // not this node
 
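The two atomicDerefMaxExchange variants above implement a lock-free "insert and carry the displaced id forward" step: a slot either keeps its current id (when its visibility value is already smaller or equal) or is swapped in via compare-and-swap, and the displaced id is bubbled into the following slots. A hypothetical CPU-side sketch of the same idea using java.util.concurrent; the class, the array sizes, and the omission of the external-flag masking are assumptions, not part of the commit:

    import java.util.concurrent.atomic.AtomicIntegerArray;

    // Hypothetical analogue of atomicDerefMaxExchangeGlobal + bubbleSortGlobal,
    // not part of the commit; visibility values are assumed non-negative and the
    // output slots are assumed to already hold valid ids.
    final class TopNInsert {
        static final int OUTPUT_SIZE = 256;                // mirrors OUTPUT_COUNT
        static final int[] visibility = new int[1 << 16];  // stand-in for the visibility SSBO
        static final AtomicIntegerArray minVisIds = new AtomicIntegerArray(OUTPUT_SIZE);

        // Returns the displaced id to push into later slots, or -1 when insertion is done.
        static int exchangeMax(int slot, int id) {
            int value = visibility[id];
            while (true) {
                int existingId = minVisIds.get(slot);
                if (existingId == id) return -1;                // already resident in this slot
                if (visibility[existingId] <= value) return id; // slot already holds a smaller value, try the next slot
                if (minVisIds.compareAndSet(slot, existingId, id)) {
                    return existingId;                          // we own the slot; bubble the displaced id onward
                }
                // CAS lost a race with another thread: re-read the slot and retry.
            }
        }

        static void insert(int id) {
            for (int slot = 0; slot < OUTPUT_SIZE && id != -1; slot++) {
                id = exchangeMax(slot, id);
            }
        }
    }

The GLSL version additionally tags ids copied from the global list with bit 31, which is why its lookups mask with ((1u<<31)-1) before indexing the visibility buffer.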
@@ -87,7 +123,56 @@ void main() {
     }
     */
 
-    bubbleSort(0, gl_GlobalInvocationID.x);
+    return true;
+}
+void main() {
+    //Cheaky trick, copy the _global buffer_ into the local buffer
+    // this means that insertion into the local buffer can be accelerated W.R.T global
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        //Copy in with warp size batch fetch
+        uint id = gl_LocalInvocationID.x + (i*WORK_SIZE);
+        initalSort[id] = minVisIds[id]|(1u<<31);//Flag the id as being external
+    }
+    barrier();
+    //Do insertion and sort into local shared buffer
+    for (uint i = 0; i < ELEMS_PER_THREAD; i++) {
+        uint id = gl_GlobalInvocationID.x*ELEMS_PER_THREAD+i;
+        uint vis = visibility[id];
+        if (vis == uint(-1)) {
+            continue;
+        }
+        //Quick exit if this element is already bigger than global output
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        if (!shouldSortId(id)) {
+            continue;
+        }
 
+        bubbleSortInital(vis, id);
+    }
+    barrier();
+    //Do insertion into global visibility array
+    // this is done front to back
 
+    //Work size batching
+    for (uint i = 0; i < OPS_PER_THREAD; i++) {
+        barrier();
+        uint id = gl_LocalInvocationID.x+(i*WORK_SIZE);
+        uint sid = initalSort[id];
+        if ((sid&(1u<<31)) != 0) {
+            //The flag being external was set, meaning we should NOT insert this element
+            continue;
+        }
+        uint vis = visibility[sid];
+        //If output is already smaller than self
+        if (visibility[minVisIds[OUTPUT_SIZE-1]] <= vis) {
+            continue;
+        }
+        uint start = id;
+        if (visibility[minVisIds[(id+OUTPUT_SIZE)>>1]] <= vis) {//Try to skip
+            start = (id+OUTPUT_SIZE)>>1;
+        }
+        bubbleSortGlobal(start, sid);//Insert into global
+    }
 }
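To summarise the flow of the new main(): each workgroup first copies the current global top-OUTPUT_SIZE list into shared memory with bit 31 set to mark those entries as external, then inserts its own WORK_SIZE * ELEMS_PER_THREAD candidates into that local list, and finally bubbles only the unflagged (locally won) entries back into the global list. A hypothetical single-threaded Java analogue of that three-phase pattern; the atomics, barriers, and the early-out checks are deliberately dropped and all names here are invented for illustration:

    // Hypothetical single-threaded analogue of the new main(), not part of the commit.
    final class TwoPhaseMerge {
        static final int EXTERNAL_FLAG = 1 << 31;

        static void mergeWorkgroup(int[] visibility, int[] minVisIds, int[] candidateIds) {
            int n = minVisIds.length;
            int[] localSort = new int[n];
            // Phase 1: seed the local scratch list from the global list, flagging every entry as external.
            for (int i = 0; i < n; i++) {
                localSort[i] = minVisIds[i] | EXTERNAL_FLAG;
            }
            // Phase 2: bubble each candidate into the scratch list (kept ordered by visibility value).
            for (int id : candidateIds) {
                int carry = id;
                for (int slot = 0; slot < n; slot++) {
                    int resident = localSort[slot];
                    if (visibility[resident & ~EXTERNAL_FLAG] <= visibility[carry & ~EXTERNAL_FLAG]) {
                        continue; // resident already holds a smaller (or equal) value, keep scanning
                    }
                    localSort[slot] = carry; // displaced entries keep their external flag as they move down
                    carry = resident;
                }
            }
            // Phase 3: push only the unflagged (locally inserted) entries back into the global list.
            for (int sid : localSort) {
                if ((sid & EXTERNAL_FLAG) != 0) {
                    continue; // came from the global list already, nothing to do
                }
                int carry = sid;
                for (int slot = 0; slot < n; slot++) {
                    if (visibility[minVisIds[slot]] <= visibility[carry]) {
                        continue;
                    }
                    int displaced = minVisIds[slot];
                    minVisIds[slot] = carry;
                    carry = displaced;
                }
            }
        }
    }

Flagging the copied-in entries means anything that merely round-trips from the global list is skipped in phase 3, so only genuinely new candidates pay for a global atomic insertion.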