compute scatter based setting

This commit is contained in:
mcrcortex
2025-05-14 20:19:50 +10:00
parent 0d27de5ba7
commit 3221702fc2
4 changed files with 198 additions and 85 deletions

View File

@@ -6,6 +6,8 @@ import it.unimi.dsi.fastutil.ints.IntConsumer;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import me.cortex.voxy.client.core.gl.GlBuffer;
import me.cortex.voxy.client.core.gl.shader.Shader;
import me.cortex.voxy.client.core.gl.shader.ShaderType;
import me.cortex.voxy.client.core.rendering.ISectionWatcher;
import me.cortex.voxy.client.core.rendering.building.BuiltSection;
import me.cortex.voxy.client.core.rendering.section.geometry.BasicAsyncGeometryManager;
@@ -15,6 +17,7 @@ import me.cortex.voxy.client.core.rendering.util.UploadStream;
import me.cortex.voxy.common.Logger;
import me.cortex.voxy.common.util.MemoryBuffer;
import me.cortex.voxy.common.world.WorldSection;
import net.fabricmc.loader.impl.util.log.Log;
import org.lwjgl.system.MemoryUtil;
import java.lang.invoke.MethodHandles;
@@ -25,6 +28,16 @@ import java.util.concurrent.locks.LockSupport;
import java.util.concurrent.locks.StampedLock;
import static me.cortex.voxy.client.core.rendering.section.geometry.BasicSectionGeometryData.SECTION_METADATA_SIZE;
import static org.lwjgl.opengl.ARBUniformBufferObject.glBindBufferBase;
import static org.lwjgl.opengl.GL30C.glUniform1ui;
import static org.lwjgl.opengl.GL42C.GL_UNIFORM_BARRIER_BIT;
import static org.lwjgl.opengl.GL42C.glMemoryBarrier;
import static org.lwjgl.opengl.GL43C.*;
import static org.lwjgl.opengl.GL44.GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
//TODO: create an "async upload stream", that is, the upload stream is a raw mapped buffer pointer that can be written to
// which is then synced to the gpu on "render thread sync",
//An "async host" for a NodeManager, has specific synchonius entry and exit points
// this is done off thread to reduce the amount of work done on the render thread, improving frame stability and reducing runtime overhead
@@ -127,6 +140,13 @@ public class AsyncNodeManager {
return resultSet;
}
private final Shader scatterWrite = Shader.make()
.define("INPUT_BUFFER_BINDING", 0)
.define("OUTPUT_BUFFER1_BINDING", 1)
.define("OUTPUT_BUFFER2_BINDING", 2)
.add(ShaderType.COMPUTE, "voxy:util/scatter.comp")
.compile();
private void run() {
if (this.workCounter.get() == 0) {
LockSupport.park();
@@ -192,7 +212,9 @@ public class AsyncNodeManager {
job.release();
} while (true);
for (int limit = 0; limit < 200; limit++) {//Limit uploading
final int UPLOAD_LIMIT = 200;
for (int limit = 0; limit < UPLOAD_LIMIT/2; limit++) //Limit uploading, TODO: limit this by frame sync count, not here
{
var job = this.geometryUpdateQueue.poll();
if (job == null)
break;
@@ -200,7 +222,7 @@ public class AsyncNodeManager {
this.manager.processGeometryResult(job);
}
for (int limit = 0; limit < 2; limit++) {
while (true) {//Process all request batches
var job = this.requestBatchQueue.poll();
if (job == null)
break;
@@ -368,22 +390,16 @@ public class AsyncNodeManager {
var iter = ids.intIterator();
while (iter.hasNext()) {
int val = iter.nextInt();
int placeId = results.geometryIdUpdateMap.putIfAbsent(val, results.geometryIdUpdateMap.size());
placeId = placeId==-1?results.geometryIdUpdateMap.size()-1:placeId;
if (results.geometryIdUpdateData.size<=placeId*32L) {
//We need to expand the buffer :(
var old = results.geometryIdUpdateData;
var newBuffer = new MemoryBuffer((long) (old.size*1.5));
Logger.info("Expanding geometry update buffer to " + newBuffer.size);
old.cpyTo(newBuffer.address);
old.free();
results.geometryIdUpdateData = newBuffer;
}
//Write updated data
this.geometryManager.writeMetadata(val, placeId*32L + results.geometryIdUpdateData.address);
int scatterAddr = (val<<1)|(1<<31);//Since we write to the second buffer
//Geometry buffer is index of 1, so mutate to put it in that location, it is also 32 bytes, so needs to be split into 2 separate scatter writes
long ptrA = results.getScatterWritePtr(scatterAddr+0, 1);
long ptrB = results.getScatterWritePtr(scatterAddr+1, 0);
//Write update data
this.geometryManager.writeMetadataSplit(val, ptrA, ptrB);
}
ids.clear();
this.needsWaitForSync |= results.geometryIdUpdateMap.size()>250;
}
//Node updates
@@ -392,28 +408,20 @@ public class AsyncNodeManager {
var iter = ids.intIterator();
while (iter.hasNext()) {
int val = iter.nextInt();
int placeId = results.nodeIdUpdateMap.putIfAbsent(val, results.nodeIdUpdateMap.size());
placeId = placeId==-1?results.nodeIdUpdateMap.size()-1:placeId;
if (results.nodeIdUpdateData.size<=placeId*16L) {
//We need to expand the buffer :(
var old = results.nodeIdUpdateData;
var newBuffer = new MemoryBuffer((long) (old.size*1.5));
Logger.info("Expanding node update buffer to " + newBuffer.size);
old.cpyTo(newBuffer.address);
old.free();
results.nodeIdUpdateData = newBuffer;
}
//Dont need to modify the write location since we write to buffer 0
long ptr = results.getScatterWritePtr(val);
//Write updated data
this.manager.writeNode(val, placeId*16L + results.nodeIdUpdateData.address);
this.manager.writeNode(val, ptr);
}
ids.clear();
this.needsWaitForSync |= results.nodeIdUpdateMap.size()>=512;
}
}
results.geometrySectionCount = this.geometryManager.getSectionCount();
results.currentMaxNodeId = this.manager.getCurrentMaxNodeId();
this.needsWaitForSync |= results.geometryUploads.size() > UPLOAD_LIMIT;//Max of 200 uploads per frame :(
if (!RESULT_HANDLE.compareAndSet(this, null, results)) {
throw new IllegalArgumentException("Should always have null");
}
@@ -441,9 +449,9 @@ public class AsyncNodeManager {
//Dont need to clear as is not used again
}
boolean doCommit = false;
{//Update basic geometry data
var store = (BasicSectionGeometryData)this.geometryData;
store.setSectionCount(results.geometrySectionCount);
//Do geometry uploads
@@ -455,35 +463,27 @@ public class AsyncNodeManager {
UploadStream.INSTANCE.upload(store.getGeometryBuffer(), Integer.toUnsignedLong(val.getIntKey()) * 8L, buffer);
buffer.free();//Free the buffer was uploading
}
doCommit = true;
}
//Do geometry id updates
if (!results.geometryIdUpdateMap.isEmpty()) {
var iter = results.geometryIdUpdateMap.int2IntEntrySet().fastIterator();
while (iter.hasNext()) {
var val = iter.next();
long ptr = UploadStream.INSTANCE.upload(store.getMetadataBuffer(), Integer.toUnsignedLong(val.getIntKey()) * SECTION_METADATA_SIZE, SECTION_METADATA_SIZE);
MemoryUtil.memCopy(results.geometryIdUpdateData.address + Integer.toUnsignedLong(val.getIntValue()) * SECTION_METADATA_SIZE, ptr, SECTION_METADATA_SIZE);
}
doCommit = true;
UploadStream.INSTANCE.commit();
}
}
//Do node id updates
if (!results.nodeIdUpdateMap.isEmpty()) {
var iter = results.nodeIdUpdateMap.int2IntEntrySet().fastIterator();
while (iter.hasNext()) {
var val = iter.next();
long ptr = UploadStream.INSTANCE.upload(nodeBuffer, Integer.toUnsignedLong(val.getIntKey()) * 16L, 16L);
MemoryUtil.memCopy(results.nodeIdUpdateData.address + Integer.toUnsignedLong(val.getIntValue()) * 16L, ptr, 16L);
}
doCommit = true;
}
if (!results.scatterWriteLocationMap.isEmpty()) {//Scatter write
int count = results.scatterWriteLocationMap.size();//Number of writes, not chunks or uvec4 count
int chunks = (count+3)/4;
int streamSize = chunks*80;//80 bytes per chunk, it is guaranteed the buffer is big enough
long ptr = UploadStream.INSTANCE.rawUploadAddress(streamSize + 16);//Ensure it is 16 byte aligned
ptr = (ptr+15L)&~0xFL;//Align up to 16 bytes
MemoryUtil.memCopy(results.scatterWriteBuffer.address, UploadStream.INSTANCE.getBaseAddress() + ptr, streamSize);
UploadStream.INSTANCE.commit();//Commit the buffer
if (doCommit) {
UploadStream.INSTANCE.commit();
this.scatterWrite.bind();
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, UploadStream.INSTANCE.getRawBufferId(), ptr, streamSize);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, nodeBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ((BasicSectionGeometryData) this.geometryData).getMetadataBuffer().id);
glUniform1ui(0, count);
glMemoryBarrier(GL_UNIFORM_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT|GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
glDispatchCompute((count+127)/128, 1, 1);
glMemoryBarrier(GL_UNIFORM_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
}
//Insert the result set into the cache
@@ -619,20 +619,20 @@ public class AsyncNodeManager {
if (RESULT_HANDLE.get(this) != null) {
var result = (SyncResults)RESULT_HANDLE.getAndSet(this, null);
result.geometryUploads.forEach((a,b)->b.free());
result.nodeIdUpdateData.free();
result.geometryIdUpdateData.free();
result.scatterWriteBuffer.free();
}
if (RESULT_CACHE_1_HANDLE.get(this) != null) {//Clear cache 1
var result = (SyncResults)RESULT_CACHE_1_HANDLE.getAndSet(this, null);
result.nodeIdUpdateData.free();
result.geometryIdUpdateData.free();
result.scatterWriteBuffer.free();
}
if (RESULT_CACHE_2_HANDLE.get(this) != null) {//Clear cache 2
var result = (SyncResults)RESULT_CACHE_2_HANDLE.getAndSet(this, null);
result.nodeIdUpdateData.free();
result.geometryIdUpdateData.free();
result.scatterWriteBuffer.free();
}
this.scatterWrite.free();
}
//Results object, which is to be synced between the render thread and worker thread
@@ -644,8 +644,6 @@ public class AsyncNodeManager {
// cleaner move and set operations
//Node id updates + size
private final Int2IntOpenHashMap nodeIdUpdateMap = new Int2IntOpenHashMap();//node id to update data location
private MemoryBuffer nodeIdUpdateData = new MemoryBuffer(8192*2);//capacity for 1024 entries, TODO: ADD RESIZE
private int currentMaxNodeId;// the id of the ending of the node ids
//TLN add/rem
@@ -654,22 +652,60 @@ public class AsyncNodeManager {
//Deltas for geometry store
private int geometrySectionCount;
private final Int2ObjectOpenHashMap<MemoryBuffer> geometryUploads = new Int2ObjectOpenHashMap<>();
private final Int2IntOpenHashMap geometryIdUpdateMap = new Int2IntOpenHashMap();//geometry id to update data location
private MemoryBuffer geometryIdUpdateData = new MemoryBuffer(8192*2);//capacity for 512 entries, TODO: ADD RESIZE
//Scatter writes for both geometry and node metadata
private MemoryBuffer scatterWriteBuffer = new MemoryBuffer(8192*2);
private final Int2IntOpenHashMap scatterWriteLocationMap = new Int2IntOpenHashMap(1024);
public SyncResults() {
this.nodeIdUpdateMap.defaultReturnValue(-1);
this.geometryIdUpdateMap.defaultReturnValue(-1);
this.scatterWriteLocationMap.defaultReturnValue(-1);
}
public void reset() {
this.nodeIdUpdateMap.clear();
this.scatterWriteLocationMap.clear();
this.currentMaxNodeId = 0;
this.tlnDelta.clear();
this.geometrySectionCount = 0;
this.geometryUploads.clear();
this.geometryIdUpdateMap.clear();
}
//Get or create a scatter write address for the given location
public long getScatterWritePtr(int location) {
return this.getScatterWritePtr(location, 0);
}
//ensureExtra is used to ensure that allocations are "effectivly" in the same memory block (kinda?)
public long getScatterWritePtr(int location, int ensureExtra) {
int loc = this.scatterWriteLocationMap.get(location);
if (loc == -1) {//Location doesnt exist, create it
this.ensureScatterBufferCapacity(1+ensureExtra);//Ensure can contain capacity for this + extra
int baseId = this.scatterWriteLocationMap.size();
int chunkBase = (baseId/4)*5;//Base uvec4 index
int innerId = baseId&3;
MemoryUtil.memPutInt(this.scatterWriteBuffer.address + (chunkBase*16L) + (innerId*4L), location);//Set the write location
int writeLocation = (chunkBase+1+innerId);//Write location in uvec4
this.scatterWriteLocationMap.put(location, writeLocation);
return this.scatterWriteBuffer.address + (writeLocation*16L);
} else {
return this.scatterWriteBuffer.address + (16L*loc);
}
}
private void ensureScatterBufferCapacity(int extra) {
int requiredChunks = ((this.scatterWriteLocationMap.size()+extra)+3)/4;//4 entries in a chunk
long requiredSize = requiredChunks*5L*16L;//5 uvec4 per chunk, 16 bytes per uvec4
if (this.scatterWriteBuffer.size <= requiredSize) {//Needs resize
long newSize = (long) ((this.scatterWriteBuffer.size*1.5) + extra*80L);
newSize = ((newSize+79)/80)*80;//Ceil to chunk size
Logger.info("Expanding scatter update buffer to " + newSize);
var newBuffer = new MemoryBuffer(newSize);
this.scatterWriteBuffer.cpyTo(newBuffer.address);
this.scatterWriteBuffer.free();
this.scatterWriteBuffer = newBuffer;
}
}
}
}

View File

@@ -140,18 +140,36 @@ public class BasicAsyncGeometryManager implements IGeometryManager {
}
}
public void writeMetadataSplit(int sectionId, long ptrA, long ptrB) {
if (SECTION_METADATA_SIZE != 32) {//This system only works with 32 byte metadata
throw new IllegalStateException();
}
var sec = this.sectionMetadata.get(sectionId);
if (sec == null) {
//Write nothing
MemoryUtil.memSet(ptrA, 0, 16);
MemoryUtil.memSet(ptrB, 0, 16);
} else {
sec.writeMetadataSplitParts(ptrA, ptrB);
}
}
private record SectionMeta(long position, int aabb, int geometryPtr, int itemCount, int[] offsets, byte childExistence) {
public void writeMetadata(long ptr) {
//Split the long into 2 ints to solve endian issues
MemoryUtil.memPutInt(ptr, (int) (this.position>>32)); ptr += 4;
MemoryUtil.memPutInt(ptr, (int) this.position); ptr += 4;
MemoryUtil.memPutInt(ptr, (int) this.aabb); ptr += 4;
MemoryUtil.memPutInt(ptr, this.geometryPtr + this.offsets[0]); ptr += 4;
this.writeMetadataSplitParts(ptr, ptr+16);
}
MemoryUtil.memPutInt(ptr, (this.offsets[1]-this.offsets[0])|((this.offsets[2]-this.offsets[1])<<16)); ptr += 4;
MemoryUtil.memPutInt(ptr, (this.offsets[3]-this.offsets[2])|((this.offsets[4]-this.offsets[3])<<16)); ptr += 4;
MemoryUtil.memPutInt(ptr, (this.offsets[5]-this.offsets[4])|((this.offsets[6]-this.offsets[5])<<16)); ptr += 4;
MemoryUtil.memPutInt(ptr, (this.offsets[7]-this.offsets[6])|((this.itemCount -this.offsets[7])<<16)); ptr += 4;
public void writeMetadataSplitParts(long ptrA, long ptrB) {//First 16 bytes are put into ptrA the remaining 16 bytes are put into ptrB
//Split the long into 2 ints to solve endian issues
MemoryUtil.memPutInt(ptrA, (int) (this.position>>32)); ptrA += 4;
MemoryUtil.memPutInt(ptrA, (int) this.position); ptrA += 4;
MemoryUtil.memPutInt(ptrA, (int) this.aabb); ptrA += 4;
MemoryUtil.memPutInt(ptrA, this.geometryPtr + this.offsets[0]); ptrA += 4;
MemoryUtil.memPutInt(ptrB, (this.offsets[1]-this.offsets[0])|((this.offsets[2]-this.offsets[1])<<16)); ptrB += 4;
MemoryUtil.memPutInt(ptrB, (this.offsets[3]-this.offsets[2])|((this.offsets[4]-this.offsets[3])<<16)); ptrB += 4;
MemoryUtil.memPutInt(ptrB, (this.offsets[5]-this.offsets[4])|((this.offsets[6]-this.offsets[5])<<16)); ptrB += 4;
MemoryUtil.memPutInt(ptrB, (this.offsets[7]-this.offsets[6])|((this.itemCount -this.offsets[7])<<16)); ptrB += 4;
}
}
}

View File

@@ -40,11 +40,25 @@ public class UploadStream {
public void upload(GlBuffer buffer, long destOffset, MemoryBuffer data) {//Note: does not free data, nor does it commit
data.cpyTo(this.upload(buffer, destOffset, data.size));
}
public long upload(GlBuffer buffer, long destOffset, long size) {
if (destOffset<0) {
throw new IllegalArgumentException();
long addr = this.rawUploadAddress((int) size);
this.uploadList.add(new UploadData(buffer, addr, destOffset, size));
return this.uploadBuffer.addr() + addr;
}
public long rawUpload(int size) {
return this.uploadBuffer.addr() + this.rawUploadAddress(size);
}
public long rawUploadAddress(int size) {
if (size < 0) {
throw new IllegalStateException("Negative size");
}
if (size > Integer.MAX_VALUE) {
if (size > this.uploadBuffer.size()) {
throw new IllegalArgumentException();
}
@@ -78,12 +92,9 @@ public class UploadStream {
throw new IllegalStateException();
}
this.uploadList.add(new UploadData(buffer, addr, destOffset, size));
return this.uploadBuffer.addr() + addr;
return addr;
}
public void commit() {
if (this.uploadList.isEmpty()) {
return;
@@ -127,6 +138,14 @@ public class UploadStream {
}
}
public long getBaseAddress() {
return this.uploadBuffer.addr();
}
public int getRawBufferId() {
return this.uploadBuffer.id;
}
private record UploadFrame(GlFence fence, LongArrayList allocations) {}
private record UploadData(GlBuffer target, long uploadOffset, long targetOffset, long size) {}

View File

@@ -0,0 +1,40 @@
#version 460 core
layout(local_size_x=128) in;
//In data is layed out in chunks, 4 dest locations (packed into a uvec4) followed by 4 data values, each a uvec4
// meaning each chunk is 5 uvec4's or 80 bytes
layout(binding = INPUT_BUFFER_BINDING, std430) restrict readonly buffer InputBuffer {
uvec4[] inData;
};
layout(binding = OUTPUT_BUFFER1_BINDING, std430) restrict writeonly buffer OutputBuffer1 {
uvec4[] output1;
};
layout(binding = OUTPUT_BUFFER2_BINDING, std430) restrict writeonly buffer OutputBuffer2 {
uvec4[] output2;
};
layout(location=0) uniform uint count;
void main() {
uint id = gl_GlobalInvocationID.x;
if (count <= id) {
return;
}
uint chunkBase = (id>>2u)*5u;
uint innerId = id&3u;
uint writeLocation = inData[chunkBase][innerId];//Select the write index based on the inner id
uvec4 writeData = inData[chunkBase+1+innerId];//+1 since the base is the write locations
uint writeBuffer = writeLocation>>31;//Top bit indicates what buffer to write to
writeLocation &= (1u<<31)-1;//Remove the buffer bit
if (writeBuffer == 0) {//Buffer 1
output1[writeLocation] = writeData;
} else {//Buffer 2
output2[writeLocation] = writeData;
}
}