Added gpu compute memcpy + cpu side timing statistics

This commit is contained in:
mcrcortex
2025-05-17 11:35:56 +10:00
parent f3aecbe944
commit 0b1d8b9fd9
5 changed files with 269 additions and 54 deletions

View File

@@ -75,6 +75,17 @@ public class TimingStatistics {
public static TimeSampler dynamic = new TimeSampler();
public static TimeSampler postDynamic = new TimeSampler();
//General purpose CPU-side samplers, identified only by letter. Call sites visible alongside these declarations:
// A: AsyncNodeManager, staging + compute dispatch of the batched geometry upload copy
// B: AsyncNodeManager, scatter write section
// C: AsyncNodeManager, cleaner id updates (cleaner.updateIds)
// D: RenderService, DownloadStream.INSTANCE.tick()
// E: VoxyRenderSystem, chunkBoundRenderer.render(viewport)
// F: VoxyRenderSystem, post processing (setup, computeSSAO and renderPost are each wrapped separately)
// G: section renderer draw work (renderOpaque and renderTemporal in RenderService, renderFarAwayTranslucent in VoxyRenderSystem)
// H: RenderService, sectionRenderer.buildDrawCalls(viewport)
// I: RenderService, traversal.doTraversal(viewport, depthBuffer)
//NOTE(review): F and G are started/stopped several times within one frame, presumably TimeSampler accumulates
// across start/stop pairs until update() is called -- confirm against TimeSampler
public static TimeSampler A = new TimeSampler();
public static TimeSampler B = new TimeSampler();
public static TimeSampler C = new TimeSampler();
public static TimeSampler D = new TimeSampler();
public static TimeSampler E = new TimeSampler();
public static TimeSampler F = new TimeSampler();
public static TimeSampler G = new TimeSampler();
public static TimeSampler H = new TimeSampler();
public static TimeSampler I = new TimeSampler();
public static void update() {
updateSamplers();

View File

@@ -188,20 +188,32 @@ public class VoxyRenderSystem {
throw new IllegalStateException("Cannot use the default framebuffer as cannot source from it");
}
TimingStatistics.E.start();
this.chunkBoundRenderer.render(viewport);
TimingStatistics.E.stop();
TimingStatistics.F.start();
this.postProcessing.setup(target.textureWidth, target.textureHeight, boundFB);
TimingStatistics.F.stop();
this.renderer.renderFarAwayOpaque(viewport, this.chunkBoundRenderer.getDepthBoundTexture(), startTime);
TimingStatistics.F.start();
//Compute the SSAO of the rendered terrain, TODO: fix it breaking depth or breaking _something_ am not sure what
this.postProcessing.computeSSAO(viewport.MVP);
TimingStatistics.F.stop();
TimingStatistics.G.start();
//We can render the translucent directly after as it is the furthest translucent objects
this.renderer.renderFarAwayTranslucent(viewport, this.chunkBoundRenderer.getDepthBoundTexture());
TimingStatistics.G.stop();
TimingStatistics.F.start();
this.postProcessing.renderPost(projection, matrices.projection(), boundFB);
TimingStatistics.F.stop();
TimingStatistics.main.stop();
TimingStatistics.postDynamic.start();
@@ -245,6 +257,8 @@ public class VoxyRenderSystem {
{
TimingStatistics.update();
debug.add("Voxy frame runtime (millis): " + TimingStatistics.dynamic.pVal() + ", " + TimingStatistics.main.pVal()+ ", " + TimingStatistics.postDynamic.pVal()+ ", " + TimingStatistics.all.pVal());
debug.add("Extra time: " + TimingStatistics.A.pVal() + ", " + TimingStatistics.B.pVal() + ", " + TimingStatistics.C.pVal() + ", " + TimingStatistics.D.pVal());
debug.add("Extra 2 time: " + TimingStatistics.E.pVal() + ", " + TimingStatistics.F.pVal() + ", " + TimingStatistics.G.pVal() + ", " + TimingStatistics.H.pVal() + ", " + TimingStatistics.I.pVal());
}
PrintfDebugUtil.addToOut(debug);
}

View File

@@ -112,7 +112,9 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
// the section renderer is as it might have different backends, but they all accept a buffer containing the section list
TimingStatistics.G.start();
this.sectionRenderer.renderOpaque(viewport, depthBoundTexture);
TimingStatistics.G.stop();
//NOTE: need to do the upload and download tick here, after the section renderer renders the world, to ensure "stable"
// sections
@@ -140,8 +142,10 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
}*/
TimingStatistics.D.start();
//Tick download stream
DownloadStream.INSTANCE.tick();
TimingStatistics.D.stop();
this.nodeManager.tick(this.traversal.getNodeBuffer(), this.nodeCleaner);
//glFlush();
@@ -158,10 +162,17 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
if (depthBuffer == 0) {
depthBuffer = glGetFramebufferAttachmentParameteri(GL_FRAMEBUFFER, GL_DEPTH_ATTACHMENT, GL_FRAMEBUFFER_ATTACHMENT_OBJECT_NAME);
}
TimingStatistics.I.start();
this.traversal.doTraversal(viewport, depthBuffer);
TimingStatistics.I.stop();
TimingStatistics.H.start();
this.sectionRenderer.buildDrawCalls(viewport);
TimingStatistics.H.stop();
TimingStatistics.G.start();
this.sectionRenderer.renderTemporal(depthBoundTexture);
TimingStatistics.G.stop();
}
public void renderFarAwayTranslucent(J viewport, GlTexture depthBoundTexture) {
@@ -172,6 +183,7 @@ public class RenderService<T extends AbstractSectionRenderer<J, Q>, J extends Vi
this.modelService.addDebugData(debug);
this.renderGen.addDebugData(debug);
this.sectionRenderer.addDebug(debug);
this.nodeManager.addDebug(debug);
if (RenderStatistics.enabled) {
debug.add("HTC: [" + Arrays.stream(flipCopy(RenderStatistics.hierarchicalTraversalCounts)).mapToObj(Integer::toString).collect(Collectors.joining(", "))+"]");

View File

@@ -1,10 +1,8 @@
package me.cortex.voxy.client.core.rendering.hierachical;
import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap;
import it.unimi.dsi.fastutil.ints.Int2ObjectOpenHashMap;
import it.unimi.dsi.fastutil.ints.IntConsumer;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.ints.*;
import it.unimi.dsi.fastutil.longs.LongOpenHashSet;
import me.cortex.voxy.client.TimingStatistics;
import me.cortex.voxy.client.core.gl.GlBuffer;
import me.cortex.voxy.client.core.gl.shader.Shader;
import me.cortex.voxy.client.core.gl.shader.ShaderType;
@@ -15,12 +13,14 @@ import me.cortex.voxy.client.core.rendering.section.geometry.BasicSectionGeometr
import me.cortex.voxy.client.core.rendering.section.geometry.IGeometryData;
import me.cortex.voxy.client.core.rendering.util.UploadStream;
import me.cortex.voxy.common.Logger;
import me.cortex.voxy.common.util.AllocationArena;
import me.cortex.voxy.common.util.MemoryBuffer;
import me.cortex.voxy.common.world.WorldSection;
import org.lwjgl.system.MemoryUtil;
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.util.List;
import java.util.concurrent.ConcurrentLinkedDeque;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.LockSupport;
@@ -31,7 +31,6 @@ import static org.lwjgl.opengl.GL30C.glUniform1ui;
import static org.lwjgl.opengl.GL42C.GL_UNIFORM_BARRIER_BIT;
import static org.lwjgl.opengl.GL42C.glMemoryBarrier;
import static org.lwjgl.opengl.GL43C.*;
import static org.lwjgl.opengl.GL44.GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
//TODO: create an "async upload stream", that is, the upload stream is a raw mapped buffer pointer that can be written to
// which is then synced to the gpu on "render thread sync",
@@ -68,9 +67,6 @@ public class AsyncNodeManager {
private volatile SyncResults resultCache1 = new SyncResults();
private volatile SyncResults resultCache2 = new SyncResults();
//Yes. this is stupid. yes. it is a large amount of runtime. Is it profiler bias, probably
private final ConcurrentLinkedDeque<MemoryBuffer> buffersToFreeQueue = new ConcurrentLinkedDeque<>();
//locals for during iteration
private final IntOpenHashSet tlnIdChange = new IntOpenHashSet();//"Encoded" add/remove id, first bit indicates if its add or remove, 1 is add
@@ -156,15 +152,14 @@ public class AsyncNodeManager {
.add(ShaderType.COMPUTE, "voxy:util/scatter.comp")
.compile();
private void run() {
while (true) {
var buffer = this.buffersToFreeQueue.poll();
if (buffer == null) {
break;
}
buffer.free();
}
private final Shader multiMemcpy = Shader.make()
.define("INPUT_HEADER_BUFFER_BINDING", 0)
.define("INPUT_DATA_BUFFER_BINDING", 1)
.define("OUTPUT_BUFFER_BINDING", 2)
.add(ShaderType.COMPUTE, "voxy:util/memcpy.comp")
.compile();
private void run() {
if (this.workCounter.get() <= 0) {
LockSupport.park();
if (this.workCounter.get() <= 0 || !this.running) {//No work
@@ -229,7 +224,7 @@ public class AsyncNodeManager {
job.release();
} while (true);
final int UPLOAD_LIMIT = 200;
final int UPLOAD_LIMIT = 500;
for (int limit = 0; limit < UPLOAD_LIMIT/2; limit++) //Limit uploading, TODO: limit this by frame sync count, not here
{
var job = this.geometryUpdateQueue.poll();
@@ -357,8 +352,16 @@ public class AsyncNodeManager {
results.tlnDelta.addAll(this.tlnIdChange);
this.tlnIdChange.clear();
results.geometryUploads.putAll(this.geometryManager.getUploads());
this.geometryManager.getUploads().clear();//Put in new data into sync set
if (!this.geometryManager.getUploads().isEmpty()){//Put in new data into sync set
var iter = this.geometryManager.getUploads().int2ObjectEntrySet().fastIterator();
while (iter.hasNext()) {
var val = iter.next();
results.geometryUpload.upload(val.getIntKey(), val.getValue());
val.getValue().free();
}
this.geometryManager.getUploads().clear();
}
this.geometryManager.getHeapRemovals().clear();//We dont do removals on new data (as there is "none")
results.cleanerOperations.addAll(this.cleanerIdResetClear); this.cleanerIdResetClear.clear();
} else {
@@ -390,10 +393,7 @@ public class AsyncNodeManager {
var rem = this.geometryManager.getHeapRemovals();
var iter = rem.intIterator();
while (iter.hasNext()) {
var buffer = results.geometryUploads.remove(iter.nextInt());
if (buffer != null) {
buffer.free();
}
results.geometryUpload.remove(iter.nextInt());
}
rem.clear();
}
@@ -403,10 +403,8 @@ public class AsyncNodeManager {
var iter = add.int2ObjectEntrySet().fastIterator();
while (iter.hasNext()) {
var val = iter.next();
var prevBuffer = results.geometryUploads.put(val.getIntKey(), val.getValue());
if (prevBuffer != null) {
prevBuffer.free();
}
results.geometryUpload.upload(val.getIntKey(), val.getValue());
val.getValue().free();
}
add.clear();
}
@@ -450,7 +448,7 @@ public class AsyncNodeManager {
results.usedGeometry = this.geometryManager.getGeometryUsedBytes();
results.currentMaxNodeId = this.manager.getCurrentMaxNodeId();
this.needsWaitForSync |= results.geometryUploads.size() > UPLOAD_LIMIT;//Max of 200 uploads per frame :(
this.needsWaitForSync |= results.geometryUpload.currentElemCopyAmount*8L > 4L<<20;//4mb limit per frame
if (!RESULT_HANDLE.compareAndSet(this, null, results)) {
throw new IllegalArgumentException("Should always have null");
@@ -484,20 +482,35 @@ public class AsyncNodeManager {
store.setSectionCount(results.geometrySectionCount);
//Do geometry uploads
if (!results.geometryUploads.isEmpty()) {
var iter = results.geometryUploads.int2ObjectEntrySet().fastIterator();
while (iter.hasNext()) {
var val = iter.next();
var buffer = val.getValue();
UploadStream.INSTANCE.upload(store.getGeometryBuffer(), Integer.toUnsignedLong(val.getIntKey()) * 8L, buffer);
//Put the queue into the buffer queue to free... yes this is stupid that need todo this...
this.buffersToFreeQueue.add(buffer);//buffer.free();//Free the buffer was uploading
var upload = results.geometryUpload;
if (!upload.dataUploadPoints.isEmpty()) {
TimingStatistics.A.start();
int copies = upload.dataUploadPoints.size();
int scratchSize = (int) upload.arena.getSize() * 8;
long ptr = UploadStream.INSTANCE.rawUploadAddress(scratchSize + copies * 16);
MemoryUtil.memCopy(upload.scratchHeaderBuffer.address, UploadStream.INSTANCE.getBaseAddress() + ptr, copies * 16L);
MemoryUtil.memCopy(upload.scratchDataBuffer.address, UploadStream.INSTANCE.getBaseAddress() + ptr + copies * 16L, scratchSize);
UploadStream.INSTANCE.commit();//Commit the buffer
this.multiMemcpy.bind();
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 0, UploadStream.INSTANCE.getRawBufferId(), ptr, copies*16L);
glBindBufferRange(GL_SHADER_STORAGE_BUFFER, 1, UploadStream.INSTANCE.getRawBufferId(), ptr+copies*16L, scratchSize);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ((BasicSectionGeometryData) this.geometryData).getGeometryBuffer().id);
if (copies > 500) {
Logger.warn("Large amount of copies, lag will probably happen: " + copies);
}
UploadStream.INSTANCE.commit();
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
glDispatchCompute(copies, 1, 1);//Execute the copies
glMemoryBarrier(GL_SHADER_STORAGE_BARRIER_BIT);
TimingStatistics.A.stop();
}
}
TimingStatistics.B.start();
if (!results.scatterWriteLocationMap.isEmpty()) {//Scatter write
int count = results.scatterWriteLocationMap.size();//Number of writes, not chunks or uvec4 count
int chunks = (count+3)/4;
@@ -512,14 +525,17 @@ public class AsyncNodeManager {
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 1, nodeBuffer.id);
glBindBufferBase(GL_SHADER_STORAGE_BUFFER, 2, ((BasicSectionGeometryData) this.geometryData).getMetadataBuffer().id);
glUniform1ui(0, count);
glMemoryBarrier(GL_UNIFORM_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT|GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
glMemoryBarrier(GL_UNIFORM_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
glDispatchCompute((count+127)/128, 1, 1);
glMemoryBarrier(GL_UNIFORM_BARRIER_BIT|GL_SHADER_STORAGE_BARRIER_BIT);
}
TimingStatistics.B.stop();
TimingStatistics.C.start();
if (!results.cleanerOperations.isEmpty()) {
cleaner.updateIds(results.cleanerOperations);
}
TimingStatistics.C.stop();
this.currentMaxNodeId = results.currentMaxNodeId;
this.usedGeometryAmount = results.usedGeometry;
@@ -671,30 +687,28 @@ public class AsyncNodeManager {
if (RESULT_HANDLE.get(this) != null) {
var result = (SyncResults)RESULT_HANDLE.getAndSet(this, null);
result.geometryUploads.forEach((a,b)->b.free());
result.geometryUpload.free();
result.scatterWriteBuffer.free();
}
if (RESULT_CACHE_1_HANDLE.get(this) != null) {//Clear cache 1
var result = (SyncResults)RESULT_CACHE_1_HANDLE.getAndSet(this, null);
result.geometryUpload.free();
result.scatterWriteBuffer.free();
}
if (RESULT_CACHE_2_HANDLE.get(this) != null) {//Clear cache 2
var result = (SyncResults)RESULT_CACHE_2_HANDLE.getAndSet(this, null);
result.geometryUpload.free();
result.scatterWriteBuffer.free();
}
this.scatterWrite.free();
while (true) {
var buffer = this.buffersToFreeQueue.poll();
if (buffer == null) {
break;
}
buffer.free();
this.multiMemcpy.free();
}
/**
 * Appends a single "UC/GC" line to the debug output: the used geometry capacity and the total
 * geometry capacity, each integer-divided by 2^20.
 * NOTE(review): presumably both getters return byte counts, making the printed values MiB -- confirm.
 *
 * @param debug list the debug line is appended to
 */
public void addDebug(List<String> debug) {
    final int divisor = 1 << 20;
    var usedScaled = this.getUsedGeometryCapacity() / divisor;
    var capacityScaled = this.getGeometryCapacity() / divisor;
    debug.add("UC/GC: " + usedScaled + "/" + capacityScaled);
}
//Results object, which is to be synced between the render thread and worker thread
@@ -714,27 +728,25 @@ public class AsyncNodeManager {
//Deltas for geometry store
private int geometrySectionCount;
private long usedGeometry;
private final Int2ObjectOpenHashMap<MemoryBuffer> geometryUploads = new Int2ObjectOpenHashMap<>();
private final ComputeMemoryCopy geometryUpload = new ComputeMemoryCopy();
//Scatter writes for both geometry and node metadata
private MemoryBuffer scatterWriteBuffer = new MemoryBuffer(8192*2);
private final Int2IntOpenHashMap scatterWriteLocationMap = new Int2IntOpenHashMap(1024);
{this.scatterWriteLocationMap.defaultReturnValue(-1);}
//Cleaner operations
private final IntOpenHashSet cleanerOperations = new IntOpenHashSet();
public SyncResults() {
this.scatterWriteLocationMap.defaultReturnValue(-1);
}
public void reset() {
this.cleanerOperations.clear();
this.scatterWriteLocationMap.clear();
this.currentMaxNodeId = 0;
this.tlnDelta.clear();
this.geometrySectionCount = 0;
this.geometryUploads.clear();
this.usedGeometry = 0;
this.geometryUpload.reset();
}
//Get or create a scatter write address for the given location
@@ -775,4 +787,142 @@ public class AsyncNodeManager {
}
}
}
/**
 * CPU-side staging for a batch of memory copies that is later executed in one go.
 *
 * Two scratch buffers are maintained:
 *  - scratchDataBuffer: the payloads, placed at positions handed out by the arena. Positions and sizes
 *    are counted in 8 byte elements.
 *  - scratchHeaderBuffer: a densely packed array of 16 byte headers, one per pending copy. Each header
 *    holds three ints: [+0] element position of the payload inside scratchDataBuffer, [+4] the
 *    destination point, [+8] the payload size in elements. The last 4 bytes are never written here.
 *
 * dataUploadPoints maps a destination point to the index of its header (-1 when absent). Headers are
 * kept dense: removing one moves the last header into the freed slot.
 */
private static class ComputeMemoryCopy {
    //Bytes per header record, and the byte offsets of the destination and size ints inside a record
    private static final long HEADER_STRIDE = 16L;
    private static final long HEADER_DST_OFFSET = 4L;
    private static final long HEADER_SIZE_OFFSET = 8L;
    //Bytes per data element
    private static final long ELEM_BYTES = 8L;

    //Sum of the sizes (in elements) of all pending copies
    public int currentElemCopyAmount;
    private MemoryBuffer scratchHeaderBuffer = new MemoryBuffer(1<<16);
    private MemoryBuffer scratchDataBuffer = new MemoryBuffer(1<<20);
    private final AllocationArena arena = new AllocationArena();
    private final Int2IntOpenHashMap dataUploadPoints = new Int2IntOpenHashMap();//point -> header index
    {this.dataUploadPoints.defaultReturnValue(-1);}

    //Native address of the header record stored in the given slot
    private long headerAddress(int slot) {
        return this.scratchHeaderBuffer.address + slot * HEADER_STRIDE;
    }

    //Allocates a replacement of max(double the old size, requiredBytes), copies the old contents over,
    // frees the old buffer and hands back the replacement
    private static MemoryBuffer growBuffer(MemoryBuffer old, long requiredBytes, String logPrefix) {
        long newSize = Math.max(old.size*2, requiredBytes);
        Logger.info(logPrefix + newSize);
        var replacement = new MemoryBuffer(newSize);
        old.cpyTo(replacement.address);
        old.free();
        return replacement;
    }

    /**
     * Drops the pending copy targeting {@code point}, if there is one, returning its data range to the
     * arena and keeping the header array dense.
     *
     * @throws IllegalStateException if the header contents disagree with the arena or the point map
     */
    public void remove(int point) {
        final int slot = this.dataUploadPoints.remove(point);
        if (slot == -1) {
            //Nothing pending for this point
            return;
        }

        final long slotPtr = this.headerAddress(slot);
        final int elems = MemoryUtil.memGetInt(slotPtr + HEADER_SIZE_OFFSET);
        this.currentElemCopyAmount -= elems;

        //Return the payload range to the arena, then sanity check the header
        if (this.arena.free(MemoryUtil.memGetInt(slotPtr)) != elems) {
            throw new IllegalStateException("Freed memory not same size as expected");
        }
        if (MemoryUtil.memGetInt(slotPtr + HEADER_DST_OFFSET) != point) {
            throw new IllegalStateException("Destination not the same as point");
        }

        //The map has already shrunk by one, so its size is the index of the last occupied header
        final int lastSlot = this.dataUploadPoints.size();
        final long lastPtr = this.headerAddress(lastSlot);

        if (slot == lastSlot) {
            //Removed the tail header, nothing to move, just zero it
            MemoryUtil.memPutLong(lastPtr, 0L);
            MemoryUtil.memPutLong(lastPtr + 8, 0L);
            return;
        }

        //Fill the hole with the tail header
        final int tailPoint = MemoryUtil.memGetInt(lastPtr + HEADER_DST_OFFSET);
        if (this.dataUploadPoints.get(tailPoint) != lastSlot) {
            throw new IllegalStateException("ending header not pointing at end point");
        }
        final long tailLo = MemoryUtil.memGetLong(lastPtr);
        final long tailHi = MemoryUtil.memGetLong(lastPtr + 8);
        MemoryUtil.memPutLong(slotPtr, tailLo);
        MemoryUtil.memPutLong(slotPtr + 8, tailHi);
        MemoryUtil.memPutLong(lastPtr, 0L);
        MemoryUtil.memPutLong(lastPtr + 8, 0L);

        //The tail point now lives in the freed slot
        this.dataUploadPoints.put(tailPoint, slot);
    }

    /**
     * Stages {@code data} to be copied to {@code point}, replacing any copy already pending for that
     * point. The contents are copied into the scratch data buffer; this method does not free {@code data}.
     *
     * @throws IllegalStateException if the data size is not a multiple of 8 bytes, or internal state
     *                               is inconsistent
     */
    public void upload(int point, MemoryBuffer data) {
        if ((data.size % ELEM_BYTES) != 0) throw new IllegalStateException("Data must be of size multiple 8");
        final int elemSize = (int) (data.size / ELEM_BYTES);

        final int existing = this.dataUploadPoints.get(point);
        if (existing == -1) {
            this.stageNew(point, data, elemSize);
        } else {
            this.stageReplacement(existing, point, data, elemSize);
        }
    }

    //A copy is already pending for this point: keep its header slot, swap out the payload
    private void stageReplacement(int slot, int point, MemoryBuffer data, int elemSize) {
        final long headerPtr = this.headerAddress(slot);
        if (MemoryUtil.memGetInt(headerPtr + HEADER_DST_OFFSET) != point) {
            throw new IllegalStateException("Existing destination not the point");
        }

        final int previousSize = MemoryUtil.memGetInt(headerPtr + HEADER_SIZE_OFFSET);
        if (previousSize == elemSize) {
            //Same footprint, overwrite the old payload in place
            data.cpyTo(this.scratchDataBuffer.address + MemoryUtil.memGetInt(headerPtr) * ELEM_BYTES);
            return;
        }

        //Different footprint: release the old range and grab a fresh one
        if (this.arena.free(MemoryUtil.memGetInt(headerPtr)) != previousSize) {
            throw new IllegalStateException("Freed allocation not size as expected");
        }
        this.currentElemCopyAmount += elemSize - previousSize;

        final int alloc = this.allocScratchDataPos(elemSize);
        //The data buffer address is read after the allocation since the allocation may replace the buffer
        data.cpyTo(this.scratchDataBuffer.address + alloc * ELEM_BYTES);

        MemoryUtil.memPutInt(headerPtr, alloc);
        MemoryUtil.memPutInt(headerPtr + HEADER_SIZE_OFFSET, elemSize);
    }

    //No copy pending for this point: append a header at the end of the header array
    private void stageNew(int point, MemoryBuffer data, int elemSize) {
        final int slot = this.dataUploadPoints.size();
        this.dataUploadPoints.put(point, slot);

        final long slotOffset = slot * HEADER_STRIDE;
        if (this.scratchHeaderBuffer.size <= slotOffset) {
            //Header array is full, grow it before taking any address into it
            this.scratchHeaderBuffer = growBuffer(this.scratchHeaderBuffer, slotOffset, "Resizing scratch header buffer to: ");
        }
        final long headerPtr = this.scratchHeaderBuffer.address + slotOffset;

        this.currentElemCopyAmount += elemSize;
        final int alloc = this.allocScratchDataPos(elemSize);
        data.cpyTo(this.scratchDataBuffer.address + alloc * ELEM_BYTES);

        MemoryUtil.memPutInt(headerPtr, alloc);
        MemoryUtil.memPutInt(headerPtr + HEADER_DST_OFFSET, point);
        MemoryUtil.memPutInt(headerPtr + HEADER_SIZE_OFFSET, elemSize);
    }

    //Allocates size elements from the arena, growing the scratch data buffer when the range does not fit.
    // Returns the element position of the allocation
    private int allocScratchDataPos(int size) {
        final int pos = (int) this.arena.alloc(size);
        final long endBytes = (pos + size) * ELEM_BYTES;
        if (this.scratchDataBuffer.size <= endBytes) {
            this.scratchDataBuffer = growBuffer(this.scratchDataBuffer, endBytes, "Resizing scratch data buffer to: ");
        }
        return pos;
    }

    //Forgets every pending copy; the scratch buffers stay allocated and their contents are left untouched
    public void reset() {
        this.arena.reset();
        this.dataUploadPoints.clear();
        this.currentElemCopyAmount = 0;
    }

    //Releases both scratch buffers. The fields are nulled, so the instance must not be used afterwards
    public void free() {
        this.scratchHeaderBuffer.free();
        this.scratchHeaderBuffer = null;
        this.scratchDataBuffer.free();
        this.scratchDataBuffer = null;
    }
}
}

View File

@@ -0,0 +1,28 @@
#version 460 core
//Batched memcpy: every work group performs one copy job described by one entry of the header buffer.
// The *_BINDING macros are not defined in this file, they are supplied by the host when the shader is built.

//Number of invocations per work group, also the stride each invocation advances by inside a job
#define WORK_SIZE 256
layout(local_size_x=WORK_SIZE) in;
//Header data about destination, size and location of what is being copied (NOTE: can probably make it a uvec2?)
// x = index of the first source element in dataInBuffer
// y = index of the first destination element in outputBuffer
// z = number of elements to copy
// w = not read by this shader
layout(binding = INPUT_HEADER_BUFFER_BINDING, std430) restrict readonly buffer InputHeaderBuffer {
uvec4[] dataCopyHeader;
};
//Source data for all the jobs, addressed in uvec2 (8 byte) elements
layout(binding = INPUT_DATA_BUFFER_BINDING, std430) restrict readonly buffer InputDataBuffer {
uvec2[] dataInBuffer;
};
//Destination of the copies, addressed in uvec2 (8 byte) elements
layout(binding = OUTPUT_BUFFER_BINDING, std430) restrict writeonly buffer OutputBuffer {
uvec2[] outputBuffer;
};
//One work group handles one copy job; its invocations interleave over the job's elements
void main() {
    //Job selected by the work group index: x = source start, y = destination start, z = element count
    uvec4 header = dataCopyHeader[gl_WorkGroupID.x];
    uint srcBase = header.x;
    uint dstBase = header.y;
    uint count   = header.z;

    //Each invocation starts at its local index and steps by the work group size until past the end
    uint elem = gl_LocalInvocationID.x;
    while (elem < count) {
        outputBuffer[dstBase + elem] = dataInBuffer[srcBase + elem];
        elem += WORK_SIZE;
    }
}