This commit is contained in:
mcrcortex
2024-06-11 23:06:24 +10:00
parent 584028ae7a
commit 25ddb83d22
18 changed files with 882 additions and 158 deletions

View File

@@ -14,7 +14,7 @@ public class GlPersistentMappedBuffer extends TrackedObject {
this.id = glCreateBuffers(); this.id = glCreateBuffers();
this.size = size; this.size = size;
glNamedBufferStorage(this.id, size, GL_CLIENT_STORAGE_BIT|GL_MAP_PERSISTENT_BIT|(flags&(GL_MAP_COHERENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_READ_BIT))); glNamedBufferStorage(this.id, size, GL_CLIENT_STORAGE_BIT|GL_MAP_PERSISTENT_BIT|(flags&(GL_MAP_COHERENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_READ_BIT)));
this.addr = nglMapNamedBufferRange(this.id, 0, size, flags|GL_MAP_PERSISTENT_BIT); this.addr = nglMapNamedBufferRange(this.id, 0, size, (flags&(GL_MAP_WRITE_BIT|GL_MAP_READ_BIT|GL_MAP_UNSYNCHRONIZED_BIT|GL_MAP_FLUSH_EXPLICIT_BIT))|GL_MAP_PERSISTENT_BIT);
} }
@Override @Override

View File

@@ -0,0 +1,12 @@
package me.cortex.voxy.client.core.gl.shader;
import java.util.regex.Pattern;
/**
 * Placeholder shader pre-processor for a "generics" syntax.
 *
 * <p>The two patterns below describe the syntax this processor is meant to recognise
 * (a {@code #defineGen Name<T>} directive and a {@code Type<T>} usage), but no expansion
 * is implemented yet, so the source is currently passed through unchanged.
 */
public class GenericsProcessor implements IShaderProcessor {
    /** Matches a generic definition directive such as {@code #defineGen Name<T>}. */
    private static final Pattern GENERIC_DEFINE = Pattern.compile("#defineGen (?<name>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");

    /** Matches a generic usage such as {@code Type<T>}. */
    private static final Pattern GENERIC_USE = Pattern.compile("(?<type>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");

    /**
     * Processes the shader source.
     *
     * @param type   the shader stage the source belongs to
     * @param source the shader source text
     * @return the processed source; until expansion is implemented this is {@code source} itself.
     *         Returning {@code null} (as the stub previously did) would hand a null source to
     *         whatever consumes the processor output, e.g. the next processor in a chain.
     */
    @Override
    public String process(ShaderType type, String source) {
        //TODO: implement generic expansion using GENERIC_DEFINE / GENERIC_USE
        return source;
    }
}

View File

@@ -3,8 +3,7 @@ package me.cortex.voxy.client.core.gl.shader;
import me.cortex.voxy.common.util.TrackedObject; import me.cortex.voxy.common.util.TrackedObject;
import org.lwjgl.opengl.GL20C; import org.lwjgl.opengl.GL20C;
import java.util.HashMap; import java.util.*;
import java.util.Map;
import java.util.stream.Collectors; import java.util.stream.Collectors;
import static org.lwjgl.opengl.GL20.glDeleteProgram; import static org.lwjgl.opengl.GL20.glDeleteProgram;
@@ -16,8 +15,15 @@ public class Shader extends TrackedObject {
id = program; id = program;
} }
public static Builder make(IShaderProcessor processor) { public static Builder make(IShaderProcessor... processors) {
return new Builder(processor); List<IShaderProcessor> aa = new ArrayList<>(List.of(processors));
Collections.reverse(aa);
IShaderProcessor applicator = (type,source)->source;
for (IShaderProcessor processor : processors) {
IShaderProcessor finalApplicator = applicator;
applicator = (type, source) -> finalApplicator.process(type, processor.process(type, source));
}
return new Builder(applicator);
} }
public static Builder make() { public static Builder make() {

View File

@@ -0,0 +1,50 @@
package me.cortex.voxy.client.core.rendering;
import me.cortex.voxy.client.core.gl.GlBuffer;
import me.cortex.voxy.client.core.gl.shader.Shader;
import me.cortex.voxy.client.core.gl.shader.ShaderType;
import me.cortex.voxy.client.core.rendering.hierarchical.NodeManager;
import me.cortex.voxy.common.util.HierarchicalBitSet;
import static org.lwjgl.opengl.GL42C.*;
import static org.lwjgl.opengl.GL43C.GL_SHADER_STORAGE_BARRIER_BIT;
import static org.lwjgl.opengl.GL43C.glDispatchCompute;
import static org.lwjgl.opengl.GL44.GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
/**
 * Drives one frame of hierarchical occlusion selection: builds the Hi-Z mip chain,
 * lets the node manager run its upload/download phases and then dispatches the
 * hierarchical selector compute shader.
 */
public class HierarchicalOcclusionRenderer {
    //Number of workgroups dispatched along x; should be enough to saturate the gpu efficiently
    private final int dispatchWidth;
    private final NodeManager nodeManager = new NodeManager(null);
    private final HiZBuffer hiz = new HiZBuffer();
    private final Shader selectorShader = Shader.make()
            .add(ShaderType.COMPUTE, "voxy:lod/hierarchical/selector.comp")
            .compile();

    /**
     * @param workgroup_size number of workgroups to dispatch along the x axis each frame
     */
    public HierarchicalOcclusionRenderer(int workgroup_size) {
        this.dispatchWidth = workgroup_size;
    }

    //Binds the resources used by the selector shader (nothing is bound yet)
    private void bind() {
    }

    /**
     * Runs the per-frame selection pass.
     *
     * @param depthBuffer the depth source handed to the Hi-Z builder
     * @param width       width passed to the Hi-Z builder
     * @param height      height passed to the Hi-Z builder
     */
    public void render(int depthBuffer, int width, int height) {
        //Build the hiz mip chain
        this.hiz.buildMipChain(depthBuffer, width, height);

        //Node upload phase, then the download phase
        // (download pulls from the previous frame, which should maybe result in lower latency,
        //  and also clears and resets the queues)
        this.nodeManager.uploadPhase();
        this.nodeManager.downloadPhase();

        //Bind all the resources, then the hierarchical selection shader
        this.bind();
        this.selectorShader.bind();

        //Barrier before the dispatch
        final int preDispatchBarrier = GL_SHADER_STORAGE_BARRIER_BIT
                | GL_UNIFORM_BARRIER_BIT
                | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT
                | GL_FRAMEBUFFER_BARRIER_BIT;
        glMemoryBarrier(preDispatchBarrier);

        //Emit enough work to fully populate the gpu
        glDispatchCompute(this.dispatchWidth, 1, 1);

        //Barrier after the dispatch
        final int postDispatchBarrier = GL_SHADER_STORAGE_BARRIER_BIT
                | GL_COMMAND_BARRIER_BIT
                | GL_UNIFORM_BARRIER_BIT;
        glMemoryBarrier(postDispatchBarrier);
    }
}

View File

@@ -0,0 +1,15 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
import me.cortex.voxy.client.core.rendering.building.BuiltSection;
import java.util.function.Consumer;
//Interface for the node manager to interact with the outside world
// All positions are passed as a single long
public interface INodeInteractor {
//Marks pos as watching for updates, i.e. any LoD updates will trigger a callback
void watchUpdates(long pos);
//Unmarks a position for updates
void unwatchUpdates(long pos);
//Explicitly requests a mesh at a position, runs the callback
void requestMesh(long pos);
//Sets the callback that receives built section meshes
void setMeshUpdateCallback(Consumer<BuiltSection> mesh);
}

View File

@@ -0,0 +1,12 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
//Queries about nodes, each node being identified by an int
public interface ITrimInterface {
//Last recorded/known use time of a node's mesh, returns -1 if the node doesn't have a mesh
int lastUsedTime(int node);
//Returns an integer with the bottom 24 bits being the ptr and the top 8 bits being the count (or something similar)
// NOTE(review): the exact packing is not settled by this declaration, confirm against the implementation
int getChildren(int node);
//Returns the size of the node's mesh, -1 if the node doesn't have a mesh
int getNodeSize(int node);
}

View File

@@ -0,0 +1,5 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
//Uses a persistently mapped coherent buffer with off-thread polling to pull in requests
// NOTE: currently an empty placeholder, none of the above is implemented yet
public class NodeLoadSystem {
}

View File

@@ -0,0 +1,219 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import me.cortex.voxy.client.core.gl.GlBuffer;
import me.cortex.voxy.client.core.rendering.building.BuiltSection;
import me.cortex.voxy.client.core.rendering.util.DownloadStream;
import me.cortex.voxy.common.util.HierarchicalBitSet;
import org.lwjgl.system.MemoryUtil;
import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT;
import static org.lwjgl.opengl.GL30.GL_R32UI;
import static org.lwjgl.opengl.GL30C.GL_RED_INTEGER;
import static org.lwjgl.opengl.GL45.nglClearNamedBufferSubData;
/**
 * CPU side bookkeeping for the hierarchical node graph.
 *
 * <p>Owns the GPU node buffer, the GPU request queue and a local mirror of the node data, and
 * decodes the requests downloaded from the GPU once per frame.
 *
 * <p>Node layout (GPU side, 16 bytes / 128 bits):
 * <ul>
 *   <li>a position (64 bit)</li>
 *   <li>a ptr to its own mesh (24 bit)</li>
 *   <li>a ptr to children nodes (24 bit)</li>
 *   <li>flags (16 bit)</li>
 * </ul>
 * The first 2 flag bits are a requested dispatch type (0 meaning no request and the 3 remaining
 * states for different request types); this ensures that over multiple frames the same node is not
 * requested. Bits exist for whether or not the children have meshes loaded or if the parents have
 * meshes loaded; the idea is to keep +-1 lod meshes loaded into vram to enable seamless
 * transitioning. The only critical state is that if a mesh wants to be rendered it should be able
 * to be rendered.
 *
 * <p>Basically, there are multiple things, it depends on the screensize error:
 * <ul>
 *   <li>if a node is close to needing its children loaded but they aren't, then request it but
 *       with a lower priority</li>
 *   <li>if a node must have its children then request at a high priority</li>
 *   <li>if a node doesn't have a mesh but all its children do, then dispatch a medium priority
 *       request to have its own mesh loaded, but then just use the child meshes for rendering</li>
 * </ul>
 */
public class NodeManager {
    public static final int MAX_NODE_COUNT = 1<<22;
    public static final int MAX_REQUESTS = 1024;

    //Byte size of the request queue buffer: a 4 byte counter followed by MAX_REQUESTS uint payloads
    //TODO: add a priority system
    private static final int REQUEST_QUEUE_SIZE = 4 + MAX_REQUESTS * 4;

    //Request types, stored in the low 2 bits of the request flag byte
    private static final int REQUEST_SELF = 0;
    private static final int REQUEST_CHILDREN = 1;

    //Value returned by pos2meshId when a position has no entry
    private static final int NO_NODE = -1;

    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(MAX_NODE_COUNT);
    private final GlBuffer nodeBuffer = new GlBuffer(MAX_NODE_COUNT*16);//Node size is 16 bytes

    //TODO: maybe make this a coherent persistent mapped read buffer, instead of download synced buffer copy thing
    //A request payload is a single uint, the top 8 bits are flags followed by a 24 bit node identifier
    // (e.g. load child nodes, load child nodes + meshes, load self meshes)
    private final GlBuffer requestQueue = new GlBuffer(REQUEST_QUEUE_SIZE);

    //Buffer containing the index of the root nodes
    private final GlBuffer roots = new GlBuffer(1024*4);

    //TODO: SEE IF CAN SHRINK IT BY EITHER NOT NEEDING AS MUCH SPACE or reducing max node count
    //3 longs per node, i.e. 1.5x the size of the gpu copy, to store extra metadata
    //LocalNodes have an up value pointing to the parent, enabling full traversal
    private final long[] localNodes = new long[MAX_NODE_COUNT * 3];

    private final INodeInteractor interactor;

    /**
     * @param interactor bridge used to watch positions and request meshes; it is dereferenced when
     *                   a self-mesh request is processed
     */
    public NodeManager(INodeInteractor interactor) {
        this.interactor = interactor;
        this.pos2meshId.defaultReturnValue(NO_NODE);
    }

    //Returns true if the node has its own mesh loaded
    // NOTE: not implemented yet, always reports that no mesh is loaded
    private static boolean nodeHasMeshLoaded(long metaA, long metaB) {
        return false;
    }

    //A node can be loaded in the tree but have no mesh associated with it
    // this is so that higher level nodes don't waste mesh space

    //The reason that nodes have both child and own mesh pointers
    // is so that on an edge of the screen or when moving, nodes aren't constantly being swapped back and forth
    // it basically acts as an inline cache, however it does present some pain points
    // especially in managing the graph

    //It might be easier to have nodes strictly either point to child nodes or meshes
    // if a parent needs to be rendered instead of the child, request for node change to self
    // while this will generate a lot more requests it should be a lot easier to manage graph wise
    // can probably add a caching service via a compute shader that ingests a request list
    // sees if the requested nodes are already cached, if so swap them in, otherwise dispatch a request
    // to cpu

    /**
     * Decodes the downloaded request queue and dispatches each request.
     *
     * <p>The region starts with an int request counter followed by one int per request; each
     * request carries its flags in the top 8 bits and the node id in the low 24 bits.
     *
     * @param ptr  address of the downloaded copy of the request queue
     * @param size size supplied by the download callback (currently unused)
     * @throws IllegalStateException    if a request references a node id outside the node table, or
     *                                  a self-mesh request arrives for a node whose mesh is loaded
     * @throws IllegalArgumentException if the request type is unknown
     */
    private void processRequestQueue(long ptr, long size) {
        int count = MemoryUtil.memGetInt(ptr);
        ptr += 4;
        //The queue only has room for MAX_REQUESTS payloads, so never read past that even if the
        // counter reports more (a negative value is an unsigned counter above 2^31)
        if (count < 0 || count > MAX_REQUESTS) {
            count = MAX_REQUESTS;
        }

        for (int i = 0; i < count; i++) {
            int request = MemoryUtil.memGetInt(ptr + i*4L);
            //Flags live in the top byte and must be shifted down before the type bits are tested
            int args = request >>> 24;
            int nodeId = request&(0xFFFFFF);
            if (nodeId >= MAX_NODE_COUNT) {
                throw new IllegalStateException("Request references a node id outside the node table: " + nodeId);
            }

            long pos = this.localNodes[nodeId*3];
            long metaA = this.localNodes[nodeId*3 + 1];
            long metaB = this.localNodes[nodeId*3 + 2];

            //2 bits for future request types such as parent and ensure stable (i.e. both parent and child loaded)
            int type = args&0b11;
            if (type == REQUEST_SELF) {
                //Requires own mesh loaded (it can have 2 different priorities, it can fallback to using its children to render if they are loaded)
                // else it is critical priority
                if (nodeHasMeshLoaded(metaA, metaB)) {
                    throw new IllegalStateException("Node requested a mesh load, but mesh is already loaded: " + pos);
                }

                //watch the mesh and request it
                this.interactor.watchUpdates(pos);
                this.interactor.requestMesh(pos);
            } else if (type == REQUEST_CHILDREN) {
                //Node requires children to be loaded NOTE: when this is the case, it doesn't just mean the nodes,
                // it means the meshes as well,
                // meshes may be unloaded later

                //when this case is hit it means that the child nodes aren't even loaded, so it becomes a bit more complex
                // basically, need to request all child nodes be loaded in a batch
                // then in the upload tick need to update many things
            } else {
                throw new IllegalArgumentException("Unknown update type: " + type + " @pos:" + pos);
            }
        }
    }

    /**
     * Upload point, called once per frame. Not implemented yet; the notes below describe the plan.
     */
    public void uploadPhase() {
        //All uploads

        //Have a set of upload tasks for nodes,
        // this could include updating the mesh ptr
        // or child ptr or uploading new nodes
        // NOTE: when uploading a set of new nodes (must be clustered as children)
        //  have to update parent
        //  same when removing a set of children

        //Note: child node upload tasks need to all be complete before they can be uploaded

        //The way the graph works and can be cut is that all the leaf nodes _must_ at all times contain a mesh
        // this is critical to prevent "cracks"/no geometry being rendered
        // when the render mesh buffer is "full" (or even just periodically), trimming of the tree must occur to keep
        // size within reason
        //Note though that there is a feedback delay and such so geometry buffer should probably be trimmed when it reaches
        // 80-90% capacity so that new geometry can still be uploaded without being blocked on geometry clearing
        // it becomes a critical error if the geometry buffer becomes full while the tree is fully trimmed

        //NOTE: while trimming the tree, need to also trim the parents down i.e. the top level should really not have its mesh
        // loaded while it isn't really ever used
        // however as long as the rule that all leaf nodes have a mesh loaded is held then there should never be
        // any geometry holes
    }

    /**
     * Download point, called once per frame: queues a download of the whole request queue (handled
     * by {@link #processRequestQueue}), commits it, then zeroes the 4 byte request counter.
     */
    public void downloadPhase() {
        DownloadStream.INSTANCE.download(this.requestQueue, 0, REQUEST_QUEUE_SIZE, this::processRequestQueue);
        DownloadStream.INSTANCE.commit();
        //Clear the queue counter, TODO: maybe do it some other way to batch clears
        nglClearNamedBufferSubData(this.requestQueue.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0);

        //TODO: compute cleanup here of loaded nodes, and what needs to be uploaded
        // i.e. if there is more upload stuff than there is free memory, cull nodes in the tree
        // to fit upload points, can also create errors if all nodes in the tree are requested but no memory to put
    }

    //Inserts a top level node into the graph, it has geometry and no children loaded as it is a leaf node
    // NOTE: not implemented yet
    public void insertTopLevelNode(long position) {
    }

    //Removes a top level node from the graph, doing so also removes all child nodes and associated geometry
    // the allocated slots when removing nodes are stored and roped off until it is guaranteed that all requests have
    // passed
    // NOTE: not implemented yet
    public void removeTopLevelNode(long position) {
    }

    //Tracking for nodes that specifically need meshes, if a node doesn't have or doesn't need a mesh node, it is not in the map
    // the map should be identical to the currently watched set of sections
    //NOTE: that if the id is negative it is part of a mesh request
    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();

    //Need to make this system attached with a batched worker system, since a mesh update can be a few things
    // it can be a mesh update of a tracked render section, in this case we must ensure that it is still tracked and hasn't been removed
    //  if it is still valid and tracked then upload it and update the node as well, ensuring sync
    // if it was part of a request, then we need to first check that the request still exists and hasn't been discarded
    //  A) probably upload it immediately still
    //  B) set the request with that section to have been uploaded and the mesh set (note if the mesh was updated while
    //     a request was in progress/other requests not fulfilled, need to remove the old and replace with the updated)
    //  if all the meshes in the request are satisfied, upload the request nodes and update its parent
    // NOTE! batch requests where this is needed are only strictly required when children are requested, in order to
    //  guarantee that the property that leaf nodes must have meshes remains
    //(TODO: see when sync with main thread should be, in the renderer or here since the updates are dispatched off-thread)
    // Note that the geometry buffer should have maybe 20% free, that way meshes can always be inserted (same for the
    //  node buffer, maybe 10%), need to experiment
    //  if the buffer goes over this threshold, the tree/graph culler must start culling last/least used nodes somehow
    //  it should be an error if the geometry or node buffer fills up but there are no nodes/meshes to cull/remove

    /**
     * Handles a freshly built section mesh. Meshes for positions that are not tracked are freed;
     * handling of tracked meshes is not implemented yet.
     *
     * @param mesh the built section; freed here when its position is not tracked
     */
    public void meshUpdate(BuiltSection mesh) {
        int id = this.pos2meshId.get(mesh.position);
        if (id == NO_NODE) {
            //The built mesh section is no longer needed, discard it
            // TODO: could probably cache the mesh in ram, that way if it is requested it can be immediately fetched while a newer mesh is built
            mesh.free();
            return;
        }
        if ((id&(1<<31))!=0) {
            //The mesh is part of a batched request, strip the marker bit
            id = id^(1<<31);

        } else {
            //The mesh is an update for an existing node

            //this.localNodes[id*3]
        }
    }
}

View File

@@ -0,0 +1,103 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
import me.cortex.voxy.client.core.rendering.building.BuiltSection;
import me.cortex.voxy.client.core.rendering.util.MarkedObjectList;
/**
 * Second iteration of the node manager: tracks which positions need meshes and routes built
 * section meshes either to a pending leaf request or to an existing node.
 */
public class NodeManager2 {
    /** A request for creating new child nodes. */
    private static final class LeafRequest {
        //LoD position identifier
        public long position;

        //Node id of the node the leaf request is for, note! While there is a leaf request, the node should not be unloaded or removed
        public int nodeId;

        //The mask of what child nodes are required
        public byte requiredChildMask;

        //The mask of currently supplied child node data
        public byte currentChildMask;

        //Reset/clear the request so that it may be reused (no-op for now)
        public void clear() {
        }
    }

    public static final int MAX_NODE_COUNT = 1<<22;

    //Sentinel returned by pos2meshId for untracked positions
    private static final int NO_NODE = -1;

    //Local data layout (3 longs per node)
    // first long is position (todo! might not be needed)
    // next long contains mesh position/id
    private final long[] localNodeData = new long[MAX_NODE_COUNT * 3];

    private final INodeInteractor interactor;

    /**
     * @param interactor outside-world bridge; this manager registers itself as its mesh update callback
     */
    public NodeManager2(INodeInteractor interactor) {
        this.interactor = interactor;
        this.pos2meshId.defaultReturnValue(NO_NODE);
        this.interactor.setMeshUpdateCallback(this::meshUpdate);
    }

    //Not implemented yet
    public void insertTopLevelNode(long position) {
    }

    //Not implemented yet
    public void removeTopLevelNode(long position) {
    }

    //Returns the mesh offset/id for the given node or -1 if it doesn't exist (currently always -1)
    private int getMeshForNode(int node) {
        return -1;
    }

    //Tracking for nodes that specifically need meshes, if a node doesn't have or doesn't need a mesh node, it is not in the map
    // the map should be identical to the currently watched set of sections
    //NOTE: if the id is negative it is part of a mesh request
    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();

    //The request queue should be some array that can reuse objects to avoid gc pressure, plus a bitset to find an available free slot
    // a hashmap might work bar the gc overhead
    private final MarkedObjectList<LeafRequest> leafRequests = new MarkedObjectList<>(LeafRequest[]::new, LeafRequest::new);

    /**
     * Callback for built section meshes.
     *
     * @param mesh the built section; freed here if its position is no longer tracked
     */
    private void meshUpdate(BuiltSection mesh) {
        final int mapped = this.pos2meshId.get(mesh.position);

        //The built mesh section is no longer needed, discard it
        // TODO: could probably cache the mesh in ram, that way if it is requested it can be immediately fetched while a newer mesh is built
        if (mapped == NO_NODE) {
            mesh.free();
            return;
        }

        //The sign bit marks an id that belongs to a batched request
        if (mapped < 0) {
            //Clear the marker bit to recover the request key
            final int requestKey = mapped ^ (1<<31);

            //There are a few cases for this branch
            // the section could be replacing an existing mesh that is part of the request (due to an update)
            // the section mesh could be new to the request
            //  in this case the section mesh could be the last entry needed to satisfy the request
            //  in which case we must either mark the request as ready to be uploaded
            //   and then upload it after all the mesh updates are processed, or upload it immediately

            //The lower 3 bits of the key specify the quadrant (8 positions) of the node in the request
            LeafRequest pending = this.leafRequests.get(requestKey >> 3);
            return;
        }

        //The mesh is an update for an existing node
        final int previousMesh = this.getMeshForNode(mapped);
        if (previousMesh == -1) {
            //Node didn't have a mesh attached, so just set the current mesh
        } else {
            //Node has a mesh attached, remove and replace it
        }
    }
}

View File

@@ -0,0 +1,23 @@
package me.cortex.voxy.client.core.rendering.hierarchical;
//System to determine what nodes to remove from the hierarchical tree while retaining the property that all
// leaf nodes should have meshes
//This system is critical to prevent the geometry buffer from growing too large or too many nodes from filling up
// the node system
public class TreeTrimmer {
//Used to interact with the outside world
private final ITrimInterface trimInterface;
//Stores the interface used to query nodes
public TreeTrimmer(ITrimInterface trimInterface) {
this.trimInterface = trimInterface;
}
//Computes which nodes should be trimmed
// NOTE: not implemented yet, the notes below describe the intended strategy
public void computeTrimPoints() {
//Do a bfs to find ending points to trim, needs to be based on some "last used" metric
//First strategy is to compute a bfs and/or generate a list of nodes sorted by last use time
// the thing is that if we cull a mesh, it cannot be a leaf node
// if it is a leaf node its parent node must have a mesh loaded
}
}

View File

@@ -84,7 +84,7 @@ public class DownloadStream {
for (var entry : this.downloadList) { for (var entry : this.downloadList) {
glCopyNamedBufferSubData(entry.target.id, this.downloadBuffer.id, entry.targetOffset, entry.downloadStreamOffset, entry.size); glCopyNamedBufferSubData(entry.target.id, this.downloadBuffer.id, entry.targetOffset, entry.downloadStreamOffset, entry.size);
} }
thisFrameDownloadList.addAll(this.downloadList); this.thisFrameDownloadList.addAll(this.downloadList);
this.downloadList.clear(); this.downloadList.clear();
this.caddr = -1; this.caddr = -1;

View File

@@ -0,0 +1,51 @@
package me.cortex.voxy.client.core.rendering.util;
import it.unimi.dsi.fastutil.ints.Int2ObjectFunction;
import me.cortex.voxy.common.util.HierarchicalBitSet;
import java.util.function.Supplier;
/**
 * A growable pool of reusable objects addressed by integer id.
 *
 * <p>Ids are handed out by a {@link HierarchicalBitSet}; the object stored at an id is created
 * lazily on first allocation and kept when the id is released, so that it can be reused.
 *
 * @param <T> type of the pooled objects
 */
public class MarkedObjectList<T> {
    //The backing array grows by this fraction of its current length when it is too small
    private static final float GROWTH_FACTOR = 0.75f;

    //Largest limit HierarchicalBitSet accepts (64^4). The previous limit of -1 made every
    // allocateNext() call fail with -2, which allocate() then used as an array index
    private static final int MAX_OBJECTS = 1 << (6 * 4);

    private final Int2ObjectFunction<T[]> arrayGenerator;
    private final Supplier<T> nullSupplier;
    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(MAX_OBJECTS);
    private T[] objects;//Should maybe make a getter function instead

    /**
     * @param arrayGenerator creates a backing array of the requested length
     * @param nullSupplier   creates the object for a slot that has never been populated
     */
    public MarkedObjectList(Int2ObjectFunction<T[]> arrayGenerator, Supplier<T> nullSupplier) {
        this.arrayGenerator = arrayGenerator;
        this.nullSupplier = nullSupplier;
        this.objects = this.arrayGenerator.apply(16);
    }

    /**
     * Reserves an unused id, growing the backing array and creating the slot's object if needed.
     *
     * @return the allocated id
     * @throws IllegalStateException if the bitset has no id left to hand out
     */
    public int allocate() {
        //Gets an unused id for some entry in objects, if its null fill it
        int id = this.bitSet.allocateNext();
        if (id < 0) {
            //allocateNext signals failure with a negative code, never use it as an index
            throw new IllegalStateException("Unable to allocate a new id, bitset returned " + id);
        }
        if (this.objects.length <= id) {
            //Resize and copy over the objects array, always making sure the new id fits
            int grown = this.objects.length + (int)Math.ceil(this.objects.length*GROWTH_FACTOR);
            int newLen = Math.max(grown, id + 1);
            T[] newArr = this.arrayGenerator.apply(newLen);
            System.arraycopy(this.objects, 0, newArr, 0, this.objects.length);
            this.objects = newArr;
        }
        if (this.objects[id] == null) {
            this.objects[id] = this.nullSupplier.get();
        }
        return id;
    }

    /**
     * Releases an id so that it may be handed out again; the object in the slot is kept.
     *
     * @param id the id to release
     * @throws IllegalArgumentException if the id was not allocated
     */
    public void release(int id) {
        if (!this.bitSet.free(id)) {
            throw new IllegalArgumentException("Index " + id + " was already released");
        }
    }

    /**
     * Returns the object stored at an allocated id.
     *
     * @param index the id to look up
     * @return the object in that slot
     * @throws IllegalArgumentException if the id is not currently allocated
     */
    public T get(int index) {
        //Make the checking that index is allocated optional, as it might cause overhead due to multiple cacheline misses
        if (!this.bitSet.isSet(index)) {
            throw new IllegalArgumentException("Index " + index + " is not allocated");
        }
        return this.objects[index];
    }
}

View File

@@ -0,0 +1,72 @@
package me.cortex.voxy.common.util;
/**
 * Fixed capacity (64^4 entries) bitset that always allocates the lowest free index.
 *
 * <p>The bits are stored in {@code D}. Each higher level summarises the level below it: a bit in
 * {@code C} is set when the matching 64 bit word of {@code D} is completely full, a bit in
 * {@code B} when the matching word of {@code C} is full, and a bit in {@code A} when the matching
 * word of {@code B} is full. Finding a free index is therefore four trailing-zero scans.
 */
public class HierarchicalBitSet {
    //Maximum number of indices that may be allocated at once
    private final int limit;
    //Number of currently allocated indices
    private int cnt;
    private long A = 0;
    private final long[] B = new long[64];
    private final long[] C = new long[64*64];
    private final long[] D = new long[64*64*64];

    /**
     * @param limit maximum number of simultaneously allocated indices, at most 64^4; a limit below
     *              one makes every allocation fail with -2
     * @throws IllegalArgumentException if the limit exceeds the capacity of 64^4
     */
    public HierarchicalBitSet(int limit) {//Fixed size of 64^4
        this.limit = limit;
        if (limit > (1<<(6*4))) {
            throw new IllegalArgumentException("Limit greater than capacity");
        }
    }

    /**
     * Allocates the lowest free index.
     *
     * @return the allocated index, -1 if the whole set is full, or -2 if the limit was reached
     */
    public int allocateNext() {
        if (this.A==-1) {
            return -1;
        }
        if (this.cnt+1>this.limit) {
            return -2;//Limit reached
        }
        //Walk down the levels, at each one picking the first word that is not full
        int idx = Long.numberOfTrailingZeros(~this.A);
        long bp = this.B[idx];
        idx = Long.numberOfTrailingZeros(~bp) + 64*idx;
        long cp = this.C[idx];
        idx = Long.numberOfTrailingZeros(~cp) + 64*idx;
        long dp = this.D[idx];
        idx = Long.numberOfTrailingZeros(~dp) + 64*idx;

        dp |= 1L<<(idx&0x3f);
        this.D[idx>>6] = dp;
        int ret = idx;

        //Propagate "word is full" up the levels, stopping at the first word with space left
        if (dp==-1) {
            idx >>= 6;
            cp |= 1L<<(idx&0x3f);
            this.C[idx>>6] = cp;
            if (cp==-1) {
                idx >>= 6;
                bp |= 1L<<(idx&0x3f);
                this.B[idx>>6] = bp;
                if (bp==-1) {
                    //The bit in A is selected by the index of the B word that filled up, not by
                    // the position of the bit that was just set inside that word
                    idx >>= 6;
                    this.A |= 1L<<idx;
                }
            }
        }
        this.cnt++;
        return ret;
    }

    /**
     * Frees an index and clears the "full" summary bits above it.
     *
     * @param idx the index to free
     * @return true if the index was allocated, false if it was already free
     */
    public boolean free(int idx) {
        long v = this.D[idx>>6];
        boolean wasSet = (v&(1L<<(idx&0x3f)))!=0;
        this.cnt -= wasSet?1:0;
        this.D[idx>>6] = v&~(1L<<(idx&0x3f));

        //The word holding idx now has a free bit, so none of its ancestors can be full
        idx >>= 6;
        this.C[idx>>6] &= ~(1L<<(idx&0x3f));

        idx >>= 6;
        this.B[idx>>6] &= ~(1L<<(idx&0x3f));

        idx >>= 6;
        this.A &= ~(1L<<(idx&0x3f));

        return wasSet;
    }

    /**
     * @return the number of currently allocated indices
     */
    public int getCount() {
        return this.cnt;
    }

    /**
     * @param idx the index to test
     * @return true if the index is currently allocated
     */
    public boolean isSet(int idx) {
        return (this.D[idx>>6]&(1L<<(idx&0x3f)))!=0;
    }
}

View File

@@ -1,6 +1,8 @@
#version 460 core #version 460 core
layout(binding = 0) uniform sampler2D blockModelAtlas; layout(binding = 0) uniform sampler2D blockModelAtlas;
//#define DEBUG_RENDER
//TODO: need to fix when merged quads have discardAlpha set to false but they span multiple tiles //TODO: need to fix when merged quads have discardAlpha set to false but they span multiple tiles
// however they are not a full block // however they are not a full block
@@ -10,8 +12,11 @@ layout(location = 2) in flat vec4 tinting;
layout(location = 3) in flat vec4 addin; layout(location = 3) in flat vec4 addin;
layout(location = 4) in flat uint flags; layout(location = 4) in flat uint flags;
layout(location = 5) in flat vec4 conditionalTinting; layout(location = 5) in flat vec4 conditionalTinting;
//layout(location = 6) in flat vec4 solidColour;
#ifdef DEBUG_RENDER
layout(location = 6) in flat uint quadDebug;
#endif
layout(location = 0) out vec4 outColour; layout(location = 0) out vec4 outColour;
void main() { void main() {
vec2 uv = mod(uv, vec2(1.0))*(1.0/(vec2(3.0,2.0)*256.0)); vec2 uv = mod(uv, vec2(1.0))*(1.0/(vec2(3.0,2.0)*256.0));
@@ -29,4 +34,14 @@ void main() {
outColour = (colour * tinting) + addin; outColour = (colour * tinting) + addin;
//outColour = vec4(uv + baseUV, 0, 1); //outColour = vec4(uv + baseUV, 0, 1);
#ifdef DEBUG_RENDER
uint hash = quadDebug*1231421+123141;
hash ^= hash>>16;
hash = hash*1231421+123141;
hash ^= hash>>16;
hash = hash * 1827364925 + 123325621;
outColour = vec4(float(hash&15u)/15, float((hash>>4)&15u)/15, float((hash>>8)&15u)/15, 1);
#endif
} }

View File

@@ -1,150 +0,0 @@
#version 460 core
#extension GL_ARB_gpu_shader_int64 : enable
#import <voxy:lod/quad_format.glsl>
#import <voxy:lod/gl46/bindings.glsl>
#import <voxy:lod/block_model.glsl>
#line 8
layout(location = 0) out vec2 uv;
layout(location = 1) out flat vec2 baseUV;
layout(location = 2) out flat vec4 tinting;
layout(location = 3) out flat vec4 addin;
layout(location = 4) out flat uint flags;
layout(location = 5) out flat vec4 conditionalTinting;
//layout(location = 6) out flat vec4 solidColour;
//Returns the LoD level, stored in the top 5 bits (bits 27..31) of gl_BaseInstance
uint extractLodLevel() {
return uint(gl_BaseInstance)>>27;
}
//Note the last 2 bits of gl_BaseInstance are unused
// NOTE(review): the shifts below read bits 18..26 (x), 9..17 (y) and 0..8 (z), and extractLodLevel reads bits 27..31,
//  so every bit appears to be used, confirm whether the note above is stale
//Gives a relative position of +-255 relative to the player center in its respective lod
// each component is a 9 bit field that is sign extended by the left shift followed by the arithmetic right shift
ivec3 extractRelativeLodPos() {
return (ivec3(gl_BaseInstance)<<ivec3(5,14,23))>>ivec3(23);
}
//Unpacks a 32 bit colour into four channels scaled to [0,1]
// x comes from bits 24..31, y from bits 16..23, z from bits 8..15 and w from bits 0..7
vec4 uint2vec4RGBA(uint colour) {
return vec4((uvec4(colour)>>uvec4(24,16,8,0))&uvec4(0xFF))/255.0;
}
//Gets the face offset with respect to the face direction (e.g. some will be + some will be -)
// the indentation is kept as is for even face indices and negated for odd face indices
float getDepthOffset(uint faceData, uint face) {
float offset = extractFaceIndentation(faceData);
return offset * (1.0-((int(face)&1)*2.0));
}
//Returns the 2d offset of a quad corner derived from the face bounds stored in faceData
// bit 1 of corner selects between the x/y pair of the bounds for the first axis,
// bit 0 of corner selects between the z/w pair of the bounds for the second axis
vec2 getFaceSizeOffset(uint faceData, uint corner) {
float EPSILON = 0.001f;
vec4 faceOffsetsSizes = extractFaceSizes(faceData);
//Expand the quads by a very small amount
faceOffsetsSizes.xz -= vec2(EPSILON);
faceOffsetsSizes.yw += vec2(EPSILON);
//A clear corner bit picks the (shrunk) lower bound, a set bit picks the (grown) upper bound minus one
return mix(faceOffsetsSizes.xz, faceOffsetsSizes.yw-1.0f, bvec2(((corner>>1)&1u)==1, (corner&1u)==1));
}
//TODO: add a mechanism so that some quads can ignore backface culling
// this would help a lot with stuff like crops as they would look kinda weird,
// same with flowers etc
//Vertex entry point: expands one corner of a quad and computes its position, uv, tinting and flags
void main() {
//The low 2 bits of gl_VertexID select the corner, the remaining bits index the quad
int cornerIdx = gl_VertexID&3;
Quad quad = quadData[uint(gl_VertexID)>>2];
vec3 innerPos = extractPos(quad);
uint face = extractFace(quad);
uint modelId = extractStateId(quad);
BlockModel model = modelData[modelId];
uint faceData = model.faceData[face];
bool isTranslucent = modelIsTranslucent(model);
bool hasAO = modelHasMipmaps(model);//TODO: replace with per face AO flag
bool isShaded = hasAO;//TODO: make this a per face flag
//Change the ordering due to backface culling
//NOTE: when rendering, backface culling is disabled as we simply dispatch calls for each face
// this has the advantage of having "unassigned" geometry, that is geometry where the backface isnt culled
//if (face == 0 || (face>>1 != 0 && (face&1)==1)) {
// cornerIdx ^= 1;
//}
uint lodLevel = extractLodLevel();
ivec3 lodCorner = ((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5;
vec3 corner = innerPos * (1<<lodLevel) + lodCorner;
vec2 faceOffset = getFaceSizeOffset(faceData, cornerIdx);
ivec2 quadSize = extractSize(quad);
//The quad size only contributes on the axes whose corner bit is set
vec2 respectiveQuadSize = vec2(quadSize * ivec2((cornerIdx>>1)&1, cornerIdx&1));
vec2 size = (respectiveQuadSize + faceOffset) * (1<<lodLevel);
vec3 offset = vec3(size, (float(face&1u) + getDepthOffset(faceData, face)) * (1<<lodLevel));
//face>>1 is the axis of the face, swizzle the offset so the depth component lands on that axis
if ((face>>1) == 0) { //Up/down
offset = offset.xzy;
}
//Not needed, here for readability
//if ((face>>1) == 1) {//north/south
// offset = offset.xyz;
//}
if ((face>>1) == 2) { //west/east
offset = offset.zxy;
}
gl_Position = MVP * vec4(corner + offset, 1.0);
//Compute the uv coordinates
vec2 modelUV = vec2(modelId&0xFFu, (modelId>>8)&0xFFu)*(1.0/(256.0));
//TODO: make the face orientated by 2x3 so that division is not a integer div and modulo isnt needed
// as these are very slow ops
baseUV = modelUV + (vec2(face>>1, face&1u) * (1.0/(vec2(3.0, 2.0)*256.0)));
//TODO: add an option to scale the quad size by the lod level so that
// e.g. at lod level 2 a face will have 2x2
uv = respectiveQuadSize + faceOffset;//Add in the face offset for 0,0 uv
//flags bit 0: alpha cutout, bit 1: model has no mipmaps, bit 2: conditional tinting is present
flags = faceHasAlphaCuttout(faceData);
//We need to have a conditional override based on if the model size is < a full face + quadSize > 1
flags |= uint(any(greaterThan(quadSize, ivec2(1)))) & faceHasAlphaCuttoutOverride(faceData);
flags |= uint(!modelHasMipmaps(model))<<1;
//Compute lighting
tinting = getLighting(extractLightId(quad));
//Apply model colour tinting
uint tintColour = model.colourTint;
if (modelHasBiomeLUT(model)) {
tintColour = colourData[tintColour + extractBiomeId(quad)];
}
conditionalTinting = vec4(0);
//A tint colour of all ones (uint(-1)) means there is no tint
if (tintColour != uint(-1)) {
flags |= 1u<<2;
conditionalTinting = uint2vec4RGBA(tintColour).yzwx;
}
addin = vec4(0.0);
if (!isTranslucent) {
tinting.w = 0.0;
//Encode the face (bits 0..2), the lod level (from bit 3) and the AO flag (bit 6) into addin.w
uint encodedData = 0;
encodedData |= face;
encodedData |= (lodLevel<<3);
encodedData |= uint(hasAO)<<6;
addin.w = float(encodedData)/255.0;
}
//Apply face tint
if (isShaded) {
if ((face>>1) == 1) {
tinting.xyz *= 0.8f;
} else if ((face>>1) == 2) {
tinting.xyz *= 0.6f;
} else if (face == 0){
tinting.xyz *= 0.5f;
} else {
//TODO: FIXME: DONT HAVE SOME ARBITARY TINT LIKE THIS
tinting.xyz *= 0.95f;
}
}
//solidColour = vec4(vec3(modelId&0xFu, (modelId>>4)&0xFu, (modelId>>8)&0xFu)*(1f/15f),1f);
}

View File

@@ -6,13 +6,18 @@
#import <voxy:lod/block_model.glsl> #import <voxy:lod/block_model.glsl>
#line 8 #line 8
//#define DEBUG_RENDER
layout(location = 0) out vec2 uv; layout(location = 0) out vec2 uv;
layout(location = 1) out flat vec2 baseUV; layout(location = 1) out flat vec2 baseUV;
layout(location = 2) out flat vec4 tinting; layout(location = 2) out flat vec4 tinting;
layout(location = 3) out flat vec4 addin; layout(location = 3) out flat vec4 addin;
layout(location = 4) out flat uint flags; layout(location = 4) out flat uint flags;
layout(location = 5) out flat vec4 conditionalTinting; layout(location = 5) out flat vec4 conditionalTinting;
//layout(location = 6) out flat vec4 solidColour;
#ifdef DEBUG_RENDER
layout(location = 6) out flat uint quadDebug;
#endif
uint extractLodLevel() { uint extractLodLevel() {
return uint(gl_BaseInstance)>>27; return uint(gl_BaseInstance)>>27;
@@ -143,4 +148,8 @@ void main() {
vec3 origin = vec3(((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5); vec3 origin = vec3(((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5);
gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,0)))*(1<<lodLevel)+origin, 1.0); gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,0)))*(1<<lodLevel)+origin, 1.0);
#ifdef DEBUG_RENDER
quadDebug = lodLevel;
#endif
} }

View File

@@ -0,0 +1,20 @@
//Use defines and undefines to define the queue, allows for hacky reuse of imports
#ifndef QUEUE_NAME
#error QUEUE_NAME is not defined
#endif
//Intended to push `item` onto `queue`.
//NOTE(review): placeholder - the parameters have no types and the body is empty.
void push(queue, item) {
}
#

View File

@@ -0,0 +1,262 @@
#version 460 core
#define WORKGROUP 4
#define MINI_BATCH_SIZE 32
//The entire uint is a minibatch (each idx is one)
#define MINI_BATCH_MSK (uint(-1))
//Each y index is one octant of the octree
// multiple x indices are used to fill up the workgroup
layout(local_size_x=WORKGROUP, local_size_y=8) in;
layout(binding = 1, std430) restrict buffer RequestSectionLoadQueue {
uint counter;
uint[] queue;
} requestQueue;
//SectionNodeData is a uvec4 that contains the position + flags + ptr to own render section data + ptr to children
layout(binding = 2, std430) restrict readonly buffer SectionNodeData {
uvec4[] sectionNodes;
};
layout(binding = 3, std430) restrict buffer ActiveWorkingNodeQueue {
uint feedbackStatus;
uint batchIndex;
uint end;
uint start;
uint maxSize;//Needs to be a multiple of local_size_x
uint[] queue;
} nodeQueue;
//Unpacked form of a single uvec4 entry of sectionNodes (see unpackNode).
//NOTE(review): the packed bit layout is not shown in this file; the SectionNodeData comment
// only says it holds position + flags + ptr to own render section data + ptr to children.
struct UnpackedNode {
ivec4 position;//x,y,z,detail
uint flags;//16 bits
uint self;//presumably the ptr to this node's own render section data - TODO confirm
uint children;//presumably the ptr to this node's children - TODO confirm
};
//Intended to convert a packed uvec4 node entry into an UnpackedNode.
//NOTE(review): currently a stub - `data` is never read and `node` is returned without any
// of its fields being assigned.
UnpackedNode unpackNode(uvec4 data) {
UnpackedNode node;
return node;
}
//NOTE: this is different to nanite in the fact that if a node is not loaded, too bad dont render
shared UnpackedNode workingNodes[WORKGROUP];
shared uint miniBatchMsk;
//Fills the shared workingNodes array with unpacked nodes, one per x index of the workgroup.
//NOTE(review): work in progress, does not compile as written:
// - `batchMsk` is not declared; the shared variable declared in this file is `miniBatchMsk`
// - `id` is not declared anywhere in this function
void loadNode() {
if (gl_LocalInvocationIndex == 0) {//Only the first invocation of the workgroup checks if we need to
batchMsk = 0;//Reset the minibatch
if (miniBatchMsk == MINI_BATCH_SIZE) {
//TODO: empty - nothing happens yet when this condition holds
}
}
barrier();
if (gl_LocalInvocationID.y == 0) {
//Need to make it work in y size 8, but only gl_LocalInvocationId.x == 0
workingNodes[gl_LocalInvocationID.x] = unpackNode(sectionNodes[id]);
}
barrier();//Synchronize, also acts as memory barrier
}
//Computes screensize of the node and whether it should render itself or its children
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool shouldRenderChildren(UnpackedNode node) {
}
//Process a single node and enqueue child nodes if needed into work queue, enqueue self to render and/or request children to load
//NOTE(review): unfinished - `node` is not declared in this function, `id` is never read and
// both branches are empty. Another processNode(uint) is defined further down in this file.
void processNode(uint id) {//Called even if it doesn't have any work (id==-1) to ensure uniform control flow for barriers
//Bottom 2 bits are status flags, is air and children loaded
// node.flags
//If the childrenloaded flag is not set, send a request for the children of the node to be loaded
// if all the children are loaded but we are not and we need to render, render the children and dispatch
// a request to load self
if (shouldRenderChildren(node)) {
//Don't care about
} else {
}
}
//The actively scheduled/acquired work slot for this group
shared uint workingBatchIndex;
shared uint workingBatchOffset;
//Acquires a batch index for this workgroup by atomically advancing nodeQueue.batchIndex.
//The value returned by atomicAdd is stored in the shared workingBatchIndex.
//NOTE(review): `BATCH_SIZE` is not defined in the visible part of this file (only
// MINI_BATCH_SIZE is), and the acquired index is not used yet.
void process() {
if (gl_LocalInvocationIndex == 0) {//This includes both x and y
workingBatchIndex = atomicAdd(nodeQueue.batchIndex, BATCH_SIZE);
}
}
//Shader entry point.
//NOTE(review): placeholder - loops forever calling barrier() and calls neither loadNode()
// nor process(). A second main() is defined further down in this file.
void main() {
while (true) {
barrier();
}
}
//When a node is processed:
// its screen bounding box is computed using a fast trick (e.g. if you are viewing it from a quadrant you already know its bounding points (min/max))
// frustum cull, check hiz
// if it passes culling, use the screensize to check whether it must render itself
// or dispatch its children to render
// IF its error is small enough, then render itself; its mesh should always be loaded, if not it is a critical error (except maybe if it is a top level node or something)
// if its error is too large,
// check that all children are loaded (or empty); if they are not all loaded, enqueue a request for the cpu to load
// that node's children
// if the load queue is full, don't enqueue it to the queue
// then instead of rendering children, render its own mesh since it should always be loaded
//Can also reverse the above slightly and make it so that it checks the children before enqueuing them
//the main thing to worry about is whether there is enough work to fill the initial few rounds of this
// before amplification takes effect
// can do a thing where it initially just blasts child nodes out until the size is small enough
// NOTE: since matrix multiplication distributes over addition
// can precompute the AABB corners with respect to the matrix
// then you can just add a translation vector
//TODO: can do in another way
// first compute the sections that should either render self or children
// then, as a separate job queue, work through it
//Presumably returns the number of children of the node (its result is read by enqueueChildren).
//NOTE(review): stub - the body is empty, so no value is returned despite the uint return type
uint getChildCount(UnpackedNode node) {
}
//Checks whether a node should be culled based on hiz/frustum
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool cullNode(UnpackedNode node) {
}
//Should this node be rendered, or should we recurse to its children
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool shouldRenderChildrenInstead(UnpackedNode node) {
}
//Does the node have its own mesh loaded
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool nodeHasSelfMesh(UnpackedNode node) {
}
//Does the node have its children loaded (note! not child meshes)
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool nodeHasChildrenLoaded(UnpackedNode node) {
}
//Are all the children's meshes loaded
//NOTE(review): stub - the body is empty, so no value is returned despite the bool return type
bool nodeHasChildMeshesLoaded(UnpackedNode node) {
}
//Issues a load request for node `idx`.
//In processNode, type 0 is used when the node's own mesh is missing and type 1 when its
// children are not loaded.
//NOTE(review): stub - the body is empty, nothing is requested yet
void request(uint type, uint idx) {
}
//Intended to render the mesh identified by `idx` (processNode passes node.self).
//NOTE(review): stub - the body is empty, nothing is rendered yet
void renderMesh(uint idx) {
}
//Intended to place the children of `node` onto the work queue.
// arg:  caller supplied value (processNode passes 0 or 1), not used yet
// node: the parent whose children should be queued
//Only the child count is fetched so far, nothing is written to any queue.
void enqueueChildren(uint arg, UnpackedNode node) {
    //TODO: the queue needs 2 counters, the pre and post atomic,
    // pre is incremented to get index
    // queue is written to
    // post is then incremented to signal
    uint childCount = getChildCount(node);
}
//Intended to report an unrecoverable traversal state.
//In processNode, type 0 means neither the children nor the node's own mesh are loaded while
// the children should be rendered; type 1 means the node should render itself but neither
// its mesh nor its children are loaded.
//NOTE(review): stub - the body is empty, nothing is reported yet
void reportCritical(uint type) {
}
//Decides for one node whether to render its own mesh, hand its children to the work queue,
// and/or request missing data to be loaded.
// idx: index of the node inside sectionNodes
void processNode(uint idx) {
    UnpackedNode node = unpackNode(sectionNodes[idx]);

    //Culled nodes produce no work at all
    if (cullNode(node)) {
        return;
    }

    //Should we render children instead of ourselves with respect to screenspace error
    if (shouldRenderChildrenInstead(node)) {
        if (nodeHasChildrenLoaded(node)) {
            //Dispatch nodes to queue
            enqueueChildren(0, node);
            return;
        }

        //Children aren't loaded, so fall back to the node's own mesh when there is one
        if (!nodeHasSelfMesh(node)) {
            //Critical issue, no children are loaded and self has no mesh
            reportCritical(0);
            return;
        }

        //Render self and dispatch request to load children
        renderMesh(node.self);
        request(1, idx);
        return;
    }

    //The node itself is the one that should be drawn
    if (nodeHasSelfMesh(node)) {
        renderMesh(node.self);
        return;
    }

    //Request that self mesh is loaded
    request(0, idx);

    //Render children instead while the own mesh is missing
    if (nodeHasChildrenLoaded(node)) {//Might need to be nodeHasChildMeshesLoaded
        enqueueChildren(1, node);
    } else {
        //This is very bad, it means we can't render anything
        reportCritical(1);
    }
}
//Pseudo code, one thread, one load
//NOTE(review): `workingNodeQueuePos` and `workingNodeQueueEnd` are not declared in the visible
// part of this file, the loop has no exit condition yet, and this is the second main()
// defined in this file.
void main() {
while (true) {
//Try to process a node queue entry
uint work = atomicAdd(workingNodeQueuePos, 1);
uint idx = work&0xFFFFFFu;//Low 24 bits
uint arg = work>>24;//Remaining high 8 bits
if (idx < workingNodeQueueEnd) {
//TODO: empty - nothing is done with the acquired entry yet
} else {
//Do other queue work however we still have the work slot allocated
}
}
}