WIP

2024-06-11 23:06:24 +10:00
parent 584028ae7a
commit 25ddb83d22
18 changed files with 882 additions and 158 deletions
--- a/src/main/java/me/cortex/voxy/client/core/gl/GlPersistentMappedBuffer.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/GlPersistentMappedBuffer.java
@@ -14,7 +14,7 @@ public class GlPersistentMappedBuffer extends TrackedObject {
        this.id = glCreateBuffers();
        this.size = size;
        glNamedBufferStorage(this.id, size, GL_CLIENT_STORAGE_BIT|GL_MAP_PERSISTENT_BIT|(flags&(GL_MAP_COHERENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_READ_BIT)));
-        this.addr = nglMapNamedBufferRange(this.id, 0, size, flags|GL_MAP_PERSISTENT_BIT);
+        this.addr = nglMapNamedBufferRange(this.id, 0, size, (flags&(GL_MAP_WRITE_BIT|GL_MAP_READ_BIT|GL_MAP_UNSYNCHRONIZED_BIT|GL_MAP_FLUSH_EXPLICIT_BIT))|GL_MAP_PERSISTENT_BIT);
    }

    @Override
--- a/src/main/java/me/cortex/voxy/client/core/gl/shader/GenericsProcessor.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/GenericsProcessor.java
@@ -0,0 +1,12 @@
+package me.cortex.voxy.client.core.gl.shader;
+
+import java.util.regex.Pattern;
+
+public class GenericsProcessor implements IShaderProcessor {//Shader source processor for generic style declarations, expansion is not implemented yet
+    private static final Pattern GENERIC_DEFINE = Pattern.compile("#defineGen (?<name>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");//Matches a definition of the form "#defineGen Name<Param>"
+    private static final Pattern GENERIC_USE = Pattern.compile("(?<type>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");//Matches a use of the form "Type<Param>", Param may be empty
+    @Override
+    public String process(ShaderType type, String source) {
+        return source;//TODO: expand generics, until then hand the source back untouched instead of returning null
+    }
+}
--- a/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java
@@ -3,8 +3,7 @@ package me.cortex.voxy.client.core.gl.shader;
 import me.cortex.voxy.common.util.TrackedObject;
 import org.lwjgl.opengl.GL20C;

-import java.util.HashMap;
-import java.util.Map;
+import java.util.*;
 import java.util.stream.Collectors;

 import static org.lwjgl.opengl.GL20.glDeleteProgram;
@@ -16,8 +15,15 @@ public class Shader extends TrackedObject {
        id = program;
    }

-    public static Builder make(IShaderProcessor processor) {
-        return new Builder(processor);
+    public static Builder make(IShaderProcessor... processors) {//Creates a builder whose sources are run through the processors in argument order
+        List<IShaderProcessor> reversed = new ArrayList<>(List.of(processors));
+        Collections.reverse(reversed);//Each loop step makes the newly added processor run before the chain built so far, so add them back to front
+        IShaderProcessor applicator = (type,source)->source;
+        for (IShaderProcessor processor : reversed) {
+            IShaderProcessor finalApplicator = applicator;
+            applicator = (type, source) -> finalApplicator.process(type, processor.process(type, source));
+        }
+        return new Builder(applicator);
+    }

    public static Builder make() {
--- a/src/main/java/me/cortex/voxy/client/core/rendering/HierarchicalOcclusionRenderer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/HierarchicalOcclusionRenderer.java
@@ -0,0 +1,50 @@
+package me.cortex.voxy.client.core.rendering;
+
+import me.cortex.voxy.client.core.gl.GlBuffer;
+import me.cortex.voxy.client.core.gl.shader.Shader;
+import me.cortex.voxy.client.core.gl.shader.ShaderType;
+import me.cortex.voxy.client.core.rendering.hierarchical.NodeManager;
+import me.cortex.voxy.common.util.HierarchicalBitSet;
+
+import static org.lwjgl.opengl.GL42C.*;
+import static org.lwjgl.opengl.GL43C.GL_SHADER_STORAGE_BARRIER_BIT;
+import static org.lwjgl.opengl.GL43C.glDispatchCompute;
+import static org.lwjgl.opengl.GL44.GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
+
+public class HierarchicalOcclusionRenderer {
+    //Number of workgroups dispatched along x, sized so that the gpu is saturated efficiently
+    private final int workgroup_dispatch_size_x;
+    private final NodeManager nodeManager = new NodeManager(null);
+    private final HiZBuffer hiz = new HiZBuffer();
+
+    //Compute shader that performs the hierarchical selection
+    private final Shader hierarchicalShader = Shader.make()
+            .add(ShaderType.COMPUTE, "voxy:lod/hierarchical/selector.comp")
+            .compile();
+
+    public HierarchicalOcclusionRenderer(int workgroup_size) {
+        this.workgroup_dispatch_size_x = workgroup_size;
+    }
+
+    //Binds all the resources needed by the dispatch, nothing is bound yet
+    private void bind() {
+    }
+
+    //Performs one full pass: hiz build, node upload, node download, then the selection dispatch
+    public void render(int depthBuffer, int width, int height) {
+        //Barrier masks issued right before and right after the compute dispatch
+        final int beforeDispatch = GL_SHADER_STORAGE_BARRIER_BIT|GL_UNIFORM_BARRIER_BIT|GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT|GL_FRAMEBUFFER_BARRIER_BIT;
+        final int afterDispatch = GL_SHADER_STORAGE_BARRIER_BIT|GL_COMMAND_BARRIER_BIT|GL_UNIFORM_BARRIER_BIT;
+        //Build the hiz mip chain from the depth buffer
+        this.hiz.buildMipChain(depthBuffer, width, height);
+        //Upload node changes, then download (pulls from the previous frame, should maybe result in lower latency), which also clears and resets the queues
+        this.nodeManager.uploadPhase();
+        this.nodeManager.downloadPhase();
+        //Bind the resources followed by the hierarchical selection shader
+        this.bind();
+        this.hierarchicalShader.bind();
+        glMemoryBarrier(beforeDispatch);
+        glDispatchCompute(this.workgroup_dispatch_size_x, 1, 1);//Emit enough work to fully populate the gpu
+        glMemoryBarrier(afterDispatch);
+    }
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/INodeInteractor.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/INodeInteractor.java
@@ -0,0 +1,15 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+import me.cortex.voxy.client.core.rendering.building.BuiltSection;
+
+import java.util.function.Consumer;
+
+//Interface through which the node manager interacts with the outside world
+public interface INodeInteractor {
+    void watchUpdates(long pos);//Marks pos as watched for updates, i.e. any LoD update at pos will trigger a callback
+    void unwatchUpdates(long pos);//Stops watching pos for updates
+
+    void requestMesh(long pos);//Explicitly requests a mesh at pos, the result is delivered by running the callback
+
+    void setMeshUpdateCallback(Consumer<BuiltSection> mesh);//Sets the consumer that is handed the built mesh sections
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/ITrimInterface.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/ITrimInterface.java
@@ -0,0 +1,12 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+public interface ITrimInterface {
+    //Last recorded/known use time of a node's mesh, returns -1 if the node doesn't have a mesh
+    int lastUsedTime(int node);
+
+    //Returns an integer with the bottom 24 bits being the ptr and the top 8 bits being the count (tentative layout)
+    int getChildren(int node);
+
+    //Returns the size of the node's mesh, -1 if the node doesn't have a mesh
+    int getNodeSize(int node);
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeLoadSystem.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeLoadSystem.java
@@ -0,0 +1,5 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+//Uses a persistently mapped coherent buffer with off-thread polling to pull in requests (not implemented yet)
+public class NodeLoadSystem {
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java
@@ -0,0 +1,219 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import me.cortex.voxy.client.core.gl.GlBuffer;
+import me.cortex.voxy.client.core.rendering.building.BuiltSection;
+import me.cortex.voxy.client.core.rendering.util.DownloadStream;
+import me.cortex.voxy.common.util.HierarchicalBitSet;
+import org.lwjgl.system.MemoryUtil;
+
+import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT;
+import static org.lwjgl.opengl.GL30.GL_R32UI;
+import static org.lwjgl.opengl.GL30C.GL_RED_INTEGER;
+import static org.lwjgl.opengl.GL45.nglClearNamedBufferSubData;
+
+public class NodeManager {
+    public static final int MAX_NODE_COUNT = 1<<22;
+    public static final int MAX_REQUESTS = 1024;
+    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(MAX_NODE_COUNT);
+    private final GlBuffer nodeBuffer = new GlBuffer(MAX_NODE_COUNT*16);//Node size is 16 bytes
+
+    //TODO: maybe make this a coherent persistent mapped read buffer, instead of download synced buffer copy thing
+
+    //a request payload is a single uint, first 8 bits are flags followed by 24 bit node identifier
+    // (e.g. load child nodes, load child nodes + meshs, load self meshes )
+    private final int REQUEST_QUEUE_SIZE = 4 + MAX_REQUESTS * 4;//TODO: add a priority system
+    private final GlBuffer requestQueue = new GlBuffer(4 + MAX_REQUESTS * 4);
+
+    //Buffer containing the index of the root nodes
+    private final GlBuffer roots = new GlBuffer(1024*4);
+
+
+
+    //500mb TODO: SEE IF CAN SHRINK IT BY EITHER NOT NEEDING AS MUCH SPACE or reducing max node count
+    private final long[] localNodes = new long[MAX_NODE_COUNT * 3];//1.5x the size of the gpu copy to store extra metadata
+    //LocalNodes have an up value pointing to the parent, enabling full traversal
+
+    private final INodeInteractor interactor;
+
+    public NodeManager(INodeInteractor interactor) {
+        this.interactor = interactor;
+        this.pos2meshId.defaultReturnValue(NO_NODE);
+    }
+
+    //Returns true if it has its own mesh loaded
+    private static boolean nodeHasMeshLoaded(long metaA, long metaB) {
+        return false;
+    }
+
+    private static final int REQUEST_SELF = 0;
+    private static final int REQUEST_CHILDREN = 1;
+    //A node can be loaded in the tree but have no mesh associated with it
+    // this is so that higher level nodes dont waste mesh space
+
+
+    //The reason that nodes have both child and own mesh pointers
+    // is so that on an edge of the screen or when moving, nodes aren't constantly being swapped back and forth
+    // it basically acts as an inline cache, however it does present some pain points
+    // especially in managing the graph
+
+    //It might be easier to have nodes strictly either point to child nodes or meshes
+    // if a parent needs to be rendered instead of the child, request for node change to self
+    // while this will generate a great many more requests it should be a lot easier to manage graph wise
+    // can probably add a caching service via a compute shader that ingests a request list
+    // sees if the requested nodes are already cached, if so swap them in, otherwise dispatch a request
+    // to cpu
+
+    private void processRequestQueue(long ptr, long size) {//Decodes a downloaded request queue: a uint count followed by that many uint requests
+        int count = Math.min(MemoryUtil.memGetInt(ptr), MAX_REQUESTS); ptr += 4;//Clamp so that no more entries are read than the queue can hold
+        for (int i = 0; i < count; i++) {
+            int request = MemoryUtil.memGetInt(ptr + i*4L);
+            int args = request>>>24;//Top 8 bits are the flags, shifted down so that the type mask below actually sees them
+            int nodeId = request&(0xFFFFFF);//Bottom 24 bits are the node identifier
+
+            long pos = this.localNodes[nodeId*3];
+            long metaA = this.localNodes[nodeId*3 + 1];
+            long metaB = this.localNodes[nodeId*3 + 2];
+
+            int type = args&0b11;//2 bits for future request types such as parent and ensure stable (i.e. both parent and child loaded)
+            if (type == REQUEST_SELF) {
+                //Requires own mesh loaded (it can have 2 different priorities, it can fall back to using its children to render if they are loaded)
+                // else it is critical priority
+                if (nodeHasMeshLoaded(metaA, metaB)) {
+                    throw new IllegalStateException("Node requested a mesh load, but mesh is already loaded: " + pos);
+                }
+
+                //watch the mesh and request it
+                this.interactor.watchUpdates(pos);
+                this.interactor.requestMesh(pos);
+
+            } else if (type == REQUEST_CHILDREN) {
+                //Node requires children to be loaded NOTE: when this is the case, it doesn't just mean the nodes,
+                // it means the meshes as well,
+                // meshes may be unloaded later
+
+                //when this case is hit it means that the child nodes aren't even loaded, so it becomes a bit more complex
+                // basically, need to request all child nodes be loaded in a batch
+                // then in the upload tick need to update many things
+
+            } else {
+                throw new IllegalArgumentException("Unknown update type: " + type + " @pos:" + pos);
+            }
+
+        }
+    }
+
+
+    public void uploadPhase() {
+        //All uploads
+
+        //Have a set of upload tasks for nodes,
+        // this could include updating the mesh ptr
+        // or child ptr or uploading new nodes
+        // NOTE: when uploading a set of new nodes (must be clustered as children)
+        // have to update parent
+        // same when removing a set of children
+
+        //Note: child node upload tasks need to all be complete before they can be uploaded
+
+
+        //The way the graph works and can be cut is that all the leaf nodes _must_ at all times contain a mesh
+        // this is critical to prevent "cracks"/no geometry being rendered
+        // when the render mesh buffer is "full" (or even just periodicly), trimming of the tree must occur to keep
+        // size within reason
+        //Note tho that there is a feedback delay and such so geometry buffer should probably be trimmed when it reaches
+        // 80-90% capacity so that new geometry can still be uploaded without being blocked on geometry clearing
+        // it becomes a critical error if the geometry buffer becomes full while the tree is fully trimmed
+        //NOTE: while trimming the tree, need to also trim the parents down i.e. the top level should really not have its mesh
+        // loaded while it isnt really ever used
+        // however as long as the rule that all leaf nodes have a mesh loaded is held then there should never be
+        // any geometry holes
+    }
+
+
+    //Download and upload point, called once per frame
+    public void downloadPhase() {
+        DownloadStream.INSTANCE.download(this.requestQueue, 0, REQUEST_QUEUE_SIZE, this::processRequestQueue);
+        DownloadStream.INSTANCE.commit();
+        //Clear the queue counter, TODO: maybe do it some other way to batch clears
+        nglClearNamedBufferSubData(this.requestQueue.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0);
+        //TODO: compute cleanup here of loaded nodes, and what needs to be uploaded
+        // i.e. if there is more upload stuff than there is free memory, cull nodes in the tree
+        // to fit upload points, can also create errors if all nodes in the tree are requested but no memory to put
+    }
+
+
+
+
+
+    //Inserts a top level node into the graph, it has geometry and no children loaded as it is a leaf node
+    public void insertTopLevelNode(long position) {
+
+    }
+
+    //Removes a top level node from the graph, doing so also removes all child nodes and associated geometry
+    // the slots freed when removing nodes are stored and roped off until it is guaranteed that all requests have
+    // passed
+    public void removeTopLevelNode(long position) {
+
+    }
+
+
+
+    //Tracking for nodes that specifically need meshes, if a node doesnt have or doesnt need a mesh node, it is not in the map
+    // the map should be identical to the currently watched set of sections
+    //NOTE: that if the id is negative its part of a mesh request
+    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();
+    private static final int NO_NODE = -1;
+
+    //Need to make this system attached to a batched worker system, since a mesh update can be a few things
+    // A) it can be a mesh update of a tracked render section, in this case we must ensure that it is still tracked and hasn't been removed
+    //   if it's still valid and tracked then upload it and update the node as well, keeping the two in sync
+    // B) it can be part of a request, then we need to first check that the request still exists and hasn't been discarded (probably still upload it immediately)
+    //   then set the request entry for that section as uploaded with the mesh set (note: if the mesh was updated while a request was in progress/other requests not fulfilled, need to remove the old and replace with the updated)
+    //      if all the meshes in the request are satisfied, upload the request nodes and update its parent
+    // NOTE! batch requests where this is needed are only strictly required when children are requested, in order to guarantee that the
+    //      property that leaf nodes must have meshes remains
+    //(TODO: see when sync with main thread should be, in the renderer or here since the updates are dispatched offthread)
+    // Note that the geometry buffer should have idk 20% free? that way meshes can always be inserted (same for the node buffer ig) maybe 10%? idk need to experiment
+    //  if the buffer goes over this threshold, the tree/graph culler must start culling last/least used nodes somehow
+    //  it should be an error if the geometry or node buffer fills up but there are no nodes/meshes to cull/remove
+    public void meshUpdate(BuiltSection mesh) {
+        int id = this.pos2meshId.get(mesh.position);
+        if (id == NO_NODE) {
+            //The built mesh section is no longer needed, discard it
+            // TODO: could probably?? cache the mesh in ram that way if its requested? it can be immediatly fetched while a newer mesh is built??
+            mesh.free();
+            return;
+        }
+        if ((id&(1<<31))!=0) {
+            //The mesh is part of a batched request
+            id = id^(1<<31);
+
+        } else {
+            //The mesh is an update for an existing node
+            //this.localNodes[id*3]
+        }
+    }
+
+
+    //A node has a position    (64 bit)
+    // a ptr to its own mesh   (24 bit)
+    // a ptr to children nodes (24 bit)
+    // flags                   (16 bit)
+    //                         Total of 128 bits (16 bytes)
+
+    //First 2 flag bits are a requested dispatch type (0 meaning no request and the 3 remaining states for different request types)
+    // this ensures that over multiple frames the same node is not requested
+
+    //Bits exist for whether or not the children have meshes loaded or if the parents have meshes loaded
+    // the idea is to keep +-1 lod meshes loaded into vram to enable seemless transitioning
+    // the only critical state is that if a mesh wants to be rendered it should be able to be rendered
+
+    //Basicly, there are multiple things, it depends on the screensize error
+    // if a node is close to needing its children loaded but they arnt, then request it but with a lower priority
+    // if a node must need its children then request at a high prioirty
+    // if a node doesnt have a mesh but all its children do than dispatch a medium priority to have its own mesh loaded
+    //      but then just use the child meshes for rendering
+
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager2.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager2.java
@@ -0,0 +1,103 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
+import me.cortex.voxy.client.core.rendering.building.BuiltSection;
+import me.cortex.voxy.client.core.rendering.util.MarkedObjectList;
+
+public class NodeManager2 {
+    //A request for making a new child nodes 
+    private static final class LeafRequest {
+        //LoD position identifier
+        public long position;
+
+        //Node id of the node the leaf request is for, note! While there is a leaf request, the node should not be unloaded or removed
+        public int nodeId;
+
+        //The mask of what child nodes are required
+        public byte requiredChildMask;
+
+        //The mask of currently supplied child node data
+        public byte currentChildMask;
+
+        //Reset/clear the request so that it may be reused
+        public void clear() {
+
+        }
+    }
+
+    public static final int MAX_NODE_COUNT = 1<<22;
+
+    //Local data layout
+    // first long is position (todo! might not be needed)
+    // next long contains mesh position ig/id
+    private final long[] localNodeData = new long[MAX_NODE_COUNT * 3];
+
+    private final INodeInteractor interactor;
+
+    public NodeManager2(INodeInteractor interactor) {
+        this.interactor = interactor;
+        this.pos2meshId.defaultReturnValue(NO_NODE);
+        this.interactor.setMeshUpdateCallback(this::meshUpdate);
+    }
+
+    public void insertTopLevelNode(long position) {
+
+    }
+
+    public void removeTopLevelNode(long position) {
+
+    }
+
+    //Returns the mesh offset/id for the given node or -1 if it doesnt exist
+    private int getMeshForNode(int node) {
+        return -1;
+    }
+
+
+    //Tracking for nodes that specifically need meshes, if a node doesnt have or doesnt need a mesh node, it is not in the map
+    // the map should be identical to the currently watched set of sections
+    //NOTE: that if the id is negative its part of a mesh request
+    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();
+    private static final int NO_NODE = -1;
+
+    //The request queue should be like some array that can reuse objects to prevent gc nightmare + like a bitset to find an avalible free slot
+    // hashmap might work bar the gc overhead
+    private final MarkedObjectList<LeafRequest> leafRequests = new MarkedObjectList<>(LeafRequest[]::new, LeafRequest::new);
+
+
+    private void meshUpdate(BuiltSection mesh) {//Callback for a built mesh section, routes it by the id stored for its position
+        int id = this.pos2meshId.get(mesh.position);
+        if (id == NO_NODE) {
+            //The built mesh section is no longer needed, discard it
+            // TODO: could probably cache the mesh in ram, that way if it's requested it can be immediately fetched while a newer mesh is built
+            mesh.free();
+            return;
+        }
+        if ((id&(1<<31))!=0) {
+            //The mesh is part of a batched request
+            id = id^(1<<31);//Clear the top (marker) bit to recover the request id
+
+            //There are a few cases for this branch
+            // the section could be replacing an existing mesh that is part of the request (due to an update)
+            // the section mesh could be new to the request
+            //  in this case the section mesh could be the last entry needed to satisfy the request
+            //      in which case we must either mark the request as ready to be uploaded
+            //      and then upload it after all the mesh updates are processed, or upload it immediately
+
+            //The lower 3 bits of the id specify the octant (8 positions) of the node in the request
+            LeafRequest request = this.leafRequests.get(id>>3);
+
+
+        } else {
+            //The mesh is an update for an existing node
+
+            int prevMesh = this.getMeshForNode(id);
+            if (prevMesh != -1) {
+                //Node has a mesh attached, remove and replace it
+            } else {
+                //Node didn't have a mesh attached, so just set the current mesh
+            }
+        }
+    }
+
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/TreeTrimmer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/TreeTrimmer.java
@@ -0,0 +1,23 @@
+package me.cortex.voxy.client.core.rendering.hierarchical;
+
+//System to determine what nodes to remove from the hierarchical tree while retaining the property that all
+// leaf nodes should have meshes
+//This system is critical to prevent the geometry buffer from growing too large or too many nodes from filling up
+// the node system
+public class TreeTrimmer {
+    //Used to interact with the outside world
+    private final ITrimInterface trimInterface;
+
+    public TreeTrimmer(ITrimInterface trimInterface) {
+        this.trimInterface = trimInterface;
+    }
+
+    public void computeTrimPoints() {
+        //Do a bfs to find the ending points to trim, this needs to be based on some metric, e.g. last used
+
+        //First strategy is to compute a bfs and/or generate a list of nodes sorted by last use time
+        // the thing is that if we cull a mesh, it cannot be a leaf node
+        // if it is a leaf node its parent node must have a mesh loaded
+
+    }
+}
--- a/src/main/java/me/cortex/voxy/client/core/rendering/util/DownloadStream.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/util/DownloadStream.java
@@ -84,7 +84,7 @@ public class DownloadStream {
        for (var entry : this.downloadList) {
            glCopyNamedBufferSubData(entry.target.id, this.downloadBuffer.id, entry.targetOffset, entry.downloadStreamOffset, entry.size);
        }
-        thisFrameDownloadList.addAll(this.downloadList);
+        this.thisFrameDownloadList.addAll(this.downloadList);
        this.downloadList.clear();

        this.caddr = -1;
--- a/src/main/java/me/cortex/voxy/client/core/rendering/util/MarkedObjectList.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/util/MarkedObjectList.java
@@ -0,0 +1,51 @@
+package me.cortex.voxy.client.core.rendering.util;
+
+import it.unimi.dsi.fastutil.ints.Int2ObjectFunction;
+import me.cortex.voxy.common.util.HierarchicalBitSet;
+
+import java.util.function.Supplier;
+
+public class MarkedObjectList<T> {//Growable list of reusable objects, slots are handed out by id from a bitset
+    private static final float GROWTH_FACTOR = 0.75f;//Each resize adds ceil(length*GROWTH_FACTOR) slots
+
+    private final Int2ObjectFunction<T[]> arrayGenerator;
+    private final Supplier<T> nullSupplier;//Creates the object for a slot that is still null
+    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(1<<(6*4));//Full bitset capacity, the limit must be positive else allocateNext() always reports -2
+    private T[] objects;//Should maybe make a getter function instead
+
+    public MarkedObjectList(Int2ObjectFunction<T[]> arrayGenerator, Supplier<T> nullSupplier) {
+        this.arrayGenerator = arrayGenerator;
+        this.nullSupplier = nullSupplier;
+        this.objects = this.arrayGenerator.apply(16);
+    }
+
+    public int allocate() {
+        //Gets an unused id for some entry in objects, if the entry is null fill it
+        int id = this.bitSet.allocateNext();
+        if (this.objects.length <= id) {
+            //Resize and copy over the objects array
+            int newLen = this.objects.length + (int)Math.ceil(this.objects.length*GROWTH_FACTOR);
+            T[] newArr = this.arrayGenerator.apply(newLen);
+            System.arraycopy(this.objects, 0, newArr, 0, this.objects.length);
+            this.objects = newArr;
+        }
+        if (this.objects[id] == null) {
+            this.objects[id] = this.nullSupplier.get();
+        }
+        return id;
+    }
+
+    public void release(int id) {//Frees the id, the object itself stays in its slot
+        if (!this.bitSet.free(id)) {
+            throw new IllegalArgumentException("Index " + id + " was already released");
+        }
+    }
+
+    public T get(int index) {
+        //TODO: make the check that index is allocated optional, as it might cause overhead due to multiple cacheline misses
+        if (!this.bitSet.isSet(index)) {
+            throw new IllegalArgumentException("Index " + index + " is not allocated");
+        }
+        return this.objects[index];
+    }
+}
--- a/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java
+++ b/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java
@@ -0,0 +1,72 @@
+package me.cortex.voxy.common.util;
+
+public class HierarchicalBitSet {//4 level bitset with 64 entries per word, allocateNext hands out the lowest free index
+    private final int limit;//Maximum number of indices that may be allocated at the same time
+    private int cnt;//Number of currently allocated indices
+    private long A = 0;//Top level, a bit is set once the matching word in B is completely full
+    private final long[] B = new long[64];//A bit is set once the matching word in C is completely full
+    private final long[] C = new long[64*64];//A bit is set once the matching word in D is completely full
+    private final long[] D = new long[64*64*64];//Leaf level, one bit per index, set means allocated
+    public HierarchicalBitSet(int limit) {//Fixed size of 64^4
+        this.limit = limit;//NOTE: a limit <= 0 makes every allocateNext() return -2
+        if (limit > (1<<(6*4))) {
+            throw new IllegalArgumentException("Limit greater than capacity");
+        }
+    }
+
+    public int allocateNext() {//Returns the allocated index, -1 if every bit is in use, -2 if the limit is reached
+        if (this.A==-1) {
+            return -1;
+        }
+        if (this.cnt+1>this.limit) {
+            return -2;//Limit reached
+        }
+        int idx = Long.numberOfTrailingZeros(~this.A);//Walk down, at each level take the lowest entry that is not full
+        long bp = this.B[idx];
+        idx = Long.numberOfTrailingZeros(~bp) + 64*idx;
+        long cp = this.C[idx];
+        idx = Long.numberOfTrailingZeros(~cp) + 64*idx;
+        long dp = this.D[idx];
+        idx =  Long.numberOfTrailingZeros(~dp) + 64*idx;
+        dp |= 1L<<(idx&0x3f);//Mark the index as allocated
+        this.D[idx>>6] = dp;
+        int ret = idx;
+        if (dp==-1) {//The leaf word became full, propagate the full bit upwards for as long as the parent word fills too
+            idx >>= 6;
+            cp |= 1L<<(idx&0x3f);
+            this.C[idx>>6] = cp;
+            if (cp==-1) {
+                idx >>= 6;
+                bp |= 1L<<(idx&0x3f);
+                this.B[idx>>6] = bp;
+                if (bp==-1) {
+                    this.A |= 1L<<(idx&0x3f);
+                }
+            }
+        }
+        this.cnt++;
+        return ret;
+    }
+
+    public boolean free(int idx) {//Clears idx, returns true if it was allocated beforehand
+        long v = this.D[idx>>6];
+        boolean wasSet = (v&(1L<<(idx&0x3f)))!=0;
+        this.cnt -= wasSet?1:0;
+        this.D[idx>>6] = v&~(1L<<(idx&0x3f));
+        idx >>= 6;
+        this.C[idx>>6] &= ~(1L<<(idx&0x3f));//A clear leaf bit means none of the words above it can be full anymore
+        idx >>= 6;
+        this.B[idx>>6] &= ~(1L<<(idx&0x3f));
+        idx >>= 6;
+        this.A &= ~(1L<<(idx&0x3f));
+        return wasSet;
+    }
+
+    public int getCount() {//Number of currently allocated indices
+        return this.cnt;
+    }
+
+    public boolean isSet(int idx) {//True if idx is currently allocated
+        return (this.D[idx>>6]&(1L<<(idx&0x3f)))!=0;
+    }
+}
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads.frag
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads.frag
@@ -1,6 +1,8 @@
 #version 460 core
 layout(binding = 0) uniform sampler2D blockModelAtlas;

+//#define DEBUG_RENDER
+
 //TODO: need to fix when merged quads have discardAlpha set to false but they span multiple tiles
 // however they are not a full block

@@ -10,8 +12,11 @@ layout(location = 2) in flat vec4 tinting;
 layout(location = 3) in flat vec4 addin;
 layout(location = 4) in flat uint flags;
 layout(location = 5) in flat vec4 conditionalTinting;
-//layout(location = 6) in flat vec4 solidColour;

+
+#ifdef DEBUG_RENDER
+layout(location = 6) in flat uint quadDebug;
+#endif
 layout(location = 0) out vec4 outColour;
 void main() {
    vec2 uv = mod(uv, vec2(1.0))*(1.0/(vec2(3.0,2.0)*256.0));
@@ -29,4 +34,14 @@ void main() {
    outColour = (colour * tinting) + addin;

    //outColour = vec4(uv + baseUV, 0, 1);
+
+
+    #ifdef DEBUG_RENDER
+    uint hash = quadDebug*1231421+123141;
+    hash ^= hash>>16;
+    hash = hash*1231421+123141;
+    hash ^= hash>>16;
+    hash = hash * 1827364925 + 123325621;
+    outColour = vec4(float(hash&15u)/15, float((hash>>4)&15u)/15, float((hash>>8)&15u)/15, 1);
+    #endif
 }
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads.vert
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads.vert
@@ -1,150 +0,0 @@
-#version 460 core
-#extension GL_ARB_gpu_shader_int64 : enable
-
-#import <voxy:lod/quad_format.glsl>
-#import <voxy:lod/gl46/bindings.glsl>
-#import <voxy:lod/block_model.glsl>
-#line 8
-
-layout(location = 0) out vec2 uv;
-layout(location = 1) out flat vec2 baseUV;
-layout(location = 2) out flat vec4 tinting;
-layout(location = 3) out flat vec4 addin;
-layout(location = 4) out flat uint flags;
-layout(location = 5) out flat vec4 conditionalTinting;
-//layout(location = 6) out flat vec4 solidColour;
-
-uint extractLodLevel() {
-    return uint(gl_BaseInstance)>>27;
-}
-
-//Note the last 2 bits of gl_BaseInstance are unused
-//Gives a relative position of +-255 relative to the player center in its respective lod
-ivec3 extractRelativeLodPos() {
-    return (ivec3(gl_BaseInstance)<<ivec3(5,14,23))>>ivec3(23);
-}
-
-vec4 uint2vec4RGBA(uint colour) {
-    return vec4((uvec4(colour)>>uvec4(24,16,8,0))&uvec4(0xFF))/255.0;
-}
-
-//Gets the face offset with respect to the face direction (e.g. some will be + some will be -)
-float getDepthOffset(uint faceData, uint face) {
-    float offset = extractFaceIndentation(faceData);
-    return offset * (1.0-((int(face)&1)*2.0));
-}
-
-vec2 getFaceSizeOffset(uint faceData, uint corner) {
-    float EPSILON = 0.001f;
-    vec4 faceOffsetsSizes = extractFaceSizes(faceData);
-    //Expand the quads by a very small amount
-    faceOffsetsSizes.xz -= vec2(EPSILON);
-    faceOffsetsSizes.yw += vec2(EPSILON);
-    return mix(faceOffsetsSizes.xz, faceOffsetsSizes.yw-1.0f, bvec2(((corner>>1)&1u)==1, (corner&1u)==1));
-}
-
-//TODO: add a mechanism so that some quads can ignore backface culling
-// this would help alot with stuff like crops as they would look kinda weird i think,
-// same with flowers etc
-void main() {
-    int cornerIdx = gl_VertexID&3;
-    Quad quad = quadData[uint(gl_VertexID)>>2];
-    vec3 innerPos = extractPos(quad);
-    uint face = extractFace(quad);
-    uint modelId = extractStateId(quad);
-    BlockModel model = modelData[modelId];
-    uint faceData = model.faceData[face];
-    bool isTranslucent = modelIsTranslucent(model);
-    bool hasAO = modelHasMipmaps(model);//TODO: replace with per face AO flag
-    bool isShaded = hasAO;//TODO: make this a per face flag
-    //Change the ordering due to backface culling
-    //NOTE: when rendering, backface culling is disabled as we simply dispatch calls for each face
-    // this has the advantage of having "unassigned" geometry, that is geometry where the backface isnt culled
-    //if (face == 0 || (face>>1 != 0 && (face&1)==1)) {
-    //    cornerIdx ^= 1;
-    //}
-
-    uint lodLevel = extractLodLevel();
-    ivec3 lodCorner = ((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5;
-    vec3 corner = innerPos * (1<<lodLevel) + lodCorner;
-
-    vec2 faceOffset = getFaceSizeOffset(faceData, cornerIdx);
-    ivec2 quadSize = extractSize(quad);
-    vec2 respectiveQuadSize = vec2(quadSize * ivec2((cornerIdx>>1)&1, cornerIdx&1));
-    vec2 size = (respectiveQuadSize + faceOffset) * (1<<lodLevel);
-
-    vec3 offset = vec3(size, (float(face&1u) + getDepthOffset(faceData, face)) * (1<<lodLevel));
-
-    if ((face>>1) == 0) { //Up/down
-        offset = offset.xzy;
-    }
-    //Not needed, here for readability
-    //if ((face>>1) == 1) {//north/south
-    //    offset = offset.xyz;
-    //}
-    if ((face>>1) == 2) { //west/east
-        offset = offset.zxy;
-    }
-
-    gl_Position = MVP * vec4(corner + offset, 1.0);
-
-
-    //Compute the uv coordinates
-    vec2 modelUV = vec2(modelId&0xFFu, (modelId>>8)&0xFFu)*(1.0/(256.0));
-    //TODO: make the face orientated by 2x3 so that division is not a integer div and modulo isnt needed
-    // as these are very slow ops
-    baseUV = modelUV + (vec2(face>>1, face&1u) * (1.0/(vec2(3.0, 2.0)*256.0)));
-    //TODO: add an option to scale the quad size by the lod level so that
-    // e.g. at lod level 2 a face will have 2x2
-    uv = respectiveQuadSize + faceOffset;//Add in the face offset for 0,0 uv
-
-    flags = faceHasAlphaCuttout(faceData);
-
-    //We need to have a conditional override based on if the model size is < a full face + quadSize > 1
-    flags |= uint(any(greaterThan(quadSize, ivec2(1)))) & faceHasAlphaCuttoutOverride(faceData);
-
-    flags |= uint(!modelHasMipmaps(model))<<1;
-
-    //Compute lighting
-    tinting = getLighting(extractLightId(quad));
-
-    //Apply model colour tinting
-    uint tintColour = model.colourTint;
-    if (modelHasBiomeLUT(model)) {
-        tintColour = colourData[tintColour + extractBiomeId(quad)];
-    }
-
-    conditionalTinting = vec4(0);
-    if (tintColour != uint(-1)) {
-        flags |= 1u<<2;
-        conditionalTinting = uint2vec4RGBA(tintColour).yzwx;
-    }
-
-    addin = vec4(0.0);
-    if (!isTranslucent) {
-        tinting.w = 0.0;
-        //Encode the face, the lod level and
-        uint encodedData = 0;
-        encodedData |= face;
-        encodedData |= (lodLevel<<3);
-        encodedData |= uint(hasAO)<<6;
-        addin.w = float(encodedData)/255.0;
-    }
-
-    //Apply face tint
-    if (isShaded) {
-        if ((face>>1) == 1) {
-            tinting.xyz *= 0.8f;
-        } else if ((face>>1) == 2) {
-            tinting.xyz *= 0.6f;
-        } else if (face == 0){
-            tinting.xyz *= 0.5f;
-        } else {
-            //TODO: FIXME: DONT HAVE SOME ARBITARY TINT LIKE THIS
-            tinting.xyz *= 0.95f;
-        }
-    }
-
-
-    //solidColour = vec4(vec3(modelId&0xFu, (modelId>>4)&0xFu, (modelId>>8)&0xFu)*(1f/15f),1f);
-}
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
@@ -6,13 +6,18 @@
 #import <voxy:lod/block_model.glsl>
 #line 8

+//#define DEBUG_RENDER
+
 layout(location = 0) out vec2 uv;
 layout(location = 1) out flat vec2 baseUV;
 layout(location = 2) out flat vec4 tinting;
 layout(location = 3) out flat vec4 addin;
 layout(location = 4) out flat uint flags;
 layout(location = 5) out flat vec4 conditionalTinting;
-//layout(location = 6) out flat vec4 solidColour;
+
+#ifdef DEBUG_RENDER
+layout(location = 6) out flat uint quadDebug;
+#endif

 uint extractLodLevel() {
    return uint(gl_BaseInstance)>>27;
@@ -143,4 +148,8 @@ void main() {

    vec3 origin = vec3(((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5);
    gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,0)))*(1<<lodLevel)+origin, 1.0);
+
+    #ifdef DEBUG_RENDER
+    quadDebug = lodLevel;
+    #endif
 }
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/Queue.glsl
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/Queue.glsl
@@ -0,0 +1,20 @@
+//Use defines and undefines to define the queue, allows for hacky reuse of imports
+#ifndef QUEUE_NAME
+#error QUEUE_NAME is not defined
+#endif
+
+
+
+
+void push(queue, item) {
+    //NOTE(review): placeholder - the parameters have no types, so unless queue/item are macros supplied by the including file this is not valid GLSL yet; presumably meant to be specialised per QUEUE_NAME - confirm
+}
+
+
+
+
+
+
+
+
+#
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/selector.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/selector.comp
@@ -0,0 +1,262 @@
+#version 460 core
+
+#define WORKGROUP 4
+#define MINI_BATCH_SIZE 32
+//The entire uint is a minibatch (each idx is one)
+#define MINI_BATCH_MSK (uint(-1))
+
+//Each y dim is an octant in the octree
+// multiple x dims to fill up workgroups
+layout(local_size_x=WORKGROUP, local_size_y=8) in;
+
+layout(binding = 1, std430) restrict buffer RequestSectionLoadQueue {
+    uint counter;
+    uint[] queue;
+} requestQueue;
+
+//SectionNodeData is a uvec4 that contains the position + flags + ptr to own render section data + ptr to children
+layout(binding = 2, std430) restrict readonly buffer SectionNodeData {
+    uvec4[] sectionNodes;
+};
+
+layout(binding = 3, std430) restrict buffer ActiveWorkingNodeQueue {
+    uint feedbackStatus;
+    uint batchIndex;
+    uint end;
+    uint start;
+    uint maxSize;//Needs to be a multiple of local_size_x
+    uint[] queue;
+} nodeQueue;
+
+
+struct UnpackedNode {
+    ivec4 position;//x,y,z,detail
+    uint  flags;//16 bits
+    uint  self;
+    uint  children;
+};
+
+UnpackedNode unpackNode(uvec4 data) {
+    UnpackedNode node;
+    //TODO: not implemented - 'data' is never decoded, so the fields of the returned node are uninitialised
+    return node;
+}
+
+//NOTE: this is different to nanite in the fact that if a node is not loaded, too bad dont render
+
+shared UnpackedNode workingNodes[WORKGROUP];
+shared uint miniBatchMsk;
+void loadNode() {
+    if (gl_LocalInvocationIndex == 0) {//Only the first invocation of the workgroup runs this
+        batchMsk = 0;//Reset the minibatch. NOTE(review): batchMsk is not declared in this file, the shared variable is miniBatchMsk - confirm which was meant
+        if (miniBatchMsk == MINI_BATCH_SIZE) {
+            //TODO: empty for now, presumably where a fresh minibatch gets acquired - confirm
+        }
+    }
+    barrier();
+    if (gl_LocalInvocationID.y == 0) {
+        //One invocation per x index (the y == 0 row) unpacks a node into the shared workingNodes array
+        //NOTE(review): 'id' is not declared anywhere in this file yet
+        //Need to make it work in y size 8, but only gl_LocalInvocationId.x == 0
+        workingNodes[gl_LocalInvocationID.x] = unpackNode(sectionNodes[id]);
+    }
+    barrier();//Synchronize, also acts as memory barrier
+}
+
+
+
+//Computes screensize of the node and whether it should render itself or its children
+bool shouldRenderChildren(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+//Process a single node and enqueue child nodes if needed into work queue, enqueue self to render and/or request children to load
+void processNode(uint id) {//Called even if it doesnt have any work (id==-1) to ensure uniform control flow for barriers
+    //NOTE(review): a second processNode(uint) is defined further down this file, only one of the two can stay
+    //Bottom 2 bits are status flags, is air and children loaded
+    // node.flags
+
+    //If the childrenloaded flag is not set, send a request for the children of the node to be loaded
+    // if all the children are loaded but we are not and we need to render, render the children and dispatch
+    // a request to load self
+    //NOTE(review): 'node' is not declared in this function and 'id' is never used
+    if (shouldRenderChildren(node)) {
+        //Dont care about
+    } else {
+        //TODO: empty branch, nothing happens yet when the node should render itself
+    }
+
+}
+
+
+//The actively scheduled/acquired work slot for this group
+shared uint workingBatchIndex;
+shared uint workingBatchOffset;
+void process() {
+    if (gl_LocalInvocationIndex == 0) {//This includes both x and y
+        workingBatchIndex = atomicAdd(nodeQueue.batchIndex, BATCH_SIZE);//NOTE(review): BATCH_SIZE is not defined in this file (only MINI_BATCH_SIZE is)
+    }
+}
+
+
+
+void main() {
+    while (true) {
+        barrier();
+        //NOTE(review): the loop has no exit condition and does no work yet; a second main() is also defined further down this file
+    }
+}
+
+
+
+
+//when a node is processed,
+// its screen bounding box is computed using a fast trick (e.g. if you are viewing it from a quadrant you already know its bounding points (min/max))
+// frustum cull, check hiz
+// if it passes culling, use the screensize to check whether it must render itself
+// or dispatch its children to render
+//      IF its error is small enough, then render itself, its mesh should always be loaded, if not its a critical error (except maybe if its a top level node or something)
+//      if its error is too large,
+//          check that all children are loaded (or empty), if they are not all loaded, enqueue a request for the cpu to load
+//          that nodes children
+//              if the load queue is full, dont enqueue it to the queue
+//          then instead of rendering children, render its own mesh since it should always be loaded
+
+//Can also reverse the above slightly and make it so that it checks the children before enqueuing them
+
+
+//the main thing to worry about is if there is enough work to fill the initial few rounds of this
+// before amplification takes effect
+// can do a thing where it initially just blasts child nodes out until the size is small enough
+
+
+
+// NOTE: since matrix multiplication distributes over addition
+//  can precompute the AABB corners with respect to the matrix
+//  then you can just add a translation vector
+
+
+
+
+
+
+
+//TODO: can do in another way
+// first compute the sections that should either render self or children
+// then as a separate job queue work through it
+
+
+
+
+
+
+
+
+
+
+
+uint getChildCount(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+
+//Checks whether a node should be culled based on hiz/frustum
+bool cullNode(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+//Should render this node, or recurse to children
+bool shouldRenderChildrenInstead(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+//Does the node have its own mesh loaded
+bool nodeHasSelfMesh(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+//Does the node have its children loaded (note! not child meshes)
+bool nodeHasChildrenLoaded(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+//Are all the childrens meshes loaded
+bool nodeHasChildMeshesLoaded(UnpackedNode node) {
+    //TODO: not implemented - there is no return statement, so the result is undefined
+}
+
+void request(uint type, uint idx) {
+    //TODO: not implemented - currently a no-op, 'type' and 'idx' are unused
+}
+
+void renderMesh(uint idx) {
+    //TODO: not implemented - currently a no-op, 'idx' is unused
+}
+
+void enqueueChildren(uint arg, UnpackedNode node) {
+    uint cnt = getChildCount(node);
+    //TODO: not implemented yet - 'cnt' and 'arg' are currently unused.
+    // The queue needs 2 counters, a pre and a post atomic:
+    // pre is incremented to get the index, then the queue is written to,
+    // post is then incremented to signal
+}
+
+void reportCritical(uint type) {
+    //TODO: not implemented - currently a no-op, 'type' is unused
+}
+
+void processNode(uint idx) {//NOTE(review): same signature as the processNode(uint id) declared earlier in this file, only one of the two can stay
+    UnpackedNode node = unpackNode(sectionNodes[idx]);
+    if (!cullNode(node)) {
+        //Should we render children instead of ourselves with respect to screenspace error
+        if (shouldRenderChildrenInstead(node)) {
+            if (nodeHasChildrenLoaded(node)) {
+                //Dispatch the child nodes to the queue
+                enqueueChildren(0, node);
+            } else {
+                //Children aren't loaded, so either render our own mesh or, if we can't,
+                // report a critical error; either way the children still need loading
+                if (nodeHasSelfMesh(node)) {
+                    //Render self and dispatch request to load children
+                    renderMesh(node.self);
+                    request(1, idx);
+                } else {
+                    //Critical issue, no children are loaded and self has no mesh
+                    reportCritical(0);
+                }
+            }
+        } else {
+            if (nodeHasSelfMesh(node)) {
+                //Render self
+                renderMesh(node.self);
+            } else {
+                //Request that self mesh is loaded
+                request(0, idx);
+
+                //Render children instead
+                if (nodeHasChildrenLoaded(node)) {//Might need to be nodeHasChildMeshesLoaded
+                    enqueueChildren(1, node);
+                } else {
+                    //This is very bad, it means we can't render anything
+                    reportCritical(1);
+                }
+            }
+        }
+    }
+}
+
+//Pseudo code, one thread, one load
+void main() {
+    while (true) {
+        //Try to process a node queue entry
+        uint work = atomicAdd(workingNodeQueuePos, 1);
+        uint idx = work&0xFFFFFFu;//Low 24 bits of the value returned by the atomicAdd
+        uint arg = work>>24;//Remaining high 8 bits
+        if (idx < workingNodeQueueEnd) {
+            //TODO: empty for now, presumably processNode(idx) is meant to be called here - confirm
+            //NOTE(review): workingNodeQueuePos and workingNodeQueueEnd are not declared in this file, 'arg' is unused and the loop never exits
+        } else {
+            //Do other queue work however we still have the work slot allocated
+        }
+    }
+}