WIP

2024-06-11 23:06:24 +10:00
parent 584028ae7a
commit 25ddb83d22
18 changed files with 882 additions and 158 deletions
--- a/src/main/java/me/cortex/voxy/client/core/gl/GlPersistentMappedBuffer.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/GlPersistentMappedBuffer.java
@@ -14,7 +14,7 @@ public class GlPersistentMappedBuffer extends TrackedObject {
        this.id = glCreateBuffers();
        this.size = size;
        glNamedBufferStorage(this.id, size, GL_CLIENT_STORAGE_BIT|GL_MAP_PERSISTENT_BIT|(flags&(GL_MAP_COHERENT_BIT|GL_MAP_WRITE_BIT|GL_MAP_READ_BIT)));
-        this.addr = nglMapNamedBufferRange(this.id, 0, size, flags|GL_MAP_PERSISTENT_BIT);
+        this.addr = nglMapNamedBufferRange(this.id, 0, size, (flags&(GL_MAP_WRITE_BIT|GL_MAP_READ_BIT|GL_MAP_UNSYNCHRONIZED_BIT|GL_MAP_FLUSH_EXPLICIT_BIT))|GL_MAP_PERSISTENT_BIT);
    }
    @Override
--- a/src/main/java/me/cortex/voxy/client/core/gl/shader/GenericsProcessor.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/GenericsProcessor.java
@@ -0,0 +1,12 @@
 package me.cortex.voxy.client.core.gl.shader;
 import java.util.regex.Pattern;
 public class GenericsProcessor implements IShaderProcessor {
    //Matches a generic definition directive written as "#defineGen Name<Generic>" (a single space after the directive),
    // the alphanumeric name is captured in group "name" and the (possibly empty) alphanumeric argument in group "generic"
    private static final Pattern GENERIC_DEFINE = Pattern.compile("#defineGen (?<name>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");
    //Matches any text written as "Type<Generic>", capturing the alphanumeric type in group "type"
    // and the (possibly empty) alphanumeric argument in group "generic"
    private static final Pattern GENERIC_USE = Pattern.compile("(?<type>[A-Za-z0-9]+)<(?<generic>[A-Za-z0-9]*)>");
    //NOTE(review): not implemented yet, neither pattern is used and null is returned for every input.
    // Presumably callers expect the processed source text back, confirm before adding this processor to a shader
    @Override
    public String process(ShaderType type, String source) {
        return null;
    }
 }
--- a/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java
+++ b/src/main/java/me/cortex/voxy/client/core/gl/shader/Shader.java
@@ -3,8 +3,7 @@ package me.cortex.voxy.client.core.gl.shader;
 import me.cortex.voxy.common.util.TrackedObject;
 import org.lwjgl.opengl.GL20C;
-import java.util.HashMap;
+import java.util.*;
 import java.util.Map;
 import java.util.stream.Collectors;
 import static org.lwjgl.opengl.GL20.glDeleteProgram;
@@ -16,8 +15,15 @@ public class Shader extends TrackedObject {
        id = program;
    }
-    public static Builder make(IShaderProcessor processor) {
+    public static Builder make(IShaderProcessor... processors) {
-        return new Builder(processor);
+        List<IShaderProcessor> aa = new ArrayList<>(List.of(processors));
        Collections.reverse(aa);
        IShaderProcessor applicator = (type,source)->source;
        for (IShaderProcessor processor : processors) {
            IShaderProcessor finalApplicator = applicator;
            applicator = (type, source) -> finalApplicator.process(type, processor.process(type, source));
        }
        return new Builder(applicator);
    }
    public static Builder make() {
--- a/src/main/java/me/cortex/voxy/client/core/rendering/HierarchicalOcclusionRenderer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/HierarchicalOcclusionRenderer.java
@@ -0,0 +1,50 @@
 package me.cortex.voxy.client.core.rendering;
 import me.cortex.voxy.client.core.gl.GlBuffer;
 import me.cortex.voxy.client.core.gl.shader.Shader;
 import me.cortex.voxy.client.core.gl.shader.ShaderType;
 import me.cortex.voxy.client.core.rendering.hierarchical.NodeManager;
 import me.cortex.voxy.common.util.HierarchicalBitSet;
 import static org.lwjgl.opengl.GL42C.*;
 import static org.lwjgl.opengl.GL43C.GL_SHADER_STORAGE_BARRIER_BIT;
 import static org.lwjgl.opengl.GL43C.glDispatchCompute;
 import static org.lwjgl.opengl.GL44.GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT;
 public class HierarchicalOcclusionRenderer {
    //Number of compute workgroups dispatched along x every frame, should be enough to saturate the gpu efficiently
    private final int dispatchGroupsX;
    private final NodeManager nodeManager = new NodeManager(null);
    private final HiZBuffer hiz = new HiZBuffer();
    //Compute shader that performs the hierarchical node selection
    private final Shader selectorShader = Shader.make()
            .add(ShaderType.COMPUTE, "voxy:lod/hierarchical/selector.comp")
            .compile();

    /**
     * @param workgroup_size number of workgroups to dispatch along the x axis on every {@link #render} call
     */
    public HierarchicalOcclusionRenderer(int workgroup_size) {
        this.dispatchGroupsX = workgroup_size;
    }

    //Resource binding hook, nothing is bound yet
    private void bind() {
    }

    /**
     * Runs one frame of the hierarchical selection: builds the hiz mip chain from the depth buffer,
     * runs the node manager upload and download phases, then dispatches the selection compute shader
     * surrounded by memory barriers.
     *
     * @param depthBuffer depth buffer handed to the hiz mip chain builder
     * @param width       width handed to the hiz mip chain builder
     * @param height      height handed to the hiz mip chain builder
     */
    public void render(int depthBuffer, int width, int height) {
        //Make hiz
        this.hiz.buildMipChain(depthBuffer, width, height);

        //Node upload phase, followed by the node download phase
        // (pulls from previous frame (should maybe result in lower latency)) which also clears and resets the queues
        this.nodeManager.uploadPhase();
        this.nodeManager.downloadPhase();

        //Bind all the resources, then the hierarchical selection shader
        this.bind();
        this.selectorShader.bind();

        //Barrier issued before the selection shader runs
        final int beforeDispatch = GL_SHADER_STORAGE_BARRIER_BIT
                | GL_UNIFORM_BARRIER_BIT
                | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT
                | GL_FRAMEBUFFER_BARRIER_BIT;
        glMemoryBarrier(beforeDispatch);

        //Emit enough work to fully populate the gpu
        glDispatchCompute(this.dispatchGroupsX, 1, 1);

        //Barrier issued after the selection shader has been dispatched
        final int afterDispatch = GL_SHADER_STORAGE_BARRIER_BIT
                | GL_COMMAND_BARRIER_BIT
                | GL_UNIFORM_BARRIER_BIT;
        glMemoryBarrier(afterDispatch);
    }
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/INodeInteractor.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/INodeInteractor.java
@@ -0,0 +1,15 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 import me.cortex.voxy.client.core.rendering.building.BuiltSection;
 import java.util.function.Consumer;
 //Interface for node manager to interact with the outside world
 public interface INodeInteractor {
    /**
     * Marks pos as watching for updates, i.e. any LoD updates will trigger a callback.
     *
     * @param pos position to start watching
     */
    void watchUpdates(long pos);

    /**
     * Unmarks a position for updates.
     *
     * @param pos position to stop watching
     */
    void unwatchUpdates(long pos);

    /**
     * Explicitly requests a mesh at a position and runs the callback
     * (presumably the one registered through {@link #setMeshUpdateCallback} - confirm with the implementation).
     *
     * @param pos position to request a mesh for
     */
    void requestMesh(long pos);

    /**
     * Sets the callback that receives built meshes.
     *
     * @param mesh consumer invoked with each {@link BuiltSection}
     */
    void setMeshUpdateCallback(Consumer<BuiltSection> mesh);
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/ITrimInterface.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/ITrimInterface.java
@@ -0,0 +1,12 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 public interface ITrimInterface {
    /**
     * Last recorded/known use time of a node's mesh.
     *
     * @param node node id
     * @return the last use time, or -1 if the node doesn't have a mesh
     */
    int lastUsedTime(int node);

    /**
     * Packed child information of a node.
     * NOTE(review): the exact layout is not settled yet, the current intent is the bottom 24 bits
     * being the child ptr and the top 8 bits being a count or similar.
     *
     * @param node node id
     * @return the packed child value
     */
    int getChildren(int node);

    /**
     * Size of a node's mesh.
     *
     * @param node node id
     * @return the mesh size, or -1 if the node doesn't have a mesh
     */
    int getNodeSize(int node);
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeLoadSystem.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeLoadSystem.java
@@ -0,0 +1,5 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 //Intended to use a persistently mapped coherent buffer with off thread polling to pull in requests
 // (placeholder, nothing is implemented yet)
 public class NodeLoadSystem {
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager.java
@@ -0,0 +1,219 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import me.cortex.voxy.client.core.gl.GlBuffer;
 import me.cortex.voxy.client.core.rendering.building.BuiltSection;
 import me.cortex.voxy.client.core.rendering.util.DownloadStream;
 import me.cortex.voxy.common.util.HierarchicalBitSet;
 import org.lwjgl.system.MemoryUtil;
 import static org.lwjgl.opengl.GL11.GL_UNSIGNED_INT;
 import static org.lwjgl.opengl.GL30.GL_R32UI;
 import static org.lwjgl.opengl.GL30C.GL_RED_INTEGER;
 import static org.lwjgl.opengl.GL45.nglClearNamedBufferSubData;
 public class NodeManager {
    public static final int MAX_NODE_COUNT = 1<<22;
    public static final int MAX_REQUESTS = 1024;
    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(MAX_NODE_COUNT);
    private final GlBuffer nodeBuffer = new GlBuffer(MAX_NODE_COUNT*16);//Node size is 16 bytes
    //TODO: maybe make this a coherent persistent mapped read buffer, instead of download synced buffer copy thing
    //a request payload is a single uint, the top 8 bits are flags and the bottom 24 bits are the node identifier
    // (e.g. load child nodes, load child nodes + meshs, load self meshes )
    //Queue layout: a 4 byte request counter followed by MAX_REQUESTS 4 byte request payloads
    private static final int REQUEST_QUEUE_SIZE = 4 + MAX_REQUESTS * 4;//TODO: add a priority system
    private final GlBuffer requestQueue = new GlBuffer(REQUEST_QUEUE_SIZE);
    //Buffer containing the index of the root nodes
    private final GlBuffer roots = new GlBuffer(1024*4);
    //500mb TODO: SEE IF CAN SHRINK IT BY EITHER NOT NEEDING AS MUCH SPACE or reducing max node count
    // (as written this array alone is 3 * 2^22 longs, i.e. roughly 96MiB)
    private final long[] localNodes = new long[MAX_NODE_COUNT * 3];//1.5x the size of the gpu copy to store extra metadata
    //LocalNodes have an up value pointing to the parent, enabling full traversal
    private final INodeInteractor interactor;

    /**
     * @param interactor bridge used to watch positions and request meshes from the outside world
     */
    public NodeManager(INodeInteractor interactor) {
        this.interactor = interactor;
        this.pos2meshId.defaultReturnValue(NO_NODE);
    }

    //Returns true if it has its own mesh loaded
    //NOTE: not implemented yet, always reports that no mesh is loaded
    private static boolean nodeHasMeshLoaded(long metaA, long metaB) {
        return false;
    }

    private static final int REQUEST_SELF = 0;
    private static final int REQUEST_CHILDREN = 1;

    //A node can be loaded in the tree but have no mesh associated with it
    // this is so that higher level nodes dont waste mesh space
    //The reason that nodes have both child and own mesh pointers
    // is so that on an edge of the screen or when moving, nodes arnt constantly being swapped back and forth
    // it basicly acts as an inline cache :tm: however it does present some painpoints
    // especially in managing the graph
    //It might be easier to have nodes strictly either point to child nodes or meshes
    // if a parent needs to be rendered instead of the child, request for node change to self
    // while this will generate a shitton more requests it should be alot easier to manage graph wise
    // can probably add a caching service via a compute shader that ingests a request list
    // sees if the requested nodes are already cached, if so swap them in, otherwise dispatch a request
    // to cpu

    /**
     * Processes a downloaded copy of the gpu request queue.
     * The memory at {@code ptr} starts with a 4 byte request counter followed by one 4 byte payload per request,
     * each payload holds the flags in its top 8 bits and the node id in its bottom 24 bits.
     *
     * @param ptr  native address of the downloaded queue
     * @param size number of downloaded bytes at {@code ptr}
     * @throws IllegalStateException    if a request references a node outside the node table, or requests a
     *                                  mesh for a node that already has one loaded
     * @throws IllegalArgumentException if a request carries an unknown request type
     */
    private void processRequestQueue(long ptr, long size) {
        //The counter is written by the gpu, so never read more payloads than were actually downloaded
        long capacity = Math.max(0, Math.min(MAX_REQUESTS, (size - 4) / 4));
        long reported = Integer.toUnsignedLong(MemoryUtil.memGetInt(ptr)); ptr += 4;
        int count = (int) Math.min(reported, capacity);
        for (int i = 0; i < count; i++) {
            int request = MemoryUtil.memGetInt(ptr + i*4L);
            //Flags live in the top 8 bits and must be shifted down before the type bits can be read,
            // masking without the shift would make every request look like REQUEST_SELF
            int args = request>>>24;
            int nodeId = request&(0xFFFFFF);
            if (nodeId >= MAX_NODE_COUNT) {
                //A 24 bit id can address more nodes than the local table holds
                throw new IllegalStateException("Request references node " + nodeId + " outside of the node table (max " + MAX_NODE_COUNT + ")");
            }

            long pos = this.localNodes[nodeId*3];
            long metaA = this.localNodes[nodeId*3 + 1];
            long metaB = this.localNodes[nodeId*3 + 2];

            int type = args&0b11;//2 bits for future request types such as parent and ensure stable (i.e. both parent and child loaded)
            if (type == REQUEST_SELF) {
                //Requires own mesh loaded (it can have 2 different priorites, it can fallback to using its children to render if they are loaded)
                // else it is critical priority
                if (nodeHasMeshLoaded(metaA, metaB)) {
                    throw new IllegalStateException("Node requested a mesh load, but mesh is already loaded: " + pos);
                }

                //watch the mesh and request it
                this.interactor.watchUpdates(pos);
                this.interactor.requestMesh(pos);
            } else if (type == REQUEST_CHILDREN) {
                //Node requires children to be loaded NOTE: when this is the case, it doesnt just mean the nodes,
                // it means the meshes aswell,
                // meshes may be unloaded later

                //when this case is hit it means that the child nodes arnt even loaded, so it becomes a bit more complex
                // basicly, need to request all child nodes be loaded in a batch
                // then in the upload tick need to do update many things
            } else {
                throw new IllegalArgumentException("Unknown update type: " + type + " @pos:" + pos);
            }
        }
    }

    /**
     * Upload half of the per frame update, currently a placeholder that performs no work.
     */
    public void uploadPhase() {
        //All uploads
        //Have a set of upload tasks for nodes,
        // this could include updating the mesh ptr
        // or child ptr or uploading new nodes
        // NOTE: when uploading a set of new nodes (must be clustered as children)
        // have to update parent
        // same when removing a set of children
        //Note: child node upload tasks need to all be complete before they can be uploaded

        //The way the graph works and can be cut is that all the leaf nodes _must_ at all times contain a mesh
        // this is critical to prevent "cracks"/no geometry being rendered
        // when the render mesh buffer is "full" (or even just periodicly), trimming of the tree must occur to keep
        // size within reason
        //Note tho that there is a feedback delay and such so geometry buffer should probably be trimmed when it reaches
        // 80-90% capacity so that new geometry can still be uploaded without being blocked on geometry clearing
        // it becomes a critical error if the geometry buffer becomes full while the tree is fully trimmed

        //NOTE: while trimming the tree, need to also trim the parents down i.e. the top level should really not have its mesh
        // loaded while it isnt really ever used
        // however as long as the rule that all leaf nodes have a mesh loaded is held then there should never be
        // any geometry holes
    }

    /**
     * Download half of the per frame update: queues a download of the whole request queue
     * (handled by {@link #processRequestQueue}), commits the download stream and then zeroes the
     * 4 byte request counter at the start of the queue buffer.
     */
    //Download and upload point, called once per frame
    public void downloadPhase() {
        DownloadStream.INSTANCE.download(this.requestQueue, 0, REQUEST_QUEUE_SIZE, this::processRequestQueue);
        DownloadStream.INSTANCE.commit();
        //Clear the queue counter, TODO: maybe do it some other way to batch clears
        // (a null data pointer makes the clear fill the range with zeros)
        nglClearNamedBufferSubData(this.requestQueue.id, GL_R32UI, 0, 4, GL_RED_INTEGER, GL_UNSIGNED_INT, 0);

        //TODO: compute cleanup here of loaded nodes, and what needs to be uploaded
        // i.e. if there is more upload stuff than there is free memory, cull nodes in the tree
        // to fit upload points, can also create errors if all nodes in the tree are requested but no memory to put
    }

    //Inserts a top level node into the graph, it has geometry and no children loaded as it is a leaf node
    // (placeholder, not implemented yet)
    public void insertTopLevelNode(long position) {
    }

    //Removes a top level node from the graph, doing so also removes all child nodes and associate geometry
    // the allocated slots when removing nodes are stored and roped off until it is guarenteed that all requests have
    // passed
    // (placeholder, not implemented yet)
    public void removeTopLevelNode(long position) {
    }

    //Tracking for nodes that specifically need meshes, if a node doesnt have or doesnt need a mesh node, it is not in the map
    // the map should be identical to the currently watched set of sections
    //NOTE: that if the id is negative its part of a mesh request
    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();
    private static final int NO_NODE = -1;

    //Need to make this system attatched with a batched worker system, since a mesh update can be a few things
    // it can be a mesh update of a tracked render section, in this case we must ensure that it is still tracked and hasnt been removed bla bla bla
    //   if its still valid and tracked then upload it and update the node aswell ensuring sync bla bla bla
    // if it was part of a request, then we need to first check that the request still exists and hasnt been discarded  B) probably upload it immediatly still
    //   B) set the request with that section to have been, well, uploaded and the mesh set, (note if the mesh was updated while a request was inprogress/other requests not fufilled, need to remove the old and replace with the updated)
    //      if all the meshes in the request are satisfied, upload the request nodes and update its parent
    // NOTE! batch requests where this is needed are only strictly required when children are requested in order to guarentee that all
    //      propertiy of leaf nodes must have meshes remains
    //(TODO: see when sync with main thread should be, in the renderer or here since the updates are dispatched offthread)

    // Note that the geometry buffer should have idk 20% free? that way meshes can always be inserted (same for the node buffer ig) maybe 10%? idk need to experiement
    //  if the buffer goes over this threshold, the tree/graph culler must start culling last/least used nodes somehow
    //  it should be an error if the geometry or node buffer fills up but there are no nodes/meshes to cull/remove

    /**
     * Receives a freshly built mesh. A mesh whose position is not tracked in {@code pos2meshId} is freed
     * immediately, handling of tracked meshes is not implemented yet.
     *
     * @param mesh the built section mesh
     */
    public void meshUpdate(BuiltSection mesh) {
        int id = this.pos2meshId.get(mesh.position);
        if (id == NO_NODE) {
            //The built mesh section is no longer needed, discard it
            // TODO: could probably?? cache the mesh in ram that way if its requested? it can be immediatly fetched while a newer mesh is built??
            mesh.free();
            return;
        }

        if ((id&(1<<31))!=0) {
            //The mesh is part of a batched request, strip the marker bit to get the request id
            id = id^(1<<31);

        } else {
            //The mesh is an update for an existing node

            //this.localNodes[id*3]
        }
    }

    //A node has a position    (64 bit)
    // a ptr to its own mesh   (24 bit)
    // a ptr to children nodes (24 bit)
    // flags                   (16 bit)
    //                         Total of 128 bits (16 bytes)

    //First 2 flag bits are a requested dispatch type (0 meaning no request and the 3 remaining states for different request types)
    // this ensures that over multiple frames the same node is not requested

    //Bits exist for whether or not the children have meshes loaded or if the parents have meshes loaded
    // the idea is to keep +-1 lod meshes loaded into vram to enable seemless transitioning
    // the only critical state is that if a mesh wants to be rendered it should be able to be rendered

    //Basicly, there are multiple things, it depends on the screensize error
    // if a node is close to needing its children loaded but they arnt, then request it but with a lower priority
    // if a node must need its children then request at a high prioirty
    // if a node doesnt have a mesh but all its children do than dispatch a medium priority to have its own mesh loaded
    //      but then just use the child meshes for rendering
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager2.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/NodeManager2.java
@@ -0,0 +1,103 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap;
 import me.cortex.voxy.client.core.rendering.building.BuiltSection;
 import me.cortex.voxy.client.core.rendering.util.MarkedObjectList;
 public class NodeManager2 {
    /** A pending request for creating new child nodes of a node. */
    private static final class LeafRequest {
        /** LoD position identifier. */
        public long position;

        /**
         * Node id of the node the leaf request is for.
         * Note! While there is a leaf request, the node should not be unloaded or removed.
         */
        public int nodeId;

        /** The mask of what child nodes are required. */
        public byte requiredChildMask;

        /** The mask of currently supplied child node data. */
        public byte currentChildMask;

        /** Reset/clear the request so that it may be reused (currently does nothing). */
        public void clear() {
        }
    }

    public static final int MAX_NODE_COUNT = 1<<22;

    private static final int NO_NODE = -1;

    //Marker bit set on ids stored in pos2meshId when the id refers to a mesh request instead of a node
    private static final int REQUEST_MARKER = 1<<31;

    //Local data layout
    // first long is position (todo! might not be needed)
    // next long contains mesh position ig/id
    private final long[] localNodeData = new long[MAX_NODE_COUNT * 3];

    private final INodeInteractor interactor;

    //Tracking for nodes that specifically need meshes, if a node doesnt have or doesnt need a mesh node, it is not in the map
    // the map should be identical to the currently watched set of sections
    //NOTE: that if the id is negative its part of a mesh request
    private final Long2IntOpenHashMap pos2meshId = new Long2IntOpenHashMap();

    //The request queue should be like some array that can reuse objects to prevent gc nightmare + like a bitset to find an avalible free slot
    // hashmap might work bar the gc overhead
    private final MarkedObjectList<LeafRequest> leafRequests = new MarkedObjectList<>(LeafRequest[]::new, LeafRequest::new);

    /**
     * @param interactor bridge to the outside world, this manager registers itself as its mesh update callback
     */
    public NodeManager2(INodeInteractor interactor) {
        this.interactor = interactor;
        this.pos2meshId.defaultReturnValue(NO_NODE);
        this.interactor.setMeshUpdateCallback(this::meshUpdate);
    }

    /** Placeholder, not implemented yet. */
    public void insertTopLevelNode(long position) {
    }

    /** Placeholder, not implemented yet. */
    public void removeTopLevelNode(long position) {
    }

    /**
     * Returns the mesh offset/id for the given node or -1 if it doesnt exist.
     * Not implemented yet, always answers -1.
     */
    private int getMeshForNode(int node) {
        return -1;
    }

    /**
     * Callback for freshly built meshes. Meshes for untracked positions are freed right away,
     * tracked meshes are routed to either the pending leaf request or the existing node they belong to.
     */
    private void meshUpdate(BuiltSection mesh) {
        final int tracked = this.pos2meshId.get(mesh.position);
        if (tracked == NO_NODE) {
            //The built mesh section is no longer needed, discard it
            // TODO: could probably?? cache the mesh in ram that way if its requested? it can be immediatly fetched while a newer mesh is built??
            mesh.free();
            return;
        }

        final boolean belongsToRequest = (tracked & REQUEST_MARKER) != 0;
        if (!belongsToRequest) {
            //The mesh is an update for an existing node
            final int previousMesh = this.getMeshForNode(tracked);
            if (previousMesh == -1) {
                //Node didnt have a mesh attached, so just set the current mesh
            } else {
                //Node has a mesh attached, remove and replace it
            }
            return;
        }

        //The mesh is part of a batched request, drop the marker bit to recover the request id
        final int requestId = tracked ^ REQUEST_MARKER;

        //There are a few cases for this branch
        // the section could be replacing an existing mesh that is part of the request (due to an update)
        // the section mesh could be new to the request
        //  in this case the section mesh could be the last entry needed to satisfy the request
        //      in which case! we must either A) mark the request as ready to be uploaded
        //      and then uploaded after all the mesh updates are processed, or upload it immediately

        //The lower 3 bits of the id specify the quadrant (8 pos) of the node in the request
        LeafRequest pending = this.leafRequests.get(requestId >> 3);
    }
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/TreeTrimmer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/hierarchical/TreeTrimmer.java
@@ -0,0 +1,23 @@
 package me.cortex.voxy.client.core.rendering.hierarchical;
 //System to determine what nodes to remove from the hierarchical tree while retaining the property that all
 // leaf nodes should have meshes
 //This system is critical to prevent the geometry buffer from growing too large or too many nodes from filling up
 // the node system
 public class TreeTrimmer {
    //Used to interact with the outside world
    private final ITrimInterface trimInterface;
    //Stores the interface that is used to query node information
    public TreeTrimmer(ITrimInterface trimInterface) {
        this.trimInterface = trimInterface;
    }
    //Placeholder for computing which nodes to trim, no work is performed yet
    public void computeTrimPoints() {
        //Do a bfs to find ending points to trim, this needs to be based on some "last used" metric
        //First strategy is to compute a bfs and/or generate a list of nodes sorted by last use time
        // the thing is that if we cull a mesh, it cannot be a leaf node
        // if it is a leaf node its parent node must have a mesh loaded
    }
 }
--- a/src/main/java/me/cortex/voxy/client/core/rendering/util/DownloadStream.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/util/DownloadStream.java
@@ -84,7 +84,7 @@ public class DownloadStream {
        for (var entry : this.downloadList) {
            glCopyNamedBufferSubData(entry.target.id, this.downloadBuffer.id, entry.targetOffset, entry.downloadStreamOffset, entry.size);
        }
-        thisFrameDownloadList.addAll(this.downloadList);
+        this.thisFrameDownloadList.addAll(this.downloadList);
        this.downloadList.clear();
        this.caddr = -1;
--- a/src/main/java/me/cortex/voxy/client/core/rendering/util/MarkedObjectList.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/util/MarkedObjectList.java
@@ -0,0 +1,51 @@
 package me.cortex.voxy.client.core.rendering.util;
 import it.unimi.dsi.fastutil.ints.Int2ObjectFunction;
 import me.cortex.voxy.common.util.HierarchicalBitSet;
 import java.util.function.Supplier;
 /**
  * A growable list of reusable objects addressed by integer ids.
  * Ids are handed out by a {@link HierarchicalBitSet}; objects are created lazily through the supplier the
  * first time their slot is allocated and stay in the backing array after release so they can be reused.
  *
  * @param <T> type of the stored objects
  */
 public class MarkedObjectList<T> {
    private static final float GROWTH_FACTOR = 0.75f;
    //Explicit capacity of the id allocator, 64^4 is the largest limit HierarchicalBitSet accepts
    private static final int MAX_ENTRIES = 1<<(6*4);
    private final Int2ObjectFunction<T[]> arrayGenerator;
    private final Supplier<T> nullSupplier;
    private final HierarchicalBitSet bitSet = new HierarchicalBitSet(MAX_ENTRIES);
    private T[] objects;//Should maybe make a getter function instead

    /**
     * @param arrayGenerator creates a backing array of the requested length
     * @param nullSupplier   creates the object for a slot that has never been filled
     */
    public MarkedObjectList(Int2ObjectFunction<T[]> arrayGenerator, Supplier<T> nullSupplier) {
        this.arrayGenerator = arrayGenerator;
        this.nullSupplier = nullSupplier;
        this.objects = this.arrayGenerator.apply(16);
    }

    /**
     * Reserves an unused id, growing the backing array and creating the slot's object when needed.
     *
     * @return the allocated id
     * @throws IllegalStateException if the id allocator has no free id left
     */
    public int allocate() {
        //Gets an unused id for some entry in objects, if its null fill it
        int id = this.bitSet.allocateNext();
        if (id < 0) {
            //Negative values are failure codes of the allocator and must never be used as an array index
            throw new IllegalStateException("Unable to allocate a new entry, allocator returned " + id);
        }
        if (this.objects.length <= id) {
            //Resize and copy over the objects array, always growing far enough to contain the new id
            int grownLen = this.objects.length + (int)Math.ceil(this.objects.length*GROWTH_FACTOR);
            int newLen = Math.max(id + 1, grownLen);
            T[] newArr = this.arrayGenerator.apply(newLen);
            System.arraycopy(this.objects, 0, newArr, 0, this.objects.length);
            this.objects = newArr;
        }
        if (this.objects[id] == null) {
            this.objects[id] = this.nullSupplier.get();
        }
        return id;
    }

    /**
     * Returns an id to the allocator, the object stays in its slot for later reuse.
     *
     * @param id a previously allocated id
     * @throws IllegalArgumentException if the id was not allocated
     */
    public void release(int id) {
        if (!this.bitSet.free(id)) {
            throw new IllegalArgumentException("Index " + id + " was already released");
        }
    }

    /**
     * @param index an allocated id
     * @return the object stored for that id
     * @throws IllegalArgumentException if the id is not currently allocated
     */
    public T get(int index) {
        //Make the checking that index is allocated optional, as it might cause overhead due to multiple cacheline misses
        if (!this.bitSet.isSet(index)) {
            throw new IllegalArgumentException("Index " + index + " is not allocated");
        }
        return this.objects[index];
    }
 }
--- a/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java
+++ b/src/main/java/me/cortex/voxy/common/util/HierarchicalBitSet.java
@@ -0,0 +1,72 @@
 package me.cortex.voxy.common.util;
 /**
  * Fixed capacity (64^4 entries) allocator of integer ids backed by a 4 level bit hierarchy.
  * The bits of {@code D} are the ids themselves; a bit in {@code C}, {@code B} or {@code A} is set exactly
  * when the 64 bit word it summarises one level below is completely full, which lets the lowest free id be
  * located with four trailing-zero scans.
  */
 public class HierarchicalBitSet {
    //Number of ids the 4 level structure can hold
    private static final int CAPACITY = 1<<(6*4);
    private final int limit;
    private int cnt;
    private long A = 0;
    private final long[] B = new long[64];
    private final long[] C = new long[64*64];
    private final long[] D = new long[64*64*64];

    /**
     * @param limit maximum number of ids that may be allocated at once, a negative value means
     *              no limit other than the fixed capacity
     * @throws IllegalArgumentException if the limit is greater than the capacity of 64^4
     */
    public HierarchicalBitSet(int limit) {//Fixed size of 64^4
        if (limit > CAPACITY) {
            throw new IllegalArgumentException("Limit greater than capacity");
        }
        //A negative limit used to make every allocation fail, treat it as "no extra limit" instead
        this.limit = limit < 0 ? CAPACITY : limit;
    }

    /**
     * Allocates the lowest free id.
     *
     * @return the allocated id, -1 if the whole structure is full or -2 if the limit has been reached
     */
    public int allocateNext() {
        if (this.A==-1) {
            return -1;
        }
        if (this.cnt+1>this.limit) {
            return -2;//Limit reached
        }
        //Walk down the hierarchy, at every level pick the first word that still has a free bit
        int bWord = Long.numberOfTrailingZeros(~this.A);
        int cWord = Long.numberOfTrailingZeros(~this.B[bWord]) + 64*bWord;
        int dWord = Long.numberOfTrailingZeros(~this.C[cWord]) + 64*cWord;
        int ret = Long.numberOfTrailingZeros(~this.D[dWord]) + 64*dWord;

        long dp = this.D[dWord] | (1L<<(ret&0x3f));
        this.D[dWord] = dp;
        //Propagate "word is full" markers upwards, each marker goes to the bit that indexes the full word
        if (dp==-1) {
            long cp = this.C[cWord] | (1L<<(dWord&0x3f));
            this.C[cWord] = cp;
            if (cp==-1) {
                long bp = this.B[bWord] | (1L<<(cWord&0x3f));
                this.B[bWord] = bp;
                if (bp==-1) {
                    //The bit in A is the index of the B word that just filled up, not the bit inside of it
                    this.A |= 1L<<bWord;
                }
            }
        }
        this.cnt++;
        return ret;
    }

    /**
     * Releases an id.
     *
     * @param idx the id to release
     * @return true if the id was allocated before the call
     */
    public boolean free(int idx) {
        long v = this.D[idx>>6];
        boolean wasSet = (v&(1L<<(idx&0x3f)))!=0;
        this.cnt -= wasSet?1:0;
        this.D[idx>>6] = v&~(1L<<(idx&0x3f));

        //The words containing idx now have at least one free bit, so clear every "full" marker above it
        idx >>= 6;
        this.C[idx>>6] &= ~(1L<<(idx&0x3f));

        idx >>= 6;
        this.B[idx>>6] &= ~(1L<<(idx&0x3f));

        idx >>= 6;
        this.A &= ~(1L<<(idx&0x3f));
        return wasSet;
    }

    /**
     * @return the number of currently allocated ids
     */
    public int getCount() {
        return this.cnt;
    }

    /**
     * @param idx the id to query
     * @return true if the id is currently allocated
     */
    public boolean isSet(int idx) {
        return (this.D[idx>>6]&(1L<<(idx&0x3f)))!=0;
    }
 }
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads.frag
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads.frag
@@ -1,6 +1,8 @@
 #version 460 core
 layout(binding = 0) uniform sampler2D blockModelAtlas;
 //#define DEBUG_RENDER
 //TODO: need to fix when merged quads have discardAlpha set to false but they span multiple tiles
 // however they are not a full block
@@ -10,8 +12,11 @@ layout(location = 2) in flat vec4 tinting;
 layout(location = 3) in flat vec4 addin;
 layout(location = 4) in flat uint flags;
 layout(location = 5) in flat vec4 conditionalTinting;
 //layout(location = 6) in flat vec4 solidColour;
 #ifdef DEBUG_RENDER
 layout(location = 6) in flat uint quadDebug;
 #endif
 layout(location = 0) out vec4 outColour;
 void main() {
    vec2 uv = mod(uv, vec2(1.0))*(1.0/(vec2(3.0,2.0)*256.0));
@@ -29,4 +34,14 @@ void main() {
    outColour = (colour * tinting) + addin;
    //outColour = vec4(uv + baseUV, 0, 1);
    #ifdef DEBUG_RENDER
    uint hash = quadDebug*1231421+123141;
    hash ^= hash>>16;
    hash = hash*1231421+123141;
    hash ^= hash>>16;
    hash = hash * 1827364925 + 123325621;
    outColour = vec4(float(hash&15u)/15, float((hash>>4)&15u)/15, float((hash>>8)&15u)/15, 1);
    #endif
 }
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads.vert
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads.vert
@@ -1,150 +0,0 @@
 #version 460 core
 #extension GL_ARB_gpu_shader_int64 : enable
 #import <voxy:lod/quad_format.glsl>
 #import <voxy:lod/gl46/bindings.glsl>
 #import <voxy:lod/block_model.glsl>
 #line 8
 layout(location = 0) out vec2 uv;
 layout(location = 1) out flat vec2 baseUV;
 layout(location = 2) out flat vec4 tinting;
 layout(location = 3) out flat vec4 addin;
 layout(location = 4) out flat uint flags;
 layout(location = 5) out flat vec4 conditionalTinting;
 //layout(location = 6) out flat vec4 solidColour;
 //The lod level is stored in the top 5 bits of gl_BaseInstance
 uint extractLodLevel() {
    uint instanceBits = uint(gl_BaseInstance);
    return instanceBits>>27;
 }
 //Note the last 2 bits of gl_BaseInstance are unused
 //Gives a relative position of +-255 relative to the player center in its respective lod
 //Unpacks three signed 9 bit fields from gl_BaseInstance (x: bits 18-26, y: bits 9-17, z: bits 0-8)
 ivec3 extractRelativeLodPos() {
    ivec3 raw = ivec3(gl_BaseInstance);
    //Move each field to the top of its lane, then shift back down arithmetically to sign extend it
    ivec3 aligned = raw<<ivec3(5,14,23);
    return aligned>>ivec3(23);
 }
 //Splits a packed 32 bit colour into four bytes (highest byte first) and normalizes each to 0..1
 vec4 uint2vec4RGBA(uint colour) {
    uvec4 channels = (uvec4(colour)>>uvec4(24,16,8,0))&uvec4(0xFF);
    return vec4(channels)/255.0;
 }
 //Gets the face offset with respect to the face direction (e.g. some will be + some will be -)
 //Signed indentation of a face: faces with the low bit of the face index clear get a positive
 // offset, faces with it set get a negative one
 float getDepthOffset(uint faceData, uint face) {
    float direction = 1.0-((int(face)&1)*2.0);
    return extractFaceIndentation(faceData) * direction;
 }
 //Per corner offset derived from the face bounds, bit 1 of corner selects along the first axis
 // and bit 0 along the second axis
 vec2 getFaceSizeOffset(uint faceData, uint corner) {
    float EPSILON = 0.001f;
    vec4 bounds = extractFaceSizes(faceData);
    //Expand the quads by a very small amount
    vec2 lower = bounds.xz - vec2(EPSILON);
    vec2 upper = bounds.yw + vec2(EPSILON);
    bvec2 useUpper = bvec2(((corner>>1)&1u)==1, (corner&1u)==1);
    return mix(lower, upper-1.0f, useUpper);
 }
 //TODO: add a mechanism so that some quads can ignore backface culling
 // this would help a lot with stuff like crops, as they would otherwise look kind of weird I think,
 // same with flowers etc
 //Vertex entry point: expands one corner of a packed quad into clip space and fills the
 // flat per-quad outputs (baseUV, tinting, addin, flags, conditionalTinting) for the fragment stage
 void main() {
    //4 vertices per quad: the low 2 bits of gl_VertexID select the corner, the remaining bits index quadData
    int cornerIdx = gl_VertexID&3;
    Quad quad = quadData[uint(gl_VertexID)>>2];
    vec3 innerPos = extractPos(quad);
    uint face = extractFace(quad);
    uint modelId = extractStateId(quad);
    BlockModel model = modelData[modelId];
    uint faceData = model.faceData[face];
    bool isTranslucent = modelIsTranslucent(model);
    bool hasAO = modelHasMipmaps(model);//TODO: replace with per face AO flag
    bool isShaded = hasAO;//TODO: make this a per face flag
    //Change the ordering due to backface culling
    //NOTE: when rendering, backface culling is disabled as we simply dispatch calls for each face
    // this has the advantage of having "unassigned" geometry, that is geometry where the backface isnt culled
    //if (face == 0 || (face>>1 != 0 && (face&1)==1)) {
    //    cornerIdx ^= 1;
    //}
    uint lodLevel = extractLodLevel();
    //Positions are scaled by 1<<lodLevel; the relative lod position is offset by the low bits of
    // baseSectionPos and then multiplied by 32 (<<5)
    ivec3 lodCorner = ((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5;
    vec3 corner = innerPos * (1<<lodLevel) + lodCorner;
    vec2 faceOffset = getFaceSizeOffset(faceData, cornerIdx);
    ivec2 quadSize = extractSize(quad);
    //Only the far corners (corner bit set on that axis) receive the quad extent
    vec2 respectiveQuadSize = vec2(quadSize * ivec2((cornerIdx>>1)&1, cornerIdx&1));
    vec2 size = (respectiveQuadSize + faceOffset) * (1<<lodLevel);
    //xy holds the in-plane offset, z the offset along the face axis; swizzled below per face pair
    vec3 offset = vec3(size, (float(face&1u) + getDepthOffset(faceData, face)) * (1<<lodLevel));
    if ((face>>1) == 0) { //Up/down
        offset = offset.xzy;
    }
    //Not needed, here for readability
    //if ((face>>1) == 1) {//north/south
    //    offset = offset.xyz;
    //}
    if ((face>>1) == 2) { //west/east
        offset = offset.zxy;
    }
    gl_Position = MVP * vec4(corner + offset, 1.0);
    //Compute the uv coordinates
    //The low 16 bits of the model id are used as an 8 bit x / 8 bit y cell index, scaled by 1/256
    vec2 modelUV = vec2(modelId&0xFFu, (modelId>>8)&0xFFu)*(1.0/(256.0));
    //TODO: make the face orientated by 2x3 so that division is not a integer div and modulo isnt needed
    // as these are very slow ops
    baseUV = modelUV + (vec2(face>>1, face&1u) * (1.0/(vec2(3.0, 2.0)*256.0)));
    //TODO: add an option to scale the quad size by the lod level so that
    // e.g. at lod level 2 a face will have 2x2
    uv = respectiveQuadSize + faceOffset;//Add in the face offset for 0,0 uv
    flags = faceHasAlphaCuttout(faceData);
    //We need to have a conditional override based on if the model size is < a full face + quadSize > 1
    flags |= uint(any(greaterThan(quadSize, ivec2(1)))) & faceHasAlphaCuttoutOverride(faceData);
    //Bit 1 of flags is set when the model has no mipmaps
    flags |= uint(!modelHasMipmaps(model))<<1;
    //Compute lighting
    tinting = getLighting(extractLightId(quad));
    //Apply model colour tinting
    uint tintColour = model.colourTint;
    if (modelHasBiomeLUT(model)) {
        //With a biome LUT, colourTint is used as a base index into colourData
        tintColour = colourData[tintColour + extractBiomeId(quad)];
    }
    conditionalTinting = vec4(0);
    //uint(-1) (all bits set) means there is no tint; otherwise bit 2 of flags marks the tint as present
    if (tintColour != uint(-1)) {
        flags |= 1u<<2;
        //Rotate the channels so the highest byte of the packed colour ends up in w
        conditionalTinting = uint2vec4RGBA(tintColour).yzwx;
    }
    addin = vec4(0.0);
    if (!isTranslucent) {
        tinting.w = 0.0;
        //Encode the face (bits 0-2), the lod level (from bit 3) and the AO flag (bit 6)
        uint encodedData = 0;
        encodedData |= face;
        encodedData |= (lodLevel<<3);
        encodedData |= uint(hasAO)<<6;
        addin.w = float(encodedData)/255.0;
    }
    //Apply face tint
    if (isShaded) {
        if ((face>>1) == 1) {
            tinting.xyz *= 0.8f;
        } else if ((face>>1) == 2) {
            tinting.xyz *= 0.6f;
        } else if (face == 0){
            tinting.xyz *= 0.5f;
        } else {
            //TODO: FIXME: DONT HAVE SOME ARBITARY TINT LIKE THIS
            tinting.xyz *= 0.95f;
        }
    }
    //solidColour = vec4(vec3(modelId&0xFu, (modelId>>4)&0xFu, (modelId>>8)&0xFu)*(1f/15f),1f);
 }
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
@@ -6,13 +6,18 @@
 #import <voxy:lod/block_model.glsl>
 #line 8
 //#define DEBUG_RENDER
 layout(location = 0) out vec2 uv;
 layout(location = 1) out flat vec2 baseUV;
 layout(location = 2) out flat vec4 tinting;
 layout(location = 3) out flat vec4 addin;
 layout(location = 4) out flat uint flags;
 layout(location = 5) out flat vec4 conditionalTinting;
-//layout(location = 6) out flat vec4 solidColour;
+
 #ifdef DEBUG_RENDER
 layout(location = 6) out flat uint quadDebug;
 #endif
 uint extractLodLevel() {
    return uint(gl_BaseInstance)>>27;
@@ -143,4 +148,8 @@ void main() {
    vec3 origin = vec3(((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5);
    gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,0)))*(1<<lodLevel)+origin, 1.0);
    #ifdef DEBUG_RENDER
    quadDebug = lodLevel;
    #endif
 }
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/Queue.glsl
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/Queue.glsl
@@ -0,0 +1,20 @@
 //Use defines and undefines to define the queue, allows for hacky reuse of imports
 #ifndef QUEUE_NAME
 #error QUEUE_NAME is not defined
 #endif
 //NOTE(review): placeholder only, the parameters have no types and the body is empty,
 // so this is not a usable GLSL function as written
 void push(queue, item) {
 }
 #
--- a/src/main/resources/assets/voxy/shaders/lod/hierarchical/selector.comp
+++ b/src/main/resources/assets/voxy/shaders/lod/hierarchical/selector.comp
@@ -0,0 +1,262 @@
 #version 460 core
 #define WORKGROUP 4
 #define MINI_BATCH_SIZE 32
 //The entire uint is a minibatch (each idx is one)
 #define MINI_BATCH_MSK (uint(-1))
 //Each y dim is an octant in the octree
 // multiple x dims to fill up workgroups
 layout(local_size_x=WORKGROUP, local_size_y=8) in;
 //Storage buffer holding a counter followed by an unsized array of uint entries
 //NOTE(review): presumably the queue of load requests handed back to the cpu, nothing in this file writes to it yet - confirm
 layout(binding = 1, std430) restrict buffer RequestSectionLoadQueue {
    uint counter;
    uint[] queue;
 } requestQueue;
 //SectionNodeData is a uvec4 that contains the position + flags + ptr to own render section data + ptr to children
 //Read only array of packed nodes, one uvec4 per node, entries are passed to unpackNode
 layout(binding = 2, std430) restrict readonly buffer SectionNodeData {
    uvec4[] sectionNodes;
 };
 //Work queue state followed by the unsized queue storage
 //NOTE(review): only batchIndex is referenced by the visible code (atomicAdd in process()),
 // the meaning of the other fields is not established here
 layout(binding = 3, std430) restrict buffer ActiveWorkingNodeQueue {
    uint feedbackStatus;
    uint batchIndex;
    uint end;
    uint start;
    uint maxSize;//Needs to be a multiple of local_size_x
    uint[] queue;
 } nodeQueue;
 //Decoded form of a sectionNodes entry, this is the type returned by unpackNode
 struct UnpackedNode {
    ivec4 position;//x,y,z,detail
    uint  flags;//16 bits
    uint  self;//passed to renderMesh when the node renders its own mesh
    uint  children;
 };
 //Converts a packed uvec4 node into an UnpackedNode
 //NOTE(review): not implemented yet, data is never read and the returned node has no fields assigned
 UnpackedNode unpackNode(uvec4 data) {
    UnpackedNode node;
    return node;
 }
 //NOTE: this is different to nanite in the fact that if a node is not loaded, too bad dont render
 shared UnpackedNode workingNodes[WORKGROUP];
 shared uint miniBatchMsk;
 //Fills the shared workingNodes array: after a barrier, every invocation with y == 0 unpacks one
 // sectionNodes entry into the slot matching its x index
 //NOTE(review): work in progress, batchMsk and id are not declared anywhere in the visible source
 void loadNode() {
    if (gl_LocalInvocationIndex == 0) {//Check if we need to
        batchMsk = 0;//Reset the minibatch
        if (miniBatchMsk == MINI_BATCH_SIZE) {
        }
    }
    barrier();
    if (gl_LocalInvocationID.y == 0) {
        //Need to make it work in y size 8, but only gl_LocalInvocationId.x == 0
        workingNodes[gl_LocalInvocationID.x] = unpackNode(sectionNodes[id]);
    }
    barrier();//Synchronize, also acts as memory barrier
 }
 //Computes screensize of the node and whether it should render itself or its children
 //NOTE(review): not implemented yet, the body is empty and returns no value
 bool shouldRenderChildren(UnpackedNode node) {
 }
 //Process a single node and enqueue child nodes if needed into work queue, enqueue self to render and/or request children to load
 //NOTE(review): work in progress, node is not declared in this function and both branches are empty,
 // a second processNode(uint) with the same signature is defined further down in this file
 void processNode(uint id) {//Called even if it doesnt have any work (id==-1) to ensure uniform control flow for barriers
    //Bottom 2 bits are status flags, is air and children loaded
    // node.flags
    //If the childrenloaded flag is not set, send a request for the children of the node to be loaded
    // if all the children are loaded but we are not and we need to render, render the children and dispatch
    // a request to load self
    if (shouldRenderChildren(node)) {
        //Dont care about
    } else {
    }
 }
 //The actively scheduled/acquired work slot for this group
 shared uint workingBatchIndex;
 shared uint workingBatchOffset;
 //The first invocation of the workgroup reserves a batch by atomically advancing nodeQueue.batchIndex
 // and stores the previous value in the shared workingBatchIndex
 //NOTE(review): BATCH_SIZE is not defined in the visible source (only MINI_BATCH_SIZE is), and there is
 // no barrier after the write so other invocations cannot safely read workingBatchIndex yet
 void process() {
    if (gl_LocalInvocationIndex == 0) {//This includes both x and y
        workingBatchIndex = atomicAdd(nodeQueue.batchIndex, BATCH_SIZE);
    }
 }
 //NOTE(review): placeholder entry point, the loop has no exit condition and only executes a barrier,
 // a second main() is defined further down in this file
 void main() {
    while (true) {
        barrier();
    }
 }
 //when a node is processed,
 // its screen bounding box is computed using a fast trick (e.g. if you are viewing it from a quadrant you already know its bounding points (min/max))
 // frustum cull, check hiz
 // if it passes culling, use the screensize to check whether it must render itself
 // or dispatch its children to render
 //      IF its error is small enough, then render itself, its mesh should always be loaded, if not it's a critical error (except maybe if it's a top level node or something)
 //      if its error is too large,
 //          check that all children are loaded (or empty), if they are not all loaded, enqueue a request for the cpu to load
 //          that node's children
 //              if the load queue is full, don't enqueue it to the queue
 //          then instead of rendering children, render its own mesh since it should always be loaded
 //Can also reverse the above slightly and make it so that it checks the children before enqueuing them
 //the main thing to worry about is if there is enough work to fill the initial few rounds of this
 // before amplification takes effect
 // can do a thing where it initially just blasts child nodes out until the size is small enough
 // NOTE: since matrix multiplication distributes over addition
 //  can precompute the AABB corners with respect to the matrix
 //  then you can just add a translation vector
 //TODO: can do in another way
 // first compute the sections that should either render self or children
 // then as a separate job queue work through it
 //NOTE(review): every function in this group is an empty stub, the non void ones return no value yet
 //Number of children of the node, used by enqueueChildren
 uint getChildCount(UnpackedNode node) {
 }
 //Checks whether a node should be culled based on hiz/frustum
 bool cullNode(UnpackedNode node) {
 }
 //Should render this node, or recurse to children
 bool shouldRenderChildrenInstead(UnpackedNode node) {
 }
 //Does the node have its own mesh loaded
 bool nodeHasSelfMesh(UnpackedNode node) {
 }
 //Does the node have its children loaded (note! not child meshes)
 bool nodeHasChildrenLoaded(UnpackedNode node) {
 }
 //Are all the children's meshes loaded
 bool nodeHasChildMeshesLoaded(UnpackedNode node) {
 }
 //Requests data for node idx, processNode passes type 0 for the node's own mesh and type 1 for its children
 void request(uint type, uint idx) {
 }
 //Renders a mesh, processNode passes node.self as idx
 void renderMesh(uint idx) {
 }
 //Meant to push the children of node onto the work queue
 //NOTE(review): not implemented yet, only the child count is fetched, arg and cnt are unused
 void enqueueChildren(uint arg, UnpackedNode node) {
    uint cnt = getChildCount(node);
    //TODO: the queue needs 2 counters, the pre and post atomic,
    // pre is incremented to get index
    // queue is written to
    // post is then incremented to signal
 }
 //Reports a state where nothing can be rendered for a node, processNode passes type 0 when the children
 // are not loaded and the node has no mesh, and type 1 when the node has no mesh and no loaded children to fall back to
 //NOTE(review): not implemented yet, the body is empty
 void reportCritical(uint type) {
 }
 //Decides how a single node contributes: render its own mesh, hand its children to the work queue,
 // request missing data, or report that nothing can be rendered for it
 void processNode(uint idx) {
    UnpackedNode node = unpackNode(sectionNodes[idx]);

    //Culled nodes do nothing at all
    if (cullNode(node)) {
        return;
    }

    //Screenspace error says the children should be rendered instead of this node
    if (shouldRenderChildrenInstead(node)) {
        if (nodeHasChildrenLoaded(node)) {
            //Dispatch the children to the queue
            enqueueChildren(0, node);
            return;
        }
        //Children are not loaded, fall back to the self mesh when there is one
        if (nodeHasSelfMesh(node)) {
            //Render self and ask for the children to be loaded
            renderMesh(node.self);
            request(1, idx);
            return;
        }
        //Critical issue, no children are loaded and self has no mesh
        reportCritical(0);
        return;
    }

    //This node should render itself
    if (nodeHasSelfMesh(node)) {
        renderMesh(node.self);
        return;
    }

    //Self mesh is missing, request it and try to render the children in the meantime
    request(0, idx);
    if (nodeHasChildrenLoaded(node)) {//Might need to be nodeHasChildMeshesLoaded
        enqueueChildren(1, node);
        return;
    }
    //This is very bad, it means nothing can be rendered
    reportCritical(1);
 }
 //Psudo code, one thread, one load
 //Pseudo code, one thread, one load
 //Each iteration claims a queue entry with an atomic increment and splits it into a 24 bit index
 // and an 8 bit argument
 //NOTE(review): workingNodeQueuePos and workingNodeQueueEnd are not declared in the visible source,
 // both branches are empty and the loop has no exit condition
 void main() {
    while (true) {
        //Try to process a node queue entry
        uint work = atomicAdd(workingNodeQueuePos, 1);
        uint idx = work&0xFFFFFFu;
        uint arg = work>>24;
        if (idx < workingNodeQueueEnd) {
        } else {
            //Do other queue work however we still have the work slot allocated
        }
    }
 }