Finished mesh implementation; optimized gl46 shader

2024-03-13 19:25:09 +10:00
parent 283084cfa8
commit af9c45bb51
7 changed files with 447 additions and 8 deletions
--- a/src/main/java/me/cortex/voxy/client/core/VoxelCore.java
+++ b/src/main/java/me/cortex/voxy/client/core/VoxelCore.java
@@ -64,8 +64,10 @@ public class VoxelCore {
        SharedIndexBuffer.INSTANCE.id();
        if (VoxyConfig.CONFIG.useMeshShaders()) {
            this.renderer = new NvMeshFarWorldRenderer(VoxyConfig.CONFIG.geometryBufferSize, VoxyConfig.CONFIG.maxSections);
+            System.out.println("Using NvMeshFarWorldRenderer");
        } else {
            this.renderer = new Gl46FarWorldRenderer(VoxyConfig.CONFIG.geometryBufferSize, VoxyConfig.CONFIG.maxSections);
+            System.out.println("Using Gl46FarWorldRenderer");
        }
        this.viewportSelector = new ViewportSelector<>(this.renderer::createViewport);
        System.out.println("Renderer initialized");
--- a/src/main/java/me/cortex/voxy/client/core/rendering/Gl46FarWorldRenderer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/Gl46FarWorldRenderer.java
@@ -38,7 +38,7 @@ public class Gl46FarWorldRenderer extends AbstractFarWorldRenderer<Gl46Viewport>
            .compile();

    private final Shader lodShader = Shader.make()
-            .add(ShaderType.VERTEX, "voxy:lod/gl46/quads.vert")
+            .add(ShaderType.VERTEX, "voxy:lod/gl46/quads2.vert")
            .add(ShaderType.FRAGMENT, "voxy:lod/gl46/quads.frag")
            .compile();

--- a/src/main/java/me/cortex/voxy/client/core/rendering/NvMeshFarWorldRenderer.java
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/NvMeshFarWorldRenderer.java
@@ -15,7 +15,9 @@ import org.lwjgl.system.MemoryUtil;
 import java.util.List;

 import static org.lwjgl.opengl.ARBIndirectParameters.GL_PARAMETER_BUFFER_ARB;
+import static org.lwjgl.opengl.ARBIndirectParameters.glMultiDrawElementsIndirectCountARB;
 import static org.lwjgl.opengl.GL11.*;
+import static org.lwjgl.opengl.GL14C.glBlendFuncSeparate;
 import static org.lwjgl.opengl.GL15.GL_ELEMENT_ARRAY_BUFFER;
 import static org.lwjgl.opengl.GL15.glBindBuffer;
 import static org.lwjgl.opengl.GL30.glBindBufferBase;
@@ -39,6 +41,12 @@ public class NvMeshFarWorldRenderer extends AbstractFarWorldRenderer<NvMeshViewp
            .add(ShaderType.FRAGMENT, "voxy:lod/nvmesh/primary.frag")
            .compile();

+    private final Shader translucent = Shader.make() // Program bound for the translucent pass: task + mesh + fragment stages
+            .add(ShaderType.TASK, "voxy:lod/nvmesh/translucent.task")
+            .add(ShaderType.MESH, "voxy:lod/nvmesh/translucent.mesh")
+            .add(ShaderType.FRAGMENT, "voxy:lod/nvmesh/primary.frag") // same fragment shader path as the program declared above
+            .compile();
+
    private final Shader cull = Shader.make()
            .add(ShaderType.VERTEX, "voxy:lod/nvmesh/cull.vert")
            .add(ShaderType.FRAGMENT, "voxy:lod/nvmesh/cull.frag")
@@ -135,7 +143,35 @@ public class NvMeshFarWorldRenderer extends AbstractFarWorldRenderer<NvMeshViewp
        if (this.geometry.getSectionCount()==0) {
            return;
        }
-        //TODO: make a different task shader for translucent
+        RenderLayer.getTranslucent().startDrawing();
+        glBindVertexArray(AbstractFarWorldRenderer.STATIC_VAO);
+        glDisable(GL_CULL_FACE);
+        glEnable(GL_BLEND);
+
+        //TODO: maybe change how the alpha is applied here, since at the moment the texture bakery uses a very hacky
+        // blend equation to avoid applying translucency twice
+        glBlendFuncSeparate(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, GL_ONE, GL_ONE_MINUS_SRC_ALPHA);
+
+
+        glBindSampler(0, this.models.getSamplerId());
+        glBindTextureUnit(0, this.models.getTextureId());
+
+        this.translucent.bind();
+        this.bindResources(viewport);
+
+        glDepthMask(false);
+        glDrawMeshTasksNV(0, this.geometry.getSectionCount());
+        glDepthMask(true);
+
+        glEnable(GL_CULL_FACE);
+        glBindVertexArray(0);
+
+
+        glBindSampler(0, 0);
+        glBindTextureUnit(0, 0);
+        glDisable(GL_BLEND);
+
+        RenderLayer.getTranslucent().endDrawing();
    }

    @Override
--- a/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
+++ b/src/main/resources/assets/voxy/shaders/lod/gl46/quads2.vert
@@ -0,0 +1,146 @@
+#version 460 core
+#extension GL_ARB_gpu_shader_int64 : enable
+
+#import <voxy:lod/quad_format.glsl>
+#import <voxy:lod/gl46/bindings.glsl>
+#import <voxy:lod/block_model.glsl>
+#line 8
+
+layout(location = 0) out vec2 uv;//Interpolated: start of the face plus this corner's share of the quad size
+layout(location = 1) out flat vec2 baseUV;//modelUV plus a per face offset
+layout(location = 2) out flat vec4 tinting;//Result of getLighting, scaled per face when shaded; w is zeroed for non translucent models
+layout(location = 3) out flat vec4 addin;//Zero, except w which packs face, lod level and the AO flag for non translucent models
+layout(location = 4) out flat uint flags;//Starts as faceHasAlphaCuttout(faceData); bit 1 = model has no mipmaps, bit 2 = a tint colour applies
+layout(location = 5) out flat vec4 conditionalTinting;//Tint colour, left at zero when the tint colour is uint(-1)
+//layout(location = 6) out flat vec4 solidColour;
+
+uint extractLodLevel() {//The lod level is read from the top 5 bits (27-31) of gl_BaseInstance
+    return bitfieldExtract(uint(gl_BaseInstance), 27, 5);
+}
+
+//Unpacks the low 27 bits of gl_BaseInstance as three sign extended 9 bit fields (x: bits 18-26, y: bits 9-17, z: bits 0-8)
+//Gives a relative position of -256 to +255 relative to the player center in its respective lod
+ivec3 extractRelativeLodPos() {
+    return ivec3(gl_BaseInstance<<5, gl_BaseInstance<<14, gl_BaseInstance<<23)>>23;
+}
+
+vec4 uint2vec4RGBA(uint colour) {//Most significant byte -> x, then y, z, least significant byte -> w, each divided by 255
+    return vec4(colour>>24u, (colour>>16u)&0xFFu, (colour>>8u)&0xFFu, colour&0xFFu)/255.0;
+}
+
+vec4 getFaceSize(uint faceData) {
+    const float EPSILON = 0.001f;
+    vec4 rawBounds = extractFaceSizes(faceData);
+
+    //Grow the face by a tiny margin on every side: starts move down, ends move up
+    vec2 faceStart = rawBounds.xz - vec2(EPSILON);
+    vec2 faceEnd = rawBounds.yw + vec2(EPSILON);
+
+    //Express each end as an extent measured from its start
+    vec2 faceExtent = faceEnd - faceStart;
+    return vec4(faceStart.x, faceExtent.x, faceStart.y, faceExtent.y);
+}
+
+//TODO: make branchless by using ternaries i think
+vec3 swizzelDataAxis(uint axis, vec3 data) {
+    //Up/down faces
+    if (axis == 0) {
+        return data.xzy;
+    }
+    //West/east faces
+    if (axis == 2) {
+        return data.zxy;
+    }
+    //North/south faces (axis == 1) already use xyz order,
+    // and any other axis value is passed through unchanged
+    return data;
+}
+
+//Vertex shader entry point: each run of 4 vertex ids is one quad, gl_VertexID&3 selects the corner
+//TODO: add a mechanism so that some quads can ignore backface culling
+// this would help a lot with stuff like crops as they would look kinda weird i think,
+// same with flowers etc
+void main() {
+    int cornerIdx = gl_VertexID&3;
+    Quad quad = quadData[uint(gl_VertexID)>>2];
+    uint face = extractFace(quad);
+    uint modelId = extractStateId(quad);
+    BlockModel model = modelData[modelId];
+    uint faceData = model.faceData[face];
+    bool isTranslucent = modelIsTranslucent(model);
+    bool hasAO = modelHasMipmaps(model);//TODO: replace with per face AO flag
+    bool isShaded = hasAO;//TODO: make this a per face flag
+
+    uint lodLevel = extractLodLevel();
+
+    //Base UV: a cell of a 256x256 grid chosen by the low 16 bits of the model id, plus a per face offset (3x2 faces per cell)
+    vec2 modelUV = vec2(modelId&0xFFu, (modelId>>8)&0xFFu)*(1.0/(256.0));
+    baseUV = modelUV + (vec2(face>>1, face&1u) * (1.0/(vec2(3.0, 2.0)*256.0)));
+
+    ivec2 quadSize = extractSize(quad);
+
+    { //Generate tinting and flag data
+        flags = faceHasAlphaCuttout(faceData);
+
+        //We need to have a conditional override based on if the model size is < a full face + quadSize > 1
+        flags |= uint(any(greaterThan(quadSize, ivec2(1)))) & faceHasAlphaCuttoutOverride(faceData);
+
+        flags |= uint(!modelHasMipmaps(model))<<1;
+
+        //Compute lighting
+        tinting = getLighting(extractLightId(quad));
+
+        //Apply model colour tinting
+        uint tintColour = model.colourTint;
+        if (modelHasBiomeLUT(model)) {
+            tintColour = colourData[tintColour + extractBiomeId(quad)];
+        }
+
+        conditionalTinting = vec4(0);
+        if (tintColour != uint(-1)) {
+            flags |= 1u<<2;
+            conditionalTinting = uint2vec4RGBA(tintColour).yzwx;
+        }
+
+        addin = vec4(0.0);
+        if (!isTranslucent) {
+            tinting.w = 0.0;
+            //Encode the face (low bits), the lod level (from bit 3) and the AO flag (bit 6)
+            uint encodedData = 0;
+            encodedData |= face;
+            encodedData |= (lodLevel<<3);
+            encodedData |= uint(hasAO)<<6;
+            addin.w = float(encodedData)/255.0;
+        }
+
+        //Apply face tint
+        if (isShaded) {
+            //TODO: make branchless, infact apply ahead of time to the texture itself in ModelManager since that is
+            // per face
+            if ((face>>1) == 1) {
+                tinting.xyz *= 0.8f;
+            } else if ((face>>1) == 2) {
+                tinting.xyz *= 0.6f;
+            } else if (face == 0){
+                tinting.xyz *= 0.5f;
+            }
+        }
+    }
+
+
+
+    //faceSize.xz is the start of the face and faceSize.yw its size (end relative to start)
+    //cQuadSize is zeroed per axis by the corner bits, so corner 0 sits exactly at the start
+    vec4 faceSize = getFaceSize(faceData);
+
+    vec2 cQuadSize = (faceSize.yw + quadSize - 1) * vec2((cornerIdx>>1)&1, cornerIdx&1);
+    uv = faceSize.xz + cQuadSize;
+
+    vec3 cornerPos = extractPos(quad);
+    float depthOffset = extractFaceIndentation(faceData);
+    cornerPos += swizzelDataAxis(face>>1, vec3(faceSize.xz, mix(depthOffset, 1-depthOffset, float(face&1u))));
+
+    //Final position = (corner position scaled by 2^lodLevel) + origin, transformed by MVP
+    vec3 origin = vec3(((extractRelativeLodPos()<<lodLevel) - (baseSectionPos&(ivec3((1<<lodLevel)-1))))<<5);
+    gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,0)))*(1<<lodLevel)+origin, 1.0);
+}
--- a/src/main/resources/assets/voxy/shaders/lod/nvmesh/primary.mesh
+++ b/src/main/resources/assets/voxy/shaders/lod/nvmesh/primary.mesh
@@ -43,12 +43,6 @@ vec4 uint2vec4RGBA(uint colour) {
    return vec4((uvec4(colour)>>uvec4(24,16,8,0))&uvec4(0xFF))/255.0;
 }

-//Gets the face offset with respect to the face direction (e.g. some will be + some will be -)
-float getDepthOffset(uint faceData, uint face) {
-    float offset = extractFaceIndentation(faceData);
-    return offset * (1.0-((int(face)&1)*2.0));
-}
-
 vec4 getFaceSize(uint faceData) {
    float EPSILON = 0.001f;
    vec4 faceOffsetsSizes = extractFaceSizes(faceData);
--- a/src/main/resources/assets/voxy/shaders/lod/nvmesh/translucent.mesh
+++ b/src/main/resources/assets/voxy/shaders/lod/nvmesh/translucent.mesh
@@ -0,0 +1,212 @@
+#version 460
+
+#extension GL_ARB_shading_language_include : enable
+#pragma optionNV(unroll all)
+#define UNROLL_LOOP
+
+#extension GL_NV_mesh_shader : require
+#extension GL_NV_gpu_shader5 : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+#import <voxy:lod/nvmesh/bindings.glsl>
+#import <voxy:lod/block_model.glsl>
+#import <voxy:lod/quad_format.glsl>
+#line 13
+
+layout(local_size_x = 16) in;//One invocation per quad, 16 quads per workgroup
+layout(triangles, max_vertices=64, max_primitives=32) out;//16 quads * 4 vertices, 16 quads * 2 triangles
+
+layout(location=1) out Interpolants {//Per vertex outputs
+    vec2 uv;
+} i_out[];
+
+layout(location=2) perprimitiveNV out PerPrimData {//Per triangle outputs, main() writes them twice per quad
+    vec2 baseUV;
+    vec4 tinting;
+    vec4 addin;
+    uint flags;
+    vec4 conditionalTinting;
+} per_prim_out[];
+
+//Emits the index list of this invocation's quad as 2 triangles over its 4 vertices: (0,1,2) and (2,3,0)
+void emitIndicies() {
+    const uint CORNER_ORDER[6] = uint[6](0u, 1u, 2u, 2u, 3u, 0u);
+    //Each invocation owns 6 consecutive indices and 4 consecutive vertices
+    uint firstIndex = gl_LocalInvocationID.x * 6u;
+    uint firstVertex = gl_LocalInvocationID.x * 4u;
+    for (uint i = 0u; i < 6u; i++) {
+        gl_PrimitiveIndicesNV[firstIndex + i] = firstVertex + CORNER_ORDER[i];
+    }
+}
+
+vec4 uint2vec4RGBA(uint colour) {//Most significant byte -> x, then y, z, least significant byte -> w, each divided by 255
+    return vec4(colour>>24u, (colour>>16u)&0xFFu, (colour>>8u)&0xFFu, colour&0xFFu)/255.0;
+}
+
+vec4 getFaceSize(uint faceData) {
+    const float EPSILON = 0.001f;
+    vec4 rawBounds = extractFaceSizes(faceData);
+
+    //Grow the face by a tiny margin on every side: starts move down, ends move up
+    vec2 faceStart = rawBounds.xz - vec2(EPSILON);
+    vec2 faceEnd = rawBounds.yw + vec2(EPSILON);
+
+    //Express each end as an extent measured from its start
+    vec2 faceExtent = faceEnd - faceStart;
+    return vec4(faceStart.x, faceExtent.x, faceStart.y, faceExtent.y);
+}
+
+//TODO: make branchless by using ternaries i think
+vec3 swizzelDataAxis(uint axis, vec3 data) {
+    //Up/down faces
+    if (axis == 0) {
+        return data.xzy;
+    }
+    //West/east faces
+    if (axis == 2) {
+        return data.zxy;
+    }
+    //North/south faces (axis == 1) already use xyz order,
+    // and any other axis value is passed through unchanged
+    return data;
+}
+
+taskNV in Task {
+    vec3 origin;//Offset to camera in world space (already multiplied by lod level)
+    uint baseOffset;//Base offset into the quad data buffer
+
+    uint meta;//First 4 bits is lod level, remaining is quadCount
+};
+
+uint getQuadIndex() {//Quad buffer index for this invocation, or uint(-1) once it lies at or past the quad count held in meta>>4
+    bool inRange = gl_GlobalInvocationID.x < (meta>>4);
+    return inRange ? (baseOffset + gl_GlobalInvocationID.x) : uint(-1);
+}
+
+void main() {//Mesh shader entry point: each invocation emits one quad (4 vertices, 2 triangles)
+    uint idx = getQuadIndex();
+    //Past the end of the quad list for this task: emit nothing
+    if (idx == uint(-1)) {
+        return;
+    }
+    emitIndicies();
+    //A and B index this quad's two triangles in per_prim_out, V is the index of its first vertex
+    uint A = gl_LocalInvocationID.x<<1;
+    uint B = (gl_LocalInvocationID.x<<1)|1u;
+    uint V = (gl_LocalInvocationID.x<<2);
+
+    uint lodLvl = meta&0xf;//Low 4 bits of the task's meta word
+    float lodScale = (1<<lodLvl);
+
+    Quad quad = quadData[idx];
+    uint face = extractFace(quad);
+    uint modelId = extractStateId(quad);
+    BlockModel model = modelData[modelId];
+    uint faceData = model.faceData[face];
+    bool isTranslucent = modelIsTranslucent(model);
+
+
+    bool hasAO = modelHasMipmaps(model);//TODO: replace with per face AO flag
+    bool isShaded = hasAO;//TODO: make this a per face flag
+
+
+    ivec2 quadSize = extractSize(quad);
+
+
+    //Compute the uv coordinates
+    vec2 modelUV = vec2(modelId&0xFFu, (modelId>>8)&0xFFu)*(1.0/(256.0));
+    vec2 baseUV = modelUV + (vec2(face>>1, face&1u) * (1.0/(vec2(3.0, 2.0)*256.0)));
+    //Write out baseUV
+    per_prim_out[A].baseUV = baseUV;
+    per_prim_out[B].baseUV = baseUV;
+
+
+
+    //Flags: starts as faceHasAlphaCuttout(faceData); bit 1 = model has no mipmaps, bit 2 = a tint colour applies
+    uint flags = faceHasAlphaCuttout(faceData);
+
+    //We need to have a conditional override based on if the model size is < a full face + quadSize > 1
+    flags |= uint(any(greaterThan(quadSize, ivec2(1)))) & faceHasAlphaCuttoutOverride(faceData);
+
+    flags |= uint(!modelHasMipmaps(model))<<1;
+
+    //Compute lighting
+    vec4 tinting = getLighting(extractLightId(quad));
+
+    //Apply model colour tinting
+    uint tintColour = model.colourTint;
+    if (modelHasBiomeLUT(model)) {
+        tintColour = colourData[tintColour + extractBiomeId(quad)];
+    }
+
+    vec4 conditionalTinting = vec4(0);
+    if (tintColour != uint(-1)) {
+        flags |= 1u<<2;
+        conditionalTinting = uint2vec4RGBA(tintColour).yzwx;
+    }
+
+    vec4 addin = vec4(0.0);
+    if (!isTranslucent) {
+        tinting.w = 0.0;
+        //Encode the face (low bits), the lod level (from bit 3) and the AO flag (bit 6)
+        uint encodedData = 0;
+        encodedData |= face;
+        encodedData |= (lodLvl<<3);
+        encodedData |= uint(hasAO)<<6;
+        addin.w = float(encodedData)/255.0;
+    }
+
+    //Apply face tint
+    if (isShaded) {
+        //TODO: make branchless, infact apply ahead of time to the texture itself in ModelManager since that is
+        // per face
+        if ((face>>1) == 1) {
+            tinting.xyz *= 0.8f;
+        } else if ((face>>1) == 2) {
+            tinting.xyz *= 0.6f;
+        } else if (face == 0){
+            tinting.xyz *= 0.5f;
+        }
+    }
+
+
+    //Write out everything (same values for both triangles of the quad)
+    per_prim_out[A].tinting = tinting;
+    per_prim_out[A].addin = addin;
+    per_prim_out[A].flags = flags;
+    per_prim_out[A].conditionalTinting = conditionalTinting;
+    per_prim_out[B].tinting = tinting;
+    per_prim_out[B].addin = addin;
+    per_prim_out[B].flags = flags;
+    per_prim_out[B].conditionalTinting = conditionalTinting;
+
+
+
+
+    //faceSize.xz is the start of the face and faceSize.yw its size (end relative to start)
+    vec4 faceSize = getFaceSize(faceData);
+
+    vec2 cQuadSize = faceSize.yw + quadSize - 1;
+    vec2 uv0 = faceSize.xz;
+    i_out[V|0].uv = uv0;
+    i_out[V|1].uv = uv0 + vec2(0, cQuadSize.y);
+    i_out[V|2].uv = uv0 + cQuadSize;
+    i_out[V|3].uv = uv0 + vec2(cQuadSize.x, 0);
+
+
+
+    //Vertex order is start, +y, +x+y, +x; this matches the uv order above
+    //Corner position of quad relative to section corner (in 0->32 scale)
+    vec3 cornerPos = extractPos(quad);
+    float depthOffset = extractFaceIndentation(faceData);
+    cornerPos += swizzelDataAxis(face>>1, vec3(faceSize.xz, mix(depthOffset, 1-depthOffset, float(face&1u))));
+    gl_MeshVerticesNV[V|0].gl_Position = MVP*vec4(cornerPos*lodScale+origin, 1.0);
+    gl_MeshVerticesNV[V|1].gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(0,cQuadSize.y,0)))*lodScale+origin, 1.0);
+    gl_MeshVerticesNV[V|2].gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize,    0)))*lodScale+origin, 1.0);
+    gl_MeshVerticesNV[V|3].gl_Position = MVP*vec4((cornerPos+swizzelDataAxis(face>>1,vec3(cQuadSize.x,0,0)))*lodScale+origin, 1.0);
+
+    if (gl_LocalInvocationID.x == 0) {
+        //Remaining quads in workgroup: quad count minus the 16 quads of every earlier workgroup, capped at 32 primitives
+        gl_PrimitiveCountNV = min(uint(int(meta>>4)-int(gl_WorkGroupID.x<<4))<<1, 32);//2 primitives per quad
+    }
+}
--- a/src/main/resources/assets/voxy/shaders/lod/nvmesh/translucent.task
+++ b/src/main/resources/assets/voxy/shaders/lod/nvmesh/translucent.task
@@ -0,0 +1,49 @@
+#version 460
+
+#extension GL_ARB_shading_language_include : enable
+#pragma optionNV(unroll all)
+#define UNROLL_LOOP
+
+#extension GL_NV_mesh_shader : require
+#extension GL_NV_gpu_shader5 : require
+#extension GL_ARB_gpu_shader_int64 : require
+
+#import <voxy:lod/nvmesh/bindings.glsl>
+#import <voxy:lod/section.glsl>
+#line 12
+
+#define MESH_WORKLOAD_PER_INVOCATION 16
+
+layout(local_size_x=1) in;
+
+taskNV out Task {
+    vec3 origin;//Offset to camera in world space (already multiplied by lod level)
+    uint baseOffset;//Base offset into the quad data buffer
+    uint meta;//First 4 bits is lod level, remaining is quadCount
+} task;
+
+//Task shader entry point, one workgroup per section: sections not marked visible launch no mesh work,
+// visible ones launch one mesh workgroup per MESH_WORKLOAD_PER_INVOCATION quads
+void main() {
+    uint sectionId = gl_WorkGroupID.x;
+    bool visibleLastFrame = visibilityData[sectionId] == frameId;
+
+    //If it wasn't visible last frame then don't render this frame (temporal coherence)
+    if (!visibleLastFrame) {
+        gl_TaskCountNV = 0;
+        return;
+    }
+    SectionMeta meta = sectionData[sectionId];
+    uint lodLvl = extractDetail(meta);
+    ivec3 lodPos= extractPosition(meta);
+    //Relative position to camera
+    task.origin = vec3(((lodPos<<lodLvl)-baseSectionPos)<<5)-cameraSubPos;
+
+    task.baseOffset = extractQuadStart(meta);
+    task.meta = lodLvl&0xFu;
+    //Low 16 bits of cntA are the quad count drawn by this pass; nothing is skipped, the run starts at the section's first quad
+    uint cnt = meta.cntA&0xFFFF;//NOTE(review): presumably the translucent quad count, confirm against section.glsl
+
+    task.meta |= cnt<<4;
+    gl_TaskCountNV = (cnt+MESH_WORKLOAD_PER_INVOCATION-1)/MESH_WORKLOAD_PER_INVOCATION;
+}