From 606d3b228217c8a4df7bf3e5927b06152b53f280 Mon Sep 17 00:00:00 2001
From: mcrcortex <18544518+MCRcortex@users.noreply.github.com>
Date: Sun, 13 Jul 2025 17:39:48 +1000
Subject: [PATCH] hiz2

---
 .../core/rendering/util/HiZBuffer2.java       | 225 ++++++++++++++++++
 .../assets/voxy/shaders/hiz/hiz.comp          |  93 +++++++
 2 files changed, 318 insertions(+)
 create mode 100644 src/main/java/me/cortex/voxy/client/core/rendering/util/HiZBuffer2.java
 create mode 100644 src/main/resources/assets/voxy/shaders/hiz/hiz.comp

diff --git a/src/main/java/me/cortex/voxy/client/core/rendering/util/HiZBuffer2.java b/src/main/java/me/cortex/voxy/client/core/rendering/util/HiZBuffer2.java
new file mode 100644
index 00000000..46aec7a7
--- /dev/null
+++ b/src/main/java/me/cortex/voxy/client/core/rendering/util/HiZBuffer2.java
@@ -0,0 +1,225 @@
+package me.cortex.voxy.client.core.rendering.util;
+
+import me.cortex.voxy.client.core.gl.GlFramebuffer;
+import me.cortex.voxy.client.core.gl.GlTexture;
+import me.cortex.voxy.client.core.gl.shader.Shader;
+import me.cortex.voxy.client.core.gl.shader.ShaderType;
+import me.cortex.voxy.client.core.rendering.RenderService;
+import org.lwjgl.opengl.GL11;
+
+import static org.lwjgl.opengl.ARBDirectStateAccess.*;
+import static org.lwjgl.opengl.ARBShaderImageLoadStore.GL_TEXTURE_FETCH_BARRIER_BIT;
+import static org.lwjgl.opengl.GL11C.*;
+import static org.lwjgl.opengl.GL30C.*;
+import static org.lwjgl.opengl.GL30C.glBindVertexArray;
+import static org.lwjgl.opengl.GL33.glBindSampler;
+import static org.lwjgl.opengl.GL33.glGenSamplers;
+import static org.lwjgl.opengl.GL33C.glDeleteSamplers;
+import static org.lwjgl.opengl.GL33C.glSamplerParameteri;
+import static org.lwjgl.opengl.GL42C.*;
+import static org.lwjgl.opengl.GL43C.glDispatchCompute;
+import static org.lwjgl.opengl.GL45C.glTextureBarrier;
+
+/**
+ * Builds a hierarchical-Z (HiZ) pyramid inside a single mip-mapped texture.
+ *
+ * <p>The pyramid is sized to the largest power of two that fits each dimension passed to
+ * {@link #buildMipChain}. Level 0 is drawn with the {@code voxy:hiz/blit} shaders and levels
+ * 1..6 are then written by one dispatch of {@code voxy:hiz/hiz.comp}. Levels 7 and up, which
+ * are allocated once the longest side exceeds 128, are not written by this class.
+ *
+ * <p>Every method issues OpenGL calls and therefore needs a current GL context.
+ */
+public class HiZBuffer2 {
+    /** Side length, in level-0 texels, of the square tile one hiz.comp work group reduces. */
+    private static final int TILE_SIZE = 64;
+    /** hiz.comp writes levels 1..LAST_COMPUTE_LEVEL through the image units of the same index. */
+    private static final int LAST_COMPUTE_LEVEL = 6;
+
+    private final Shader hizMip = Shader.make()
+            .add(ShaderType.COMPUTE, "voxy:hiz/hiz.comp")
+            .compile();
+    private final Shader hizInitial = Shader.make()
+            .add(ShaderType.VERTEX, "voxy:hiz/blit.vsh")
+            .add(ShaderType.FRAGMENT, "voxy:hiz/blit.fsh")
+            .define("OUTPUT_COLOUR")
+            .compile();
+    private final GlFramebuffer fb = new GlFramebuffer().name("HiZ");
+    private final int sampler = glGenSamplers();
+    private final int type;
+    private GlTexture texture;
+    private int levels;
+    private int width;
+    private int height;
+
+    /** Creates a pyramid stored as {@code GL_R32F}. */
+    public HiZBuffer2() {
+        this(GL_R32F);
+    }
+
+    /**
+     * Creates a pyramid with the given internal format.
+     *
+     * @param type internal format passed to the texture storage. NOTE(review): the compute
+     *             pass always binds its images as {@code GL_R32F}; confirm compatibility
+     *             before using another format.
+     */
+    public HiZBuffer2(int type) {
+        glNamedFramebufferDrawBuffer(this.fb.id, GL_COLOR_ATTACHMENT0);
+        this.type = type;
+    }
+
+    /**
+     * Allocates the pyramid texture, configures its sampling state and attaches level 0 to
+     * the framebuffer. A previously allocated texture is not freed here.
+     *
+     * @param width  level-0 width in texels
+     * @param height level-0 height in texels
+     */
+    private void alloc(int width, int height) {
+        // ceil(log2(longest side)) on integers, so no floating-point rounding is involved.
+        // Clamped to one level because texture storage needs at least one. For the
+        // power-of-two sizes passed in, the chain stops before the 1x1 level on purpose:
+        // meshlets are not expected to be big enough to cover such a large area.
+        int longest = Math.max(1, Math.max(width, height));
+        this.levels = Math.max(1, 32 - Integer.numberOfLeadingZeros(longest - 1));
+
+        // GL_DEPTH_COMPONENT32F cannot be used as it does not match the depth format of the
+        // provided depth buffer.
+        this.texture = new GlTexture().store(this.type, this.levels, width, height).name("HiZ");
+        glTextureParameteri(this.texture.id, GL_TEXTURE_MIN_FILTER, GL_NEAREST_MIPMAP_NEAREST);
+        glTextureParameteri(this.texture.id, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+        glTextureParameteri(this.texture.id, GL_TEXTURE_COMPARE_MODE, GL_NONE);
+        glTextureParameteri(this.texture.id, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+        glTextureParameteri(this.texture.id, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+        glSamplerParameteri(this.sampler, GL_TEXTURE_MIN_FILTER, GL_NEAREST_MIPMAP_NEAREST);
+        glSamplerParameteri(this.sampler, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+        glSamplerParameteri(this.sampler, GL_TEXTURE_COMPARE_MODE, GL_NONE);
+        glSamplerParameteri(this.sampler, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
+        glSamplerParameteri(this.sampler, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
+
+        this.width = width;
+        this.height = height;
+
+        this.fb.bind(GL_COLOR_ATTACHMENT0, this.texture, 0).verify();
+    }
+
+    /**
+     * Rebuilds the pyramid from a depth texture.
+     *
+     * <p>The texture is reallocated whenever the power-of-two size derived from
+     * {@code width} and {@code height} changes. Afterwards the previously bound draw
+     * framebuffer is bound again and the viewport is set to {@code width} x {@code height};
+     * {@code GL_DEPTH_TEST} stays disabled and the vertex array binding is left at 0.
+     *
+     * @param srcDepthTex GL name of the texture bound as input of the level-0 draw
+     * @param width       width used to size the pyramid and to restore the viewport
+     * @param height      height used to size the pyramid and to restore the viewport
+     */
+    public void buildMipChain(int srcDepthTex, int width, int height) {
+        int pow2Width = Integer.highestOneBit(width);
+        int pow2Height = Integer.highestOneBit(height);
+        if (this.width != pow2Width || this.height != pow2Height) {
+            if (this.texture != null) {
+                this.texture.free();
+                this.texture = null;
+            }
+            this.alloc(pow2Width, pow2Height);
+        }
+
+        this.blitInitialLevel(srcDepthTex, width, height);
+        this.reduceMipChain();
+    }
+
+    /** Draws level 0 of the pyramid from {@code srcDepthTex} with the blit shaders. */
+    private void blitInitialLevel(int srcDepthTex, int width, int height) {
+        int boundFB = GL11.glGetInteger(GL_DRAW_FRAMEBUFFER_BINDING);
+
+        glBindVertexArray(RenderService.STATIC_VAO);
+        this.hizInitial.bind();
+        glBindFramebuffer(GL_FRAMEBUFFER, this.fb.id);
+
+        glDisable(GL_DEPTH_TEST);
+
+        glBindTextureUnit(0, srcDepthTex);
+        glBindSampler(0, this.sampler);
+        glUniform1i(0, 0);
+
+        glViewport(0, 0, this.width, this.height);
+
+        glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+        // Order the level-0 writes before the texture fetches of the compute pass.
+        glTextureBarrier();
+        glMemoryBarrier(GL_FRAMEBUFFER_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT);
+
+        glBindFramebuffer(GL_FRAMEBUFFER, boundFB);
+        glViewport(0, 0, width, height);
+        glBindVertexArray(0);
+    }
+
+    /**
+     * Runs hiz.comp once to write levels 1..{@link #LAST_COMPUTE_LEVEL} from level 0, one
+     * work group per {@link #TILE_SIZE}-texel square tile. A dimension smaller than one tile
+     * gives a group count of zero, in which case no level is written.
+     */
+    private void reduceMipChain() {
+        this.hizMip.bind();
+
+        glUniform2f(0, 1f / this.width, 1f / this.height);
+        glBindTextureUnit(0, this.texture.id);
+        glBindSampler(0, this.sampler);
+        for (int level = 1; level <= LAST_COMPUTE_LEVEL; level++) {
+            glBindImageTexture(level, this.texture.id, level, false, 0, GL_WRITE_ONLY, GL_R32F);
+        }
+
+        glDispatchCompute(this.width / TILE_SIZE, this.height / TILE_SIZE, 1);
+
+        // The shader writes the levels with imageStore, and such writes are not ordered
+        // against later sampling automatically. Without this barrier a consumer of the
+        // texture could fetch stale level contents.
+        glMemoryBarrier(GL_TEXTURE_FETCH_BARRIER_BIT);
+
+        glBindSampler(0, 0);
+        // Texture units 0..6 are reset exactly as before, in case later passes rely on it.
+        for (int unit = 0; unit <= LAST_COMPUTE_LEVEL; unit++) {
+            glBindTextureUnit(unit, 0);
+        }
+        // The levels were bound to image units, so release those bindings as well.
+        for (int unit = 1; unit <= LAST_COMPUTE_LEVEL; unit++) {
+            glBindImageTexture(unit, 0, 0, false, 0, GL_WRITE_ONLY, GL_R32F);
+        }
+    }
+
+    /** Frees the framebuffer, the texture (if allocated), the sampler and both shaders. */
+    public void free() {
+        this.fb.free();
+        if (this.texture != null) {
+            this.texture.free();
+            this.texture = null;
+        }
+        glDeleteSamplers(this.sampler);
+        this.hizInitial.free();
+        this.hizMip.free();
+    }
+
+    /**
+     * Returns the GL name of the pyramid texture.
+     *
+     * @throws NullPointerException if no texture is allocated, i.e. before the first
+     *                              {@link #buildMipChain} call or after {@link #free}
+     */
+    public int getHizTextureId() {
+        return this.texture.id;
+    }
+
+    /**
+     * Returns the pyramid size as {@code (log2(width) << 16) | log2(height)}. Once allocated,
+     * both dimensions are powers of two, so the trailing-zero count is the exact logarithm.
+     */
+    public int getPackedLevels() {
+        return (Integer.numberOfTrailingZeros(this.width) << 16)
+                | Integer.numberOfTrailingZeros(this.height);
+    }
+}
diff --git
a/src/main/resources/assets/voxy/shaders/hiz/hiz.comp b/src/main/resources/assets/voxy/shaders/hiz/hiz.comp
new file mode 100644
index 00000000..003839c5
--- /dev/null
+++ b/src/main/resources/assets/voxy/shaders/hiz/hiz.comp
@@ -0,0 +1,93 @@
+#version 460 core
+
+#extension GL_KHR_shader_subgroup_arithmetic: require
+#extension GL_KHR_shader_subgroup_basic : require
+#extension GL_KHR_shader_subgroup_clustered : require
+
+//Writes mips 1..6 from mip_0 in a single pass; every destination texel is the max of the 2x2 texels beneath it
+//64x64 reduction: one work group of 256 invocations covers a 64x64 texel tile of mip_0, each invocation a 4x4 block of it
+layout(local_size_x=256) in;
+
+const uint spread[64] = {//Swizzle lookup: byte (i&3) of spread[i>>2] is the swizzled id for linear id i, packed as (y<<4)|x
+    0x11100100, 0x13120302, 0x31302120, 0x33322322, 0x15140504, 0x17160706, 0x35342524, 0x37362726,
+    0x51504140, 0x53524342, 0x71706160, 0x73726362, 0x55544544, 0x57564746, 0x75746564, 0x77766766,
+    0x19180908, 0x1b1a0b0a, 0x39382928, 0x3b3a2b2a, 0x1d1c0d0c, 0x1f1e0f0e, 0x3d3c2d2c, 0x3f3e2f2e,
+    0x59584948, 0x5b5a4b4a, 0x79786968, 0x7b7a6b6a, 0x5d5c4d4c, 0x5f5e4f4e, 0x7d7c6d6c, 0x7f7e6f6e,
+    0x91908180, 0x93928382, 0xb1b0a1a0, 0xb3b2a3a2, 0x95948584, 0x97968786, 0xb5b4a5a4, 0xb7b6a7a6,
+    0xd1d0c1c0, 0xd3d2c3c2, 0xf1f0e1e0, 0xf3f2e3e2, 0xd5d4c5c4, 0xd7d6c7c6, 0xf5f4e5e4, 0xf7f6e7e6,
+    0x99988988, 0x9b9a8b8a, 0xb9b8a9a8, 0xbbbaabaa, 0x9d9c8d8c, 0x9f9e8f8e, 0xbdbcadac, 0xbfbeafae,
+    0xd9d8c9c8, 0xdbdacbca, 0xf9f8e9e8, 0xfbfaebea, 0xdddccdcc, 0xdfdecfce, 0xfdfcedec, 0xfffeefee
+};
+
+uint swizzleId(uint id) {//id: linear invocation index, 0..255
+    //Swizzle to Z-curve order: consecutive ids fill a 2x2 quad first, then a 4x4 block, and so on
+    return bitfieldExtract(spread[id>>2], (int(id)&3)*8, 8);//Picks the byte belonging to this id
+}
+
+layout(location = 0) uniform vec2 invImSize;//Multiplied onto mip_0 texel coordinates to form the textureGather coordinate (so presumably 1/size of mip_0)
+
+layout(binding = 0) uniform sampler2D mip_0;//Source level, only read through textureGather
+layout(binding = 1, r32f) uniform restrict writeonly image2D mip_1;//Destination levels 1..6, write-only
+layout(binding = 2, r32f) uniform restrict writeonly image2D mip_2;
+layout(binding = 3, r32f) uniform restrict writeonly image2D mip_3;
+layout(binding = 4, r32f) uniform restrict writeonly image2D mip_4;
+layout(binding = 5, r32f) uniform restrict writeonly image2D mip_5;
+layout(binding = 6, r32f) uniform restrict writeonly image2D mip_6;
+
+float getReduce2x2(ivec2 pos) {//w.r.t mip_1: returns the max of the 2x2 mip_0 texels under mip_1 texel pos and stores it there
+    vec4 data = textureGather(mip_0, vec2(pos*2+1)*invImSize);//pos*2+1 is the corner shared by the four source texels
+    float ret = max(max(data.x,data.y),max(data.z,data.w));
+    imageStore(mip_1, pos, vec4(ret));
+    return ret;
+}
+
+float getReduce4x4(ivec2 pos) {//w.r.t mip_2: max of the four mip_1 texels under pos (each computed and stored by getReduce2x2), stored to mip_2
+    ivec2 pos2 = pos*2;
+    float ret = max(max(getReduce2x2(pos2+ivec2(0,0)),getReduce2x2(pos2+ivec2(0,1))),
+                    max(getReduce2x2(pos2+ivec2(1,0)),getReduce2x2(pos2+ivec2(1,1))));
+    imageStore(mip_2, pos, vec4(ret));
+    return ret;
+}
+
+//Subgroup stage: folds the per-invocation mip_2 values into mip_3 (clusters of 4) and mip_4 (clusters of 16)
+// since the id was swizzled when fetching the value, the ordering within the subgroup should be Z-ordered; NOTE(review): this relies on gl_SubgroupInvocationID following gl_LocalInvocationID.x - confirm
+// the full subgroup cannot be reduced at once: the wave size is taken to be 32 and a square power-of-two count is needed, hence 16
+float getReduceWave(ivec2 pos, float value) {//pos: this invocation's mip_2 texel, value: its mip_2 value; returns the max of its cluster of 16
+    float reduced;
+    subgroupBarrier();//Wait for active threads in subgroup
+    //Clustered reduction: every invocation gets the cluster max, only the first invocation of a cluster writes it
+    reduced = subgroupClusteredMax(value, 4);//Max of each cluster of 4 invocations
+    if ((gl_SubgroupInvocationID&0x3)==0) {//root writes
+        imageStore(mip_3, pos>>1, vec4(reduced));
+    }
+    //could exit 3/4 of the threads here if wanted
+    subgroupBarrier();//Wait for active threads in subgroup
+    reduced = subgroupClusteredMax(value, 16);//Max of each cluster of 16 invocations
+    if ((gl_SubgroupInvocationID&0xF)==0) {//root writes
+        imageStore(mip_4, pos>>2, vec4(reduced));
+    }
+    return reduced;
+}
+shared float values[16];//One mip_4 value per run of 16 consecutive invocations of the work group
+void main() {
+    uint id = swizzleId(gl_LocalInvocationID.x);
+    //Same as (ivec2(gl_WorkGroupID.xy)*64+ivec2(id&0xFU, id>>4)*4)/4: the mip_0 origin of this invocation's 4x4 block, expressed in mip_2 texels
+    ivec2 wavePos = ivec2(gl_WorkGroupID.xy)*16+ivec2(id&0xFU, id>>4);
+    float value = getReduce4x4(wavePos);//Writes mip_1 and mip_2
+    value = getReduceWave(wavePos, value);//Writes mip_3 and mip_4; value is now the max of this invocation's cluster of 16
+    if ((gl_LocalInvocationID.x&0xFU)==0) {
+        values[gl_LocalInvocationID.x>>4] = value;
+    }
+    barrier();//Wait for all invocations, so every values[] entry is written before it is read
+    if ((gl_LocalInvocationID.x>>2)!=0) {
+        return;//Discard all but 4 threads
+    }
+    uint i = gl_LocalInvocationID.x*4;
+    value = max(max(values[i],values[i+1]),max(values[i+2],values[i+3]));//values[] is already in spread (Z) order, so 4 consecutive entries form one 2x2 quad
+    imageStore(mip_5, ivec2(gl_WorkGroupID.xy)*2+ivec2(gl_LocalInvocationID.x&1u,gl_LocalInvocationID.x>>1), vec4(value));
+    subgroupBarrier();
+    value = subgroupMax(value);//Max over the invocations still active here (the 4 survivors, assuming they share one subgroup)
+    if (gl_LocalInvocationID.x==0) {
+        imageStore(mip_6, ivec2(gl_WorkGroupID.xy), vec4(value));
+    }
+}