如何解决奇怪的计算着色器延迟
我正在尝试通过计算着色器进行视锥剔除。为此,我有一对用于实例化顶点属性的缓冲区,以及一对用于间接绘制命令的缓冲区。我的计算着色器检查第一个实例缓冲区中的实例坐标是否位于边界体积内,读取第一个绘制缓冲区中的实例数量,用 subgroupBallot 和 bitCount 计算该实例在子组内的偏移量,再加上其他子组的结果和全局偏移量,最后把结果写入第二个实例缓冲区。全局偏移量存储在第二个间接绘制缓冲区中。
问题在于,在负载较高时,剔除所用的视锥可能比移动中的相机滞后好几帧(> 1),导致画面边缘出现很宽的一片物体消失的区域。我觉得这很奇怪,因为剔除和渲染是在同一个命令缓冲区中完成的。
在 RenderDoc 中捕获帧、用 Alt+PrintScreen 截屏,或者暂停渲染/呈现线程时,画面会恢复到应有的状态。
我唯一的猜测是,即使已经开始绘制新的一帧,之前帧的计算着色器仍在继续执行;但由于设置了管线屏障(pipeline barrier),这本不应该发生。
着色器代码:
#version 460
#extension GL_KHR_shader_subgroup_ballot : require
// One indexed indirect draw command as stored in the Draw buffers.
// The host-side draw call uses sizeof(VkDrawIndexedIndirectCommand) as the
// stride over these records, so the five 32-bit fields must stay in this order.
// NOTE(review): vertexOffset is declared unsigned here; confirm this matches
// the signedness the host-side command structure expects.
struct drawData{
uint indexCount;
uint instanceCount; // overwritten by main() with the number of surviving instances
uint firstIndex;
uint vertexOffset;
uint firstInstance;
};
// Per-instance record copied verbatim from the input to the output instance buffer.
struct instanceData{
float x,y,z; // instance position; main() subtracts it from camPos.xyz
float a,b,c,d; // not read by this shader — presumably an orientation quaternion; TODO confirm
};
// 128 invocations per workgroup; main() uses this width as its batch size (patchSize).
layout(local_size_x = 128,local_size_y = 1,local_size_z = 1) in;
// Camera and frustum parameters consumed by the culling pass.
layout(set = 0,binding = 0) uniform A
{
mat4 cam; // not referenced by this shader
vec4 camPos; // xyz: camera position used as the origin for the culling tests
vec4 l; // xyz: first frustum plane vector; w: maximum distance (compared squared against dot(pos,pos))
vec4 t; // xyz: frustum plane vector
vec4 r; // xyz: frustum plane vector
vec4 b; // xyz: frustum plane vector
};
// Indirect draw buffers: Draw[0] is read as the source command, Draw[1]
// receives the culled command that main() writes.
// `count` occupies the first 4 bytes and the command array follows it; the
// host-side indirect draw is issued with a command offset of 0+4.
layout(set = 0,binding = 1) buffer B
{
uint count;
drawData data[];
} Draw[2];
// Instance buffers: Instance[0] holds every instance, Instance[1] receives
// the instances that pass the culling test, packed from index 0.
layout(set = 0,binding = 2) buffer C
{
instanceData data[];
} Instance[2];
// Per-subgroup count of instances that passed in the current batch,
// indexed by gl_SubgroupID.
// NOTE(review): 32 slots are enough for a 128-wide workgroup only when the
// subgroup size is at least 4 — confirm for the target hardware.
shared uint offsetsM[32];

// Culls the instances of Draw[0].data[0] against a distance limit and four
// frustum planes, compacts the survivors into Instance[1] and writes the
// resulting draw command to Draw[1].
//
// The running output offset is only known inside one workgroup, so this
// shader must be dispatched as a single workgroup.
void main()
{
    const uint gID = gl_LocalInvocationID.x;
    const uint patchSize = gl_WorkGroupSize.x;
    const uint instanceTotal = Draw[0].data[0].instanceCount;

    // The plane matrix does not depend on the instance, so build it once.
    const mat4x3 lrtb = mat4x3(l.xyz, r.xyz, t.xyz, b.xyz);

    // Round up so that a final, partially filled batch is processed too.
    // The count is the same for every invocation, which keeps barrier()
    // inside the loop in uniform control flow.
    const uint batches = instanceTotal / patchSize
                       + uint(instanceTotal % patchSize != 0u);

    uint offsetG = 0u; // survivors written by all previous batches
    for (uint i = 0u; i < batches; ++i) {
        const uint posa = i * patchSize + gID;

        // Invocations past the end of the last batch read element 0 instead
        // of reading out of bounds; inRange keeps them from passing.
        const bool inRange = posa < instanceTotal;
        const instanceData inst = Instance[0].data[inRange ? posa : 0u];

        // Position relative to the camera.
        const vec3 pos = camPos.xyz - vec3(inst.x, inst.y, inst.z);
        // One dot product per plane plus the radius tolerance.
        // NOTE(review): `Model` is not declared in the visible part of this
        // shader; it must be provided elsewhere for this to compile.
        const vec4 dist = pos * lrtb + Model.data[0].rad;

        const bool Pass = inRange &&                        // is a real instance
                          (dot(pos, pos) < l.w * l.w) &&    // not too far
                          all(greaterThan(dist, vec4(0)));  // within view frustum

        // Count the survivors of this subgroup and publish the count.
        const uvec4 actives = subgroupBallot(Pass);
        if (subgroupElect())
            offsetsM[gl_SubgroupID] = subgroupBallotBitCount(actives);
        barrier(); // every subgroup's count is now visible

        // Survivors in lower-numbered invocations of the same subgroup.
        const uint offsetL = subgroupBallotExclusiveBitCount(actives);

        // Survivors in lower-numbered subgroups, and in the whole batch.
        uint before = 0u;
        uint batchTotal = 0u;
        for (uint s = 0u; s < gl_NumSubgroups; ++s) {
            const uint n = offsetsM[s];
            if (s < gl_SubgroupID)
                before += n;
            batchTotal += n;
        }

        if (Pass)
            Instance[1].data[offsetG + before + offsetL] = inst;
        offsetG += batchTotal;

        // Nobody may overwrite offsetsM for the next batch while another
        // subgroup is still summing the counts of this one.
        barrier();
    }

    // A single invocation writes the output command, so the copied header
    // and the final instance count cannot race with each other.
    if (gID == 0u) {
        drawData cmd = Draw[0].data[0]; // copy data like index count
        cmd.instanceCount = offsetG;
        Draw[1].data[0] = cmd;
        Draw[1].count = Draw[0].count;
    }
}
对于计算之后的渲染通道(render pass),我设置了如下依赖项:
// External dependency: compute-shader writes must be available before
// subpass 0 reads its indirect draw commands.
{//1
    deps[1].srcSubpass = VK_SUBPASS_EXTERNAL;
    deps[1].dstSubpass = 0;
    deps[1].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    deps[1].dstStageMask = VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;
    deps[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    deps[1].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
    deps[1].dependencyFlags = 0;
}
// External dependency: compute-shader writes must be available before
// subpass 0 fetches vertex attributes.
{//2
    deps[2].srcSubpass = VK_SUBPASS_EXTERNAL;
    deps[2].dstSubpass = 0;
    deps[2].srcStageMask = VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT;
    deps[2].dstStageMask = VK_PIPELINE_STAGE_VERTEX_INPUT_BIT;
    deps[2].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
    deps[2].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
    deps[2].dependencyFlags = 0;
}
命令缓冲区的录制如下(录制后完全按原样重复使用,交换链中的每个图像对应一个):
// Start recording and run the culling compute shader.
vkBeginCommandBuffer(cmd,&begInfo);
// The shader declares all of its resources in set 0, so bind one descriptor
// set starting at set index 0, with no dynamic offsets.
vkCmdBindDescriptorSets(cmd,VK_PIPELINE_BIND_POINT_COMPUTE,layoutsPipe[1],0,1,&descs[1],0,nullptr);
vkCmdBindPipeline(cmd,VK_PIPELINE_BIND_POINT_COMPUTE,pipes[1]);
// One workgroup only: the shader accumulates its output offset across
// batches inside a single workgroup.
vkCmdDispatch(cmd,1,1,1);
// Make the compute shader's writes visible to the draw recorded afterwards.
// Value-initialise the array so pNext and any field not assigned below are
// zero rather than indeterminate.
VkBufferMemoryBarrier bufMemBar[2]{};
{//mem bars
    {//0 indirect
        bufMemBar[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        bufMemBar[0].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
        bufMemBar[0].dstAccessMask = VK_ACCESS_INDIRECT_COMMAND_READ_BIT;
        // No queue family ownership transfer is intended.
        bufMemBar[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        bufMemBar[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        bufMemBar[0].buffer = bufferIndirect;
        bufMemBar[0].offset = 0;
        bufMemBar[0].size = VK_WHOLE_SIZE;
    }
    {//1 vertex instance
        bufMemBar[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
        bufMemBar[1].srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
        bufMemBar[1].dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT;
        bufMemBar[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        bufMemBar[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
        bufMemBar[1].buffer = bufferInstance;
        bufMemBar[1].offset = 0;
        bufMemBar[1].size = VK_WHOLE_SIZE;
    }
}
// Compute writes -> indirect command read.
vkCmdPipelineBarrier(cmd,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT,0,0,nullptr,1,&bufMemBar[0],0,nullptr);
// Compute writes -> vertex attribute fetch of the instance buffer.
vkCmdPipelineBarrier(cmd,VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,VK_PIPELINE_STAGE_VERTEX_INPUT_BIT,0,0,nullptr,1,&bufMemBar[1],0,nullptr);
// Record the render pass that draws the culled instances, then finish the
// command buffer.
// Value-initialise so pNext and any field not assigned below are zero.
VkRenderPassBeginInfo passBegInfo{};
passBegInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
passBegInfo.renderPass = pass;
passBegInfo.framebuffer = chain.frames[i];
passBegInfo.renderArea = {{0,0},chain.dim};
VkClearValue clears[2]{{0},{0}};
passBegInfo.clearValueCount = 2;
passBegInfo.pClearValues = clears;
vkCmdBeginRenderPass(cmd,&passBegInfo,VK_SUBPASS_CONTENTS_INLINE);
// One descriptor set starting at set index 0, no dynamic offsets.
// NOTE(review): firstSet 0 is assumed for the graphics pipeline — confirm.
vkCmdBindDescriptorSets(cmd,VK_PIPELINE_BIND_POINT_GRAPHICS,layoutsPipe[0],0,1,&descs[0],0,nullptr);
vkCmdBindPipeline(cmd,VK_PIPELINE_BIND_POINT_GRAPHICS,pipes[0]);
VkBuffer buffersvertex[2]{bufferVertexProto,bufferInstance};
VkDeviceSize offsetsvertex[2]{0,0};
// Both vertex buffers are bound starting at binding 0.
vkCmdBindVertexBuffers(cmd,0,2,buffersvertex,offsetsvertex);
vkCmdBindIndexBuffer(cmd,bufferIndex,0,VK_INDEX_TYPE_UINT32);
// The draw commands start 4 bytes into bufferIndirect, after the draw count.
// NOTE(review): the count is taken from offset 0 of the same buffer to match
// the shader's `uint count; drawData data[];` layout — confirm.
vkCmdDrawIndexedIndirectCount(cmd,bufferIndirect,0+4,bufferIndirect,0,count.maxDraws,sizeof(VkDrawIndexedIndirectCommand));
vkCmdEndRenderPass(cmd);
vkEndCommandBuffer(cmd);
渲染和呈现通过两个信号量同步:imageAvailable 和 renderFinished。CPU 上视锥体的计算顺序是正确的。验证层已启用。
解决方法
问题在于我缺少主机(CPU)同步。事实上,即使是在同一个命令缓冲区内,也不存在任何主机同步保证(这是有道理的,因为正是这一点让我们能够使用事件)。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。