如何解决Vanilla Vulkan Compute Shader 不写入输出缓冲区
编辑:修复双重取消映射(但不能解决问题)
EDIT2:修复了 API 版本,并从代码中删除了验证层;改为在运行时通过环境变量 VK_INSTANCE_LAYERS=VK_LAYER_KHRONOS_validation 启用验证层。问题仍然存在。
EDIT3:忘记了描述符集,它允许将缓冲区绑定到着色器输入。但仍然没有解决问题:'(
为了学习 Vulkan API,我开始使用基本的计算着色器编写一个简单的仅计算示例。它将一个 int 缓冲区上传到 GPU,运行一个计算着色器,增加每个 int 并将结果写入第二个缓冲区。
我的问题是一切正常,但我的输出缓冲区没有得到预期的结果,我不知道为什么。看起来计算着色器已调度,但输出缓冲区从未写入。
为了观察这一点,我首先将随机数上传到我的输入缓冲区并用值 2 填充我的输出缓冲区。然后调度计算着色器,它应该从输入中读取每个值 X,并将 X+1 写入输出缓冲区.
等待完成后,我映射输出缓冲区并显示其数据,结果里全都是 2 :'(
注意:绑定到缓冲区的内存是用 VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT 创建的。
所以 Vulkan 中肯定有一个我弄错的概念,或者我看不到的标志/设置中的微妙之处......
计算着色器代码:
#version 450 core
// Values read by the shader: 25 four-component unsigned vectors.
layout (set = 0,binding = 0) buffer InputBuffer {
    uvec4 inputData[25];
};
// Values written by the shader: one result per input element.
layout (set = 0,binding = 1) buffer OutputBuffer {
    uvec4 outputData[25];
};
layout (local_size_x = 8,local_size_y = 1,local_size_z = 1) in;
// Writes inputData[gid] + 1 (component-wise) to outputData[gid].
void main()
{
    uint gid = gl_GlobalInvocationID.x;
    // The dispatch may launch more invocations than there are elements,
    // so invocations past the end of the arrays must do nothing.
    if (gid < 25)
        // uvec4(1) broadcasts the scalar to all four components; a constructor
        // given three scalars does not provide enough components for a uvec4.
        outputData[gid] = inputData[gid] + uvec4(1);
}
以及整个示例代码(因为我不知道我哪里错了,我已经粘贴了整个内容,抱歉):
#include <assert.h>

#include <cstdint>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <vector>

#include <vulkan/vulkan.h>
// Some helper functions
// Short aliases for the fixed-width unsigned integer types used throughout.
using u32 = uint32_t;
using u64 = uint64_t;
// Vulkan two steps enumeration function
// Two-call enumeration helper for functions of the form
//     func(arg1, u32* count, T* out)
// The first call passes a null output pointer to obtain the element count, the
// second call fills `vec` after it has been resized to that count. `vec` is
// cleared first, so it is left empty when nothing is reported.
// The macro expands to a complete brace-enclosed block: call sites do not need
// a trailing semicolon.
#define COUNT_AND_GET1(func,vec,arg1) {\
    u32 size = 0; \
    (vec).clear(); \
    func(arg1,&size,nullptr); \
    if(size > 0) { \
        (vec).resize(size); \
        func(arg1,&size,(vec).data()); }\
}
// Two-call enumeration helper for functions of the form
//     func(arg1, arg2, u32* count, T* out)
// Same contract as the one-argument variant: `vec` is cleared, the count is
// queried with a null output pointer, and `vec` is resized and filled only
// when the count is non-zero.
// The macro expands to a complete brace-enclosed block: call sites do not need
// a trailing semicolon.
#define COUNT_AND_GET2(func,vec,arg1,arg2) {\
    u32 size = 0; \
    (vec).clear(); \
    func(arg1,arg2,&size,nullptr); \
    if(size > 0) { \
        (vec).resize(size); \
        func(arg1,arg2,&size,(vec).data()); }\
}
// Basic vec4 data
// Host-side element type of the data buffers: four 32-bit unsigned components.
struct vec4
{
    u32 x;
    u32 y;
    u32 z;
    u32 w;
};
// Everything queried about one physical device while choosing which one to use.
struct PhysicalDeviceProps
{
// General properties (device type, limits, ...), from vkGetPhysicalDeviceProperties.
VkPhysicalDeviceProperties m_Properties;
// Device features. NOTE(review): no visible code queries this member - confirm
// whether it is meant to be filled by vkGetPhysicalDeviceFeatures.
VkPhysicalDeviceFeatures m_Features;
// Memory types and heaps, from vkGetPhysicalDeviceMemoryProperties.
VkPhysicalDeviceMemoryProperties m_MemoryProperties;
// One entry per queue family, from vkGetPhysicalDeviceQueueFamilyProperties.
std::vector<VkQueueFamilyProperties> m_QueueFamilyProperties;
// Device layers, from vkEnumerateDeviceLayerProperties.
std::vector<VkLayerProperties> m_LayerProperties;
// Device extensions, from vkEnumerateDeviceExtensionProperties.
std::vector<VkExtensionProperties> m_ExtensionProperties;
};
// Return device memory index that matches specified properties
// Returns the index of a device memory type usable for a resource.
//
// memoryTypeBits      - bit i set means memory type i is acceptable
//                       (VkMemoryRequirements::memoryTypeBits).
// memoryProperties    - memory types reported by the physical device.
// preferredProperties - flags tried first; the first acceptable type having all
//                       of them is returned.
// requiredProperties  - fallback flags, used only when no type satisfies the
//                       preferred flags.
//
// Returns u32(-1) when no acceptable memory type matches either flag set.
u32 SelectMemoryHeapFrom(u32 memoryTypeBits,const VkPhysicalDeviceMemoryProperties& memoryProperties,VkMemoryPropertyFlags preferredProperties,VkMemoryPropertyFlags requiredProperties)
{
    assert((preferredProperties & requiredProperties) > 0);

    const u32 notFound = u32(-1);

    // Only the first memoryTypeCount entries of memoryTypes are meaningful; the
    // clamp protects the array bound should the count ever be out of range.
    const u32 maxTypes = u32(VK_MAX_MEMORY_TYPES);
    const u32 typeCount = memoryProperties.memoryTypeCount < maxTypes ? memoryProperties.memoryTypeCount : maxTypes;

    // First acceptable memory type whose flags include every bit of `wanted`.
    const auto firstTypeWith = [&](VkMemoryPropertyFlags wanted) -> u32
    {
        for (u32 typeIndex = 0; typeIndex < typeCount; ++typeIndex)
        {
            // Unsigned literal: shifting a signed 1 into bit 31 is not portable.
            const bool acceptable = (memoryTypeBits & (u32(1) << typeIndex)) != 0;
            const bool hasAllFlags = (memoryProperties.memoryTypes[typeIndex].propertyFlags & wanted) == wanted;
            if (acceptable && hasAllFlags)
                return typeIndex;
        }
        return notFound;
    };

    const u32 preferredType = firstTypeWith(preferredProperties);
    if (preferredType != notFound)
        return preferredType;
    return firstTypeWith(requiredProperties);
}
// **** MAIN FUNCTION ****
void SampleCompute()
{
// -------------------------------------
// 1. Create Instance
// -------------------------------------
VkApplicationInfo appInfo = { VK_STRUCTURE_TYPE_APPLICATION_INFO,nullptr,"SampleCompute","MyEngine",VK_API_VERSION_1_2 };
VkInstanceCreateInfo instCreateInfo = { VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO,&appInfo,nullptr };
VkInstance instance = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateInstance(&instCreateInfo,&instance))
std::cout << "Instance creation failed!\n";
// ---------------------------------------------------
// 2. Enumerate physical devices and select 'best' one
// ---------------------------------------------------
VkPhysicalDevice bestDevice = VK_NULL_HANDLE;
PhysicalDeviceProps bestDeviceProps;
{
std::vector<VkPhysicalDevice> physicalDevices;
COUNT_AND_GET1(vkEnumeratePhysicalDevices,physicalDevices,instance)
assert(!physicalDevices.empty());
std::vector< PhysicalDeviceProps> physicalDeviceProps(physicalDevices.size());
for (u64 i = 0; i < physicalDevices.size(); ++i)
{
vkGetPhysicalDeviceProperties(physicalDevices[i],&physicalDeviceProps[i].m_Properties);
vkGetPhysicalDeviceMemoryProperties(physicalDevices[i],&physicalDeviceProps[i].m_MemoryProperties);
COUNT_AND_GET1(vkGetPhysicalDeviceQueueFamilyProperties,physicalDeviceProps[i].m_QueueFamilyProperties,physicalDevices[i])
COUNT_AND_GET1(vkEnumerateDeviceLayerProperties,physicalDeviceProps[i].m_LayerProperties,physicalDevices[i])
COUNT_AND_GET2(vkEnumerateDeviceExtensionProperties,physicalDeviceProps[i].m_ExtensionProperties,physicalDevices[i],nullptr)
}
u64 bestDeviceIndex = 0;
for (u64 i = 1; i < physicalDevices.size(); ++i)
{
const bool isDiscrete = physicalDeviceProps[bestDeviceIndex].m_Properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU;
const bool otherIsDiscrete = physicalDeviceProps[i].m_Properties.deviceType == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU;
if (isDiscrete && !otherIsDiscrete)
continue;
else if ((!isDiscrete && otherIsDiscrete)
|| (physicalDeviceProps[bestDeviceIndex].m_Properties.limits.maxFramebufferWidth < physicalDeviceProps[i].m_Properties.limits.maxFramebufferWidth))
bestDeviceIndex = i;
}
bestDevice = physicalDevices[bestDeviceIndex];
bestDeviceProps = physicalDeviceProps[bestDeviceIndex];
}
// ---------------------------------------------------
// 3. Find queue family which support compute pipeline
// ---------------------------------------------------
u32 computeQueue = 0;
while (computeQueue < bestDeviceProps.m_QueueFamilyProperties.size()
&& ((bestDeviceProps.m_QueueFamilyProperties[computeQueue].queueFlags & VK_QUEUE_COMPUTE_BIT) != VK_QUEUE_COMPUTE_BIT))
{
++computeQueue;
}
assert(computeQueue < bestDeviceProps.m_QueueFamilyProperties.size());
// -------------------------------
// 4. Create logical device
// -------------------------------
VkDeviceQueueCreateInfo queueInfo = { VK_STRUCTURE_TYPE_DEVICE_QUEUE_CREATE_INFO,computeQueue,nullptr };
VkPhysicalDeviceFeatures features = {};
VkDeviceCreateInfo createInfo = {
VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO,&queueInfo,&features
};
VkDevice device = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateDevice(bestDevice,&createInfo,&device))
std::cout << "Logical Device creation failed\n";
// -------------------------------
// 5. Create data buffers
// -------------------------------
constexpr u64 elemCount = 25;
constexpr u64 bufferSize = elemCount * sizeof(vec4);
VkBufferCreateInfo bufferCreateInfo = {
VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,bufferSize,VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,VK_SHARING_MODE_EXCLUSIVE,nullptr
};
VkBuffer inputBuffer = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateBuffer(device,&bufferCreateInfo,&inputBuffer))
std::cout << "Creating input buffer failed!\n";
VkMemoryRequirements inputBufferMemory;
vkGetBufferMemoryRequirements(device,inputBuffer,&inputBufferMemory);
VkBuffer outputBuffer = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateBuffer(device,&outputBuffer))
std::cout << "Creating output buffer failed!\n";
VkMemoryRequirements outputBufferMemory;
vkGetBufferMemoryRequirements(device,outputBuffer,&outputBufferMemory);
// -------------------------------
// 6. Allocate memory for buffers
// -------------------------------
u32 inputMemoryIndex = SelectMemoryHeapFrom(inputBufferMemory.memoryTypeBits,bestDeviceProps.m_MemoryProperties,VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VkMemoryAllocateInfo inputAllocationInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,inputBufferMemory.size,inputMemoryIndex };
VkDeviceMemory inputMemory = VK_NULL_HANDLE;
if (VK_SUCCESS != vkAllocateMemory(device,&inputAllocationInfo,&inputMemory))
std::cout << "Memory allocation of " << inputBufferMemory.size << " failed!\n";
u32 outputMemoryIndex = SelectMemoryHeapFrom(outputBufferMemory.memoryTypeBits,VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT | VK_MEMORY_PROPERTY_HOST_COHERENT_BIT);
VkMemoryAllocateInfo outputAllocationInfo = { VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,outputBufferMemory.size,outputMemoryIndex };
VkDeviceMemory outputMemory = VK_NULL_HANDLE;
if (VK_SUCCESS != vkAllocateMemory(device,&outputAllocationInfo,&outputMemory))
std::cout << "Memory allocation of " << outputBufferMemory.size << " failed!\n";
// -------------------------------
// 7. Bind buffers to memory
// -------------------------------
if (vkBindBufferMemory(device,inputMemory,0) != VK_SUCCESS)
std::cout << "Input buffer binding failed!\n";
if (vkBindBufferMemory(device,outputMemory,0) != VK_SUCCESS)
std::cout << "Output buffer binding failed!\n";
// ----------------------------------
// 8. Map buffers and upload data
// ----------------------------------
vec4* inputData = nullptr;
if (VK_SUCCESS != vkMapMemory(device,VK_WHOLE_SIZE,(void**)(&inputData)))
std::cout << "Input memory mapping failed!\n";
for (u32 i = 0; i < elemCount; ++i)
{
inputData[i].x = static_cast<u32>(rand() / (float)RAND_MAX * 100);
inputData[i].y = static_cast<u32>(rand() / (float)RAND_MAX * 100);
inputData[i].z = static_cast<u32>(rand() / (float)RAND_MAX * 100);
inputData[i].w = static_cast<u32>(rand() / (float)RAND_MAX * 100);
std::cout << inputData[i].x << "," << inputData[i].y << "," << inputData[i].z << "," << inputData[i].w << ",";
}
std::cout << "\n\n\n";
vkUnmapMemory(device,inputMemory);
vec4* initialOutputData = nullptr;
if (VK_SUCCESS != vkMapMemory(device,(void**)(&initialOutputData)))
std::cout << "Output memory mapping failed!\n";
for (u32 i = 0; i < elemCount; ++i)
{
initialOutputData[i].x = 2; initialOutputData[i].z = 2; initialOutputData[i].y = 2; initialOutputData[i].w = 2;
}
vkUnmapMemory(device,outputMemory);
// ----------------------------------
// 9. Create shader/pipeline layout
// ----------------------------------
std::vector<VkDescriptorSetLayoutBinding> bindings = {
{ 0,VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,VK_SHADER_STAGE_COMPUTE_BIT,nullptr },{ 1,nullptr }
};
VkDescriptorSetLayoutCreateInfo layoutInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,2,bindings.data() };
VkDescriptorSetLayout descriptorLayout = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateDescriptorSetLayout(device,&layoutInfo,&descriptorLayout))
std::cout << "Descriptor Layout creation failed!\n";
// Create pipeline layout
VkPipelineLayoutCreateInfo pipelineCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,&descriptorLayout,nullptr };
VkPipelineLayout layout = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreatePipelineLayout(device,&pipelineCreateInfo,&layout))
std::cout << "Pipeline Layout creation failed\n";
// --------------------------------------------------
// 10. Load shader source and create shader module
// --------------------------------------------------
std::ifstream file("ComputeShader.spv",std::ifstream::binary);
u64 size = 0;
if (!file.is_open())
std::cout << "Can't open shader!\n";
file.seekg(0,file.end);
size = file.tellg();
file.seekg(0);
char* shaderSrc = new char[size];
file.read(shaderSrc,size);
VkShaderModuleCreateInfo shaderCreateInfo = { VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO,size,reinterpret_cast<u32*>(shaderSrc) };
VkShaderModule shader = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateShaderModule(device,&shaderCreateInfo,&shader))
std::cout << "Shader Module creation failed\n";
delete[] shaderSrc;
// ----------------------------------
// 10.5. Create descriptor sets
// ----------------------------------
VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,2 };
VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = {
VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,&descriptorPoolSize };
VkDescriptorPool descriptorPool = VK_NULL_HANDLE;
vkCreateDescriptorPool(device,&descriptorPoolCreateInfo,&descriptorPool);
VkDescriptorSetAllocateInfo descriptorSetAllocateInfo = {
VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,descriptorPool,&descriptorLayout
};
VkDescriptorSet descriptorSet;
vkAllocateDescriptorSets(device,&descriptorSetAllocateInfo,&descriptorSet);
VkDescriptorBufferInfo inputBufferDescriptorInfo = { inputBuffer,VK_WHOLE_SIZE };
VkDescriptorBufferInfo outputBufferDescriptorInfo = { outputBuffer,VK_WHOLE_SIZE };
VkWriteDescriptorSet writeDescriptorSet[2] = {
{
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,descriptorSet,&inputBufferDescriptorInfo,0
},{
VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,&outputBufferDescriptorInfo,0
}
};
vkUpdateDescriptorSets(device,writeDescriptorSet,nullptr);
// -------------------------------
// 11. Create compute pipeline
// -------------------------------
const char* entryPointName = "main";
VkComputePipelineCreateInfo computeCreateInfo = {
VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,{
VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,shader,entryPointName,nullptr
},layout,VK_NULL_HANDLE,0
};
VkPipeline pipeline = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateComputePipelines(device,&computeCreateInfo,&pipeline))
std::cout << "Compute Pipeline creation failed!\n";
// ------------------------------------------------
// 12. Create Command Pool and Command Buffer
// --------------------------------------------------
VkCommandPoolCreateInfo poolInfo = { VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,computeQueue };
VkCommandPool cmdPool = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateCommandPool(device,&poolInfo,&cmdPool))
std::cout << "Command Pool creation failed!\n";
VkCommandBufferAllocateInfo cmdBufferInfo = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,cmdPool,VK_COMMAND_BUFFER_LEVEL_PRIMARY,1
};
VkCommandBuffer cmdBuffer = VK_NULL_HANDLE;
if (VK_SUCCESS != vkAllocateCommandBuffers(device,&cmdBufferInfo,&cmdBuffer))
std::cout << "Command buffer allocation failed!\n";
// ---------------------------
// 13. Run compute shader
// ---------------------------
VkCommandBufferUsageFlags flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
VkCommandBufferBeginInfo beginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,nullptr };
vkBeginCommandBuffer(cmdBuffer,&beginInfo);
vkCmdBindDescriptorSets(cmdBuffer,VK_PIPELINE_BIND_POINT_COMPUTE,&descriptorSet,0);
vkCmdBindPipeline(cmdBuffer,pipeline);
vkCmdDispatch(cmdBuffer,8,1);
vkEndCommandBuffer(cmdBuffer);
// -----------------------------------------
// 14. Submit command buffer (with fence)
// -----------------------------------------
VkFenceCreateInfo fenceCreateInfo = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,(VkFenceCreateFlags)0 };
VkFence fence = VK_NULL_HANDLE;
if (VK_SUCCESS != vkCreateFence(device,&fenceCreateInfo,&fence))
std::cout << "Fence creation failed!\n";
VkQueue queue = VK_NULL_HANDLE;
vkGetDeviceQueue(device,&queue);
VkSubmitInfo submitInfo = {
VK_STRUCTURE_TYPE_SUBMIT_INFO,&cmdBuffer,nullptr
};
VkResult result = vkQueueSubmit(queue,&submitInfo,fence);
// Wait for everything finished
if (result == VK_SUCCESS)
{
result = vkQueueWaitIdle(queue);
}
vkWaitForFences(device,&fence,VK_TRUE,u64(-1));
// ---------------------------------
// 15. Grab and display results
// ---------------------------------
vec4* resultData = nullptr;
if (VK_SUCCESS != vkMapMemory(device,(void**)(&resultData)))
std::cout << "Output memory mapping failed!\n";
for (u32 i = 0; i < elemCount; ++i)
{
std::cout << resultData[i].x << "," << resultData[i].y << "," << resultData[i].z << "," << resultData[i].w << ",outputMemory);
// ------------------------
// 16. Resources Cleanup
// ------------------------
vkFreeCommandBuffers(device,&cmdBuffer);
vkDestroyCommandPool(device,nullptr);
vkDestroyFence(device,fence,nullptr);
vkDestroyPipeline(device,pipeline,nullptr);
vkDestroyPipelineLayout(device,nullptr);
vkDestroyShaderModule(device,nullptr);
vkDestroyDescriptorSetLayout(device,descriptorLayout,nullptr);
vkDestroyBuffer(device,nullptr);
vkDestroyBuffer(device,nullptr);
vkFreeMemory(device,nullptr);
if (VK_SUCCESS != vkDeviceWaitIdle(device))
std::cout << "Can't wait for device to idle\n";
vkDestroyDevice(device,nullptr);
vkDestroyInstance(instance,nullptr);
}
解决方法
我认为问题可能是不同步,特别是缺少内存域操作。有些平台可能不喜欢它...
在命令缓冲区的末尾,您需要这个特殊的管道屏障,将写入从设备域转换到主机域:
// Makes the compute shader's writes to outputBuffer available to host reads.
// Record this after the dispatch, before ending the command buffer.
VkBufferMemoryBarrier outbuffDependency = {};
outbuffDependency.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER;
outbuffDependency.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT;
outbuffDependency.dstAccessMask = VK_ACCESS_HOST_READ_BIT;
// No queue family ownership transfer is intended.
outbuffDependency.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
outbuffDependency.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED;
outbuffDependency.buffer = outputBuffer;
outbuffDependency.offset = 0;
outbuffDependency.size = VK_WHOLE_SIZE;
vkCmdPipelineBarrier(
    cmdBuffer,
    VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,   // source stage: the shader writes
    VK_PIPELINE_STAGE_HOST_BIT,             // destination stage: host reads
    (VkDependencyFlags)0,
    0,nullptr,                              // no global memory barriers
    1,&outbuffDependency,                   // one buffer memory barrier
    0,nullptr                               // no image memory barriers
);
Vulkan 有一个独特的内存域概念。有主机域,也有设备域。相同的内存在每个域中可以有不同的状态。例如。内存写入在设备域中可见并不意味着它在主机域中也可见。
围栏(或 vk*WaitIdle)并不包含内存域操作,规范中对此有如下警告:
注意
发信号给fence并在主机上等待并不能保证内存访问的结果对主机可见,因为fence定义的内存依赖的访问范围只包括设备访问。必须使用 memory barrier 或其他内存依赖来保证这一点。有关详细信息,请参阅 host access types 的说明。
唯一包含域操作的是带有 VK_PIPELINE_STAGE_HOST_BIT 的内存依赖,或者 vkQueueSubmit(您确实通过它将 inputBuffer 从主机域转移到了设备域)。
验证层无法合理地捕获此错误,因为它们无法知道(没有一些侵入性的操作系统调试功能)您是否确实通过映射指针从缓冲区读取数据。
所以,它终于起作用了 :)
我在尝试各种做法的过程中做了很多改动,在某个时候我的输入缓冲区被绑定成了 uniform 缓冲区(统一缓冲区)……
现在它作为存储缓冲区返回,并且描述符集已正确创建和更新,我得到了预期的输出。
内存屏障不是强制性的,但我想当我有一个更复杂的例子时,这是一个很好的做法,其中多次通过,缓冲区的使用不同。
感谢大家的帮助,它确实帮助我弄清楚了所有可能使 Vulkan 实施成功或失败的小细节。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。