使用给定的 FLOPS 数量，GPU 可以进行多少次计算？

如何解决使用给定的 FLOPS 数量，GPU 可以进行多少次计算？

我有以下使用光线投射渲染体素的着色器：

#version 460
#extension GL_ARB_separate_shader_objects : enable
#pragma optionNV(unroll all)

// Per-frame parameters supplied by the host at binding 3.
// The block uses std140 layout, under which a vec3 is aligned to 16 bytes:
// camPos therefore starts at byte offset 16 (not 12), and the host-side
// struct must include the matching padding after thetaA.
layout(binding = 3,std140) uniform compVarsOb {
    // Time value; main() copies it into a local but the visible code does not use it further.
    float time;
    // Camera direction angle phi, fed directly to sin()/cos() in main(), i.e. in radians.
    float phiA;
    // Camera direction angle theta, fed directly to sin()/cos() in main(), i.e. in radians.
    float thetaA;
    // Camera position; used as the ray origin in main().
    vec3 camPos;
    // Field of view; main() passes it through radians(), so it is given in degrees.
    float fov;
    // Voxel grid extent along x; also the row stride when flattening (x, y, z) into a buffer index.
    int voxWidth;
    // Voxel grid extent along y; voxWidth * voxHeight is the slice stride of the flattened index.
    int voxHeight;
    // Voxel grid extent along z.
    int voxDepth;
} cvo;


// Output image at binding 2. It is write-only: main() stores one RGBA texel
// per invocation at the invocation's global (x, y) id.
layout(binding = 2,rgba8) uniform writeonly image2D img;

// Cheap pseudo-random hash of a 2D coordinate.
// The input is first wrapped into [0, 0.19) per component, projected onto a
// fixed weight vector, squared, and scrambled with fract(); the result lies
// in [0, 1).
float hash3(vec2 xy){
    vec2 wrapped = mod(xy,.19);
    // The .yyx swizzle weights y twice and x once.
    float seed = dot(wrapped.yyx,vec3(.013,27.15,2027.3));
    float squared = seed * seed;
    float scrambled = squared * fract(squared);

    return fract(scrambled);
}






//layout(binding = 4) uniform sampler3D voxels;


// Voxel data at binding 4, one vec4 per voxel.
// main() addresses it as a flattened 3D grid:
//     index = x + y * cvo.voxWidth + z * cvo.voxWidth * cvo.voxHeight
// and interprets .xyz as the voxel colour and .w as its opacity.
layout(binding = 4,std140) buffer vData{
    vec4 voxels[];
};


// Returns 1.0 when a is strictly greater than b, otherwise 0.0.
//
// The previous formulation, (1 + d / abs(d)) / 2 with d = a - b, divided
// zero by zero when a == b and so produced NaN, which then poisoned any
// product it was multiplied into. A direct comparison has no such case.
float greaterThan(float a,float b){
    return float(a > b);
}

// Returns 1.0 when a is strictly less than b, otherwise 0.0.
//
// The previous formulation, (1 - d / abs(d)) / 2 with d = a - b, divided
// zero by zero when a == b and so produced NaN. A direct comparison has no
// such case.
float lesserThan(float a,float b){
    return float(a < b);
}

// Returns 1.0 when the integer voxel coordinate `li` addresses a cell inside
// the grid described by cvo.voxWidth / voxHeight / voxDepth, otherwise 0.0.
//
// Valid coordinates are 0 <= li < size on every axis. The earlier version
// tested with a strict "> 0", which rejected the first cell on each axis,
// and it went through float helpers that yield NaN when a coordinate equals
// 0 or the dimension. Comparing the integers directly avoids both problems.
float withinBounds(ivec3 li){
    ivec3 gridSize = ivec3(cvo.voxWidth,cvo.voxHeight,cvo.voxDepth);
    bool inside = all(greaterThanEqual(li,ivec3(0))) && all(lessThan(li,gridSize));
    return float(inside);
}


// Hamilton product a * b of two quaternions.
// Both operands and the result store the scalar part in .x and the vector
// part in .yzw.
vec4 quaternionMult(vec4 a,vec4 b){
    float scalarPart = a.x * b.x - dot(a.yzw,b.yzw);
    vec3 vectorPart = a.x*b.yzw + b.x*a.yzw + cross(a.yzw,b.yzw);
    return vec4(scalarPart,vectorPart);
}


// Compute-shader entry point: casts one ray per invocation through the voxel
// grid and stores the accumulated colour in `img` at the invocation's pixel.
//
// The ray direction is the camera direction (built from cvo.phiA and
// cvo.thetaA) rotated by two quaternions derived from the pixel position and
// cvo.fov. The ray is sampled at 20 fixed steps of 0.4 units starting at
// cvo.camPos, and voxel colours are composited front to back, weighted by
// the remaining transmittance.
//
// NOTE(review): no layout(local_size_x/y/z) declaration is visible in this
// file. If there is none, the work-group size presumably falls back to
// 1x1x1, which would leave most SIMD lanes idle — confirm together with the
// host-side dispatch dimensions before changing it.
void main()
{
    // Output resolution is hard-coded; it must match the dispatch dimensions.
    vec2 iResolution = vec2(2560.,1440.);
    vec2 fragCoord = gl_GlobalInvocationID.xy;
    ivec2 fragI = ivec2(gl_GlobalInvocationID.xy);

    // Pixel position normalised to [0, 1) on both axes.
    vec2 uv = fragCoord/iResolution.xy;

    vec3 col = vec3(0.);

    float screenRatio = iResolution.y / iResolution.x;

    //Setting up the ray directions and other information about the point and camera
    //##############################################################################

    //camera direction angles phi (xy plane) and theta (xz plane)
    float phi = cvo.phiA;
    float theta = cvo.thetaA;

    //get the camera direction as the basis for the rotation (each ray direction is a rotation of the camera direction vector)
    //it is in quaternion form here (zero scalar part) so it is a vec4 instead of a vec3
    vec4 camD = vec4(0.,cos(phi) * sin(theta),sin(phi) * sin(theta),cos(theta));

    float rad90 = radians(90.);

    float fov = cvo.fov;

    //per-pixel rotation angles away from the camera direction
    float xAng = radians(fov * (.5 - uv.x));
    //replace "fov" with "(fov + (110. * pow(.5 - uv.x,2.)))" below to counteract the fisheye lens effect
    //it compensates for the quaternion artifact where a large rotation about one axis followed by a rotation
    //about a perpendicular axis just rotates around the first result, shrinking the new direction
    float yAng = radians(fov * screenRatio * (uv.y - .5));

    //get the axes that the quaternions should be based around (perpendicular to the camera direction)
    vec3 xRotAxis = vec3(cos(phi) * sin(theta - rad90),sin(phi) * sin(theta - rad90),cos(theta - rad90));
    vec3 yRotAxis = cross(xRotAxis,camD.yzw);

    //get the quaternions of the ray direction rotations
    vec4 xQuat = vec4(cos(xAng / 2.),xRotAxis * sin(xAng / 2.));
    vec4 yQuat = vec4(cos(yAng / 2.),yRotAxis * sin(yAng / 2.));

    //combine the rotations
    vec4 compQuat = quaternionMult(yQuat,xQuat);

    //get the conjugate of compQuat
    vec4 conjComp = vec4(compQuat.x,-compQuat.yzw);

    //ray direction: rotate the camera direction as q * v * conj(q)
    vec3 rayD = normalize(quaternionMult(quaternionMult(compQuat,camD),conjComp).yzw);

    //camera location (ray origin)
    vec3 cam = cvo.camPos;

    //############################################

    // Grid extent as floats, for comparison against the sample position.
    vec3 gridSize = vec3(cvo.voxWidth,cvo.voxHeight,cvo.voxDepth);

    //hit = 1. means that nothing has been hit or everything has been completely transparent
    float hit = 1.;

    for(int i = 0; i < 20; i++){
        vec3 locf = (i * rayD * .4) + cam;

        // Reject the sample BEFORE touching the buffer. Previously the voxel
        // was fetched first and discarded afterwards, so out-of-range indices
        // were still read from the storage buffer. The upper bound is
        // exclusive: a coordinate equal to the dimension would otherwise
        // index one cell past the row/slice. Written as "inside" rather than
        // "outside" so that a NaN position is rejected as well.
        bool inside = all(greaterThanEqual(locf,vec3(0.))) && all(lessThan(locf,gridSize));
        if(!inside){
            // An outside sample contributes nothing and leaves `hit` unchanged.
            continue;
        }

        //flatten the 3D cell coordinate for the buffer indexing
        ivec3 loc = ivec3(locf);
        int locI = loc.x + loc.y * cvo.voxWidth + loc.z * cvo.voxWidth * cvo.voxHeight;

        vec4 v = voxels[locI];

        // Front-to-back compositing: weight the colour by opacity and by the
        // transmittance still remaining along the ray.
        col += v.xyz * hit * v.w;

        hit -= v.w;

        // Fully opaque so far: later samples cannot contribute.
        if(hit <= 0.){
            break;
        }
    }

    imageStore(img,fragI,vec4(col,1.0));
}

它产生这个：

问题是：当我把循环（位于 main() 的末尾）的迭代次数，也就是光线要遍历的体素数量，改为大于 2 的值（现在是 20）时，fps 会急剧下降。但我觉得我的 GPU 完全有能力执行 2 次以上这种开销不大的循环迭代，所以我不确定到底发生了什么。

我使用的是 RTX 2060 Super，据称它的峰值性能可达 7.81 * 10^12 FLOPS。如果我理解正确，这意味着若想以 144 fps、1440p 的分辨率运行计算着色器，每次着色器调用（即每个像素）可以分配到 (7.81 * 10^12)/(144 * 2560 * 1440) 次浮点运算，大约是 14712 次。这远多于我目前在计算着色器中执行的运算量，但是当循环进行 20 次迭代时，我的代码平均只有 30 fps。只有把循环减少到 1 或 2 次迭代时，我才能达到 144+ fps（此时基本上就像根本没有循环一样）。对于计算着色器来说，循环的代价是否特别高？我哪里出错了？

解决方法

GPU 的标称算力通常是在完美的条件下、用完美的代码计算出来的。也就是说，每次计算都是一次融合乘加，能够紧接着前一次计算立即开始，数据已经在缓存/寄存器中，并且每个内核都在工作。如何逼近这样的条件，正是编写 GPU 代码的难点所在。

GPU 通常为每个内核创建多个线程，以掩盖等待内存访问的代价。缓存可以提供很高的带宽，但它们相对较小，并且需要内存局部性（即一起使用的数据在物理上也存放在相邻位置）才能有效发挥作用。一些 GPU 有单独的纹理缓存，善加利用它可能是明智的。驱动程序还会尝试以高效的方式存储纹理和图像，至少会让相邻的像素在内存中也彼此相邻，并且可能会用到一些专用硬件。

在计算着色器中，线程以较大的块为单位创建，然后分配给计算单元（不同供应商的术语不同，基本上是一种“元核心”，具有自己的缓存和一组内核）。块的大小在着色器中通过 layout(local_size_x = X,local_size_y = Y,local_size_z = Z) in; 定义，块内线程总数等于各维度大小的乘积。块内的线程可以通过 shared 变量通信（这些变量通常与 L1 缓存存放在同一块存储空间中），并通过 barrier() 等进行同步。GPU 有多个计算单元，为了让它们都有工作可做，应该启动多个块（vkCmdDispatch() 中的值就是要启动的工作组数量）。

线程还会被隐式地划分成类似 SIMD 的组（nVidia 称之为 SIMT，即单指令多线程）。这种组中的每个线程都执行相同的指令（较新的 nVidia 显卡在这一点上略有不同）。如果 if()、for()、while() 等使得只有部分线程执行某段代码，那么一部分内核会被禁用，从而可能浪费性能。这种组的大小通常为 32 或 64（AMD），因此线程块的大小应取该数字的整数倍。这类组的某些功能通过 subgroup 扩展公开。

由于您正在使用体素，因此我认为使用 3D 纹理或图像而不是缓冲区可能对缓存有益。弄清楚如何让线程相互协作，利用快速共享内存也应该是一个好主意。

使用给定的 FLOPS 数量，GPU 可以进行多少次计算？

如何解决使用给定的 FLOPS 数量，GPU 可以进行多少次计算？

解决方法

相关推荐