四倍过采样性能 - 编程之家

如何解决四倍过采样性能

在开发一个主要依赖 4×4 过采样的渲染引擎的过程中，我遇到了缩小（下采样）步骤本身的性能问题。

#include <stdint.h>

    /*
     * 256-entry lookup table indexed by an 8-bit colour channel value
     * (RGBavg16 indexes it with `& 0xFF`). Per its name and use, it maps an
     * sRGB-encoded channel to a linear intensity; the entries rise from 0 to
     * 65536, so a sum of sixteen entries stays well within 32 bits.
     */
    const int_fast32_t sRGBtolinear[256] = {0,20,40,60,80,99,119,139,159,179,199,219,241,264,288,313,340,367,396,427,458,491,526,562,599,637,677,718,761,805,851,898,947,997,1048,1101,1156,1212,1270,1330,1391,1453,1517,1583,1651,1720,1791,1863,1937,2013,2090,2170,2250,2333,2418,2504,2592,2681,2773,2866,2961,3058,3157,3258,3360,3464,3570,3678,3788,3900,4014,4129,4247,4366,4488,4611,4736,4864,4993,5124,5257,5392,5530,5669,5810,5953,6099,6246,6395,6547,6701,6856,7014,7174,7336,7500,7666,7834,8004,8177,8352,8529,8708,8889,9072,9258,9446,9636,9828,10022,10219,10418,10619,10822,11028,11236,11446,11658,11873,12090,12309,12531,12754,12981,13209,13440,13673,13909,14147,14387,14629,14874,15122,15372,15624,15878,16135,16394,16656,16920,17187,17456,17727,18001,18278,18556,18838,19121,19408,19696,19988,20281,20578,20876,21178,21481,21788,22096,22408,22722,23038,23357,23679,24003,24329,24659,24991,25325,25662,26002,26344,26689,27036,27387,27739,28095,28453,28813,29177,29543,29911,30283,30657,31033,31413,31795,32180,32567,32957,33350,33746,34144,34545,34949,35355,35765,36177,36591,37009,37429,37852,38278,38707,39138,39572,40009,40449,40892,41337,41786,42237,42691,43147,43607,44069,44534,45003,45474,45947,46424,46904,47386,47871,48360,48851,49345,49842,50342,50844,51350,51859,52370,52884,53402,53922,54445,54972,55501,56033,56568,57106,57647,58191,58738,59288,59841,60397,60956,61518,62083,62651,63222,63796,64373,64953,65536};
    /*
     * Ascending thresholds consumed by lineartosRGB(): that function returns
     * the largest index whose threshold is <= its argument.
     * NOTE(review): the entries look like the rounded midpoints between
     * sRGBtolinear[k-1] and sRGBtolinear[k] (e.g. 10 = (0+20)/2), which would
     * make the lookup round to the nearest sRGB code -- confirm how the table
     * was generated.
     */
    const int_fast32_t lineartosRGBthr[256] = {0,10,30,50,70,90,110,130,150,170,189,209,230,253,276,301,327,354,382,412,443,475,509,544,580,618,657,698,740,783,828,875,923,972,1023,1075,1129,1185,1242,1300,1360,1422,1486,1551,1617,1685,1755,1827,1900,1975,2052,2130,2210,2292,2376,2461,2548,2637,2727,2820,2914,3010,3108,3208,3309,3412,3518,3625,3734,3844,3957,4072,4188,4307,4427,4550,4674,4800,4929,5059,5191,5325,5461,5600,5740,5882,6026,6172,6321,6471,6624,6779,6935,7094,7255,7418,7583,7750,7920,8091,8265,8440,8618,8798,8981,9165,9352,9541,9732,9925,10121,10318,10518,10721,10925,11132,11341,11552,11766,11981,12200,12420,12643,12868,13095,13325,13557,13791,14028,14267,14508,14752,14998,15247,15498,15751,16007,16265,16525,16788,17054,17322,17592,17864,18140,18417,18697,18980,19265,19552,19842,20135,20430,20727,21027,21330,21635,21942,22252,22565,22880,23198,23518,23841,24166,24494,24825,25158,25494,25832,26173,26517,26863,27212,27563,27917,28274,28633,28995,29360,29727,30097,30470,30845,31223,31604,31987,32373,32762,33154,33548,33945,34345,34747,35152,35560,35971,36384,36800,37219,37641,38065,38493,38923,39355,39791,40229,40671,41115,41562,42011,42464,42919,43377,43838,44302,44769,45238,45711,46186,46664,47145,47629,48116,48605,49098,49593,50092,50593,51097,51604,52114,52627,53143,53662,54184,54709,55236,55767,56300,56837,57377,57919,58465,59013,59564,60119,60676,61237,61800,62367,62936,63509,64084,64663,65245};

 /*
  * Map a linear intensity back to an 8-bit code by binary-searching the
  * ascending threshold table lineartosRGBthr.
  *
  * value: linear intensity on the same scale as the sRGBtolinear entries.
  * Returns the largest index whose threshold is <= value (0 when value is
  * below every non-zero threshold).
  */
 uint_least8_t lineartosRGB(int32_t value){
    uint_least8_t code = 0;
    /* Settle one bit of the result per pass, most significant bit first. */
    for (int step = 128; step > 0; step >>= 1) {
        if (value >= lineartosRGBthr[code + step]) {
            code += step;
        }
    }
    return code;
 }

 /*
  * Average sixteen packed pixels channel by channel.
  *
  * pixel: array of 16 values holding red in bits 16-23, green in bits 8-15
  *        and blue in bits 0-7; any higher bits are ignored.
  * Each channel is converted through sRGBtolinear, summed, divided by 16 with
  * rounding, and converted back with lineartosRGB().
  * Returns the averaged colour packed the same way, with bits 24-31 zero.
  */
 uint32_t RGBavg16(const uint32_t* pixel){
    int_fast32_t sum_r = 0;
    int_fast32_t sum_g = 0;
    int_fast32_t sum_b = 0;

    const uint32_t *const end = pixel + 16;
    for (const uint32_t *p = pixel; p != end; ++p) {
        const uint32_t packed = *p;
        sum_r += sRGBtolinear[(packed >> 16) & 0xFF];
        sum_g += sRGBtolinear[(packed >>  8) & 0xFF];
        sum_b += sRGBtolinear[ packed        & 0xFF];
    }

    /* +8 before the shift rounds the division by 16 to nearest. */
    const uint32_t r = lineartosRGB((sum_r + 8) >> 4);
    const uint32_t g = lineartosRGB((sum_g + 8) >> 4);
    const uint32_t b = lineartosRGB((sum_b + 8) >> 4);

    return (r << 16) | (g << 8) | b;
 }

/*
 * Downscale a 4x4-oversampled image by box-filtering each 4x4 square of
 * source pixels into one destination pixel via RGBavg16().
 *
 * stagesize:                 stagesize[0] is the output width and
 *                            stagesize[1] the output height, in pixels.
 * pixels:                    destination buffer, written at
 *                            pixels[row * pixelsscanlineoffset + column].
 * pixelsscanlineoffset:      element distance between consecutive
 *                            destination rows.
 * oversampled:               source buffer covering 4*width by 4*height
 *                            samples; it is only read.
 * oversampledscanlineoffset: element distance between consecutive source
 *                            rows.
 */
void fourtimesfouroversampling(int* stagesize,uint32_t* pixels,int pixelsscanlineoffset,uint32_t* oversampled,int oversampledscanlineoffset){
    for(int i=0; i<stagesize[1]; i++){
        for(int j=0; j<stagesize[0]; j++){
            /* Top-left sample of the 4x4 square that feeds output pixel (j, i). */
            const uint32_t* block = oversampled + i*4*oversampledscanlineoffset + j*4;
            uint32_t pixel[16];
            for(int k=0; k<4; k++){
                /* The original incremented `L` here, an undeclared identifier. */
                for(int l=0; l<4; l++){
                    pixel[k*4+l] = block[k*oversampledscanlineoffset+l];
                }
            }
            pixels[i*pixelsscanlineoffset+j] = RGBavg16(pixel);
        }
    }
}

4×4 过采样是一种抗锯齿渲染方法：先以 4×4 倍的采样率渲染出“过采样阶段”，渲染方式与不使用任何抗锯齿或平滑手段时（二值轮廓渲染、最近邻图形等）完全相同；然后使用盒式滤波器进行缩小，即对每个 4×4（共十六个像素）的方块取线性平均值，得到一个输出像素。需要进行 sRGB/线性转换，因为 sRGB 值不是线性标度，无法直接取平均值。

要测试性能，请在前面的代码之后使用以下主代码，该代码绘制随机的过采样像素并缩小其比例：

#include <stdlib.h>
#include <time.h>
    /*
     * Benchmark dimensions and buffers.
     *
     * width/height are enum constants rather than `const int` objects: in C a
     * const-qualified variable is not an integer constant expression, so it
     * cannot portably size a file-scope array or appear in a static
     * initializer.
     */
    enum { width = 640, height = 480 };
    /* Output size as {width, height}, the layout fourtimesfouroversampling() reads. */
    int stagesize[2] = {width,height};
    /* Downscaled output, one element per output pixel. */
    uint32_t pixels[width*height];
    /* Oversampled input, four times the output size in each dimension. */
    uint32_t oversampled[width*4*height*4];
/*
 * Benchmark driver: clears the oversampled buffer, then for each of 60
 * frames writes 4096 random values at random positions in it and downscales
 * the whole image into `pixels`.
 */
int main(){
    const int over_w = width*4;
    const int over_h = height*4;

    /* Start from an all-zero oversampled image. */
    int idx = 0;
    while (idx < over_w*over_h) {
        oversampled[idx] = 0;
        ++idx;
    }

    srand(time(NULL));

    for (int frame = 0; frame < 60; ++frame) {
        for (int dot = 0; dot < 4096; ++dot) {
            oversampled[rand()%over_h*over_w+rand()%over_w] = rand();
        }
        fourtimesfouroversampling(stagesize, pixels, width, oversampled, over_w);
    }
    return 0;
}

使用 -O3 编译时，运行平均大约需要 3.731 秒。由于代码无法在一秒钟内完成 60 帧的处理，因此无法维持 60fps 渲染，使用此渲染器的 60fps 程序也就无法全速运行。要让 4×4 过采样保持 60fps，应该怎么做？

解决方法

有很多原因导致它比必要的速度慢得多。首先，您是如何测量速度的？您的main()只运行60帧，在优化fourtimesfouroversampling()之后，这可能不足以进行精确的测量。此外，您应当只测量花在fourtimesfouroversampling()中的时间，而不应把设置代码、用随机值填充oversampled[]的循环以及缓存预热的时间等计算在内。

您在lineartosRGB()中进行的二分查找可能会非常缓慢。如果CPU支持条件移动指令，情况还不算太糟，但是您仍然有7个间接加载，并且由于每次加载的地址都取决于前一次加载的结果，因此无法有效地进行流水线处理。使用预先计算的查找表可能更快；该表需要覆盖全部输入值 0～65536，即 65537 个条目。

另一种可能性是避免查找表，而改用浮点运算。这听起来很疯狂，但是优点是您可以使用SSE指令一次性处理多个像素。看看this question如何优化pow()。

此外，我会避免使用临时数组pixel[16]，而仅将RGBavg16()与fourtimesfouroversampling()合并。

上述部分更改的示例请参见 this example on godbolt.org（不包括浮点运算的方案）。无论使用哪种编译器，内部的两个循环都会被展开。遗憾的是，SSE 指令无法并行执行查表，因此每个像素都需要单独处理。