两段 C++ 代码之间的显著性能差异

如何解决两段 C++ 代码之间的显著性能差异

我无法理解为什么以下两个例程在性能方面没有预期的差异。

/**
 * Multiplies a row-major matrix by a vector using explicit index arithmetic.
 *
 * For every row y, the products matrix[y][x] * vector[x] are added onto
 * output[y]. The result is accumulated into output, not assigned, so whatever
 * output holds on entry is part of the result.
 *
 * @param matrix       input_height * input_width floats stored row after row
 * @param vector       input_width floats
 * @param output       input_height floats, updated in place
 * @param input_height number of matrix rows
 * @param input_width  number of matrix columns
 */
void matmul1(const float *matrix,const float *vector,float *output,uint32_t input_height,uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            // Flat index y * input_width + x selects the element in row y, column x.
            output[y] += matrix[y * input_width + x] * vector[x];
        }
    }
}

/**
 * Multiplies a row-major matrix by a vector, walking the matrix with a
 * post-incremented pointer instead of computing an index per element.
 *
 * For every row y, the products of that row's elements with vector[x] are
 * added onto output[y]. The result is accumulated into output, not assigned.
 *
 * @param matrix       input_height * input_width floats, read sequentially
 * @param vector       input_width floats
 * @param output       input_height floats, updated in place
 * @param input_height number of matrix rows
 * @param input_width  number of matrix columns
 */
void matmul2(const float *matrix,const float *vector,float *output,uint32_t input_height,uint32_t input_width) {
    for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
            // Read the current element, then step to the next one; after a full
            // row the pointer already sits on the first element of the next row.
            output[y] += *matrix++ * vector[x];
        }
    }
}

我在同一台机器上对随机数据重复执行两个函数 100 次。函数matmul1的平均运行时间为21298μs，函数matmul2的平均运行时间为24034μs。样本的标准偏差为 198 和 171。

反汇编 https://godbolt.org/z/of3zM4 给了我这个（我不太擅长汇编，无法正确解读结果）

matmul1(float const*,float const*,float*,unsigned int,unsigned int):
  // Arguments: x0 = matrix, x1 = vector, x2 = output, w3 = input_height, w4 = input_width
  mov w8,0                // w8 = index of the first element of the current row
  mov x7,0                // x7 = y
  cbz w3,.L1              // input_height == 0: nothing to do
.L2:
  cbz w4,.L5              // input_width == 0: skip the inner loop
  ldr s0,[x2,x7,lsl 2]    // s0 = output[y]
  mov x5,0                // x5 = x
.L6:
  add w6,w8,w5            // w6 = row start + x (the multiplication became a running sum in w8)
  ldr s1,[x1,x5,lsl 2]    // s1 = vector[x]
  add x5,x5,1             // x++
  cmp w4,w5
  ldr s2,[x0,x6,lsl 2]    // s2 = matrix[row start + x]
  fmadd s0,s2,s1,s0       // s0 = s2 * s1 + s0
  str s0,[x2,x7,lsl 2]    // output[y] = s0, written back on every iteration
  bhi .L6                 // loop while input_width > x
.L5:
  add x7,x7,1             // y++
  add w8,w8,w4            // row start += input_width
  cmp w3,w7
  bhi .L2                 // loop while input_height > y
.L1:
  ret
matmul2(float const*,float const*,float*,unsigned int,unsigned int):
  // Arguments: x0 = matrix, x1 = vector, x2 = output, w3 = input_height, w4 = input_width
  cbz w3,.L10             // input_height == 0: nothing to do
  sub w7,w4,#1
  mov x6,0                // x6 = y
  add x7,x7,1
  lsl x7,x7,2             // x7 = size of one matrix row in bytes
.L15:
  cbz w4,.L12             // input_width == 0: skip the inner loop
  ldr s0,[x2,x6,lsl 2]    // s0 = output[y]
  mov x5,0                // x5 = x
.L13:
  ldr s2,[x0,x5,lsl 2]    // s2 = element x of the current matrix row
  ldr s1,[x1,x5,lsl 2]    // s1 = vector[x]
  add x5,x5,1             // x++
  cmp w4,w5
  fmadd s0,s2,s1,s0       // s0 = s2 * s1 + s0
  str s0,[x2,x6,lsl 2]    // output[y] = s0, written back on every iteration
  bhi .L13                // loop while input_width > x
  add x0,x0,x7            // advance the matrix pointer by one row
.L12:
  add x6,x6,1             // y++
  cmp w3,w6
  bhi .L15                // loop while input_height > y
.L10:
  ret

我还在不同的优化级别、不同的输入大小上运行了代码。每次第一个函数都会击败第二个函数。为什么？我希望第一个函数比第二个函数慢，因为它在内循环中多了一个乘法，但事实并非如此。

我使用 g++ 8.3.0 在 Raspberry Pi 4 上运行代码

解决方法

更新 1：

我做了一些计时测试，实际上我的建议比“matmul2”运行得更慢。见下文！

尝试更换

output[y] += *matrix++ * vector[x];

在第二个循环中

output[y] += *(++matrix) * vector[x];

即用前置递增替换指针的后置递增。如果使用后置递增，则会创建指针的临时副本并使用该副本的值。由于每次迭代都会发生这种情况，运行时间会随之增加。如果使用前置递增，则不需要此临时副本。

我不确定，编译器是否可以优化这部分。因为您使用指针，所以可能无法避免副作用。因此，它保持不变。

仔细检查结果，因为语义略有变化。

更新 1：

我实现了以下函数并进行了一些计时。 matmul1 是最慢的版本。 matmul2 是我机器上最快的版本。我没想到 matmul3 会更慢。重复 1000 次、未开启优化时的耗时如下：

matmul1 - 573.62 毫秒

matmul2 - 512.58 毫秒

matmul3 - 534.63 毫秒

    #include <chrono>
    #include <iostream>
    #include <vector>
    
    using namespace std;
    using std::chrono::duration;
    using std::chrono::duration_cast;
    using std::chrono::high_resolution_clock;
    using std::chrono::milliseconds;
    
    void
    long_operation(vector<int>& vec) {
      /* Stand-in for a long, heavy operation: adds each element's position to it. */
      size_t position = 0;
      for (auto& element : vec) {
        element += position;
        ++position;
      }
    }
    
    /**
     * Baseline version: multiplies a row-major matrix by a vector using an
     * explicit index computation for every element.
     *
     * For every row y, the products matrix[y][x] * vector[x] are added onto
     * output[y]; output is accumulated into, not overwritten.
     *
     * @param matrix       input_height * input_width floats stored row after row
     * @param vector       input_width floats
     * @param output       input_height floats, updated in place
     * @param input_height number of matrix rows
     * @param input_width  number of matrix columns
     */
    void
    matmul1(const float* matrix,const float* vector,float*       output,uint32_t     input_height,uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          // Flat index y * input_width + x selects the element in row y, column x.
          output[y] += matrix[y * input_width + x] * vector[x];
        }
      }
    }
    
    /**
     * Pointer-walking version with post-increment: reads the current matrix
     * element and then advances the pointer.
     *
     * For every row y, the products of that row's elements with vector[x] are
     * added onto output[y]; output is accumulated into, not overwritten.
     *
     * @param matrix       input_height * input_width floats, read sequentially
     * @param vector       input_width floats
     * @param output       input_height floats, updated in place
     * @param input_height number of matrix rows
     * @param input_width  number of matrix columns
     */
    void
    matmul2(const float* matrix,
            const float* vector,
            float*       output,
            uint32_t     input_height,
            uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          output[y] += *matrix++ * vector[x];
        }
      }
    }

    /**
     * Pointer-walking version with pre-increment: advances the pointer first
     * and then reads the element it now points to.
     *
     * NOTE: this is not equivalent to matmul2. Because the pointer moves before
     * the read, the elements used are matrix[1] .. matrix[input_height * input_width];
     * matrix[0] is never read and the last read is one element past the
     * input_height * input_width block. The caller must therefore supply at
     * least input_height * input_width + 1 readable floats.
     *
     * @param matrix       at least input_height * input_width + 1 floats
     * @param vector       input_width floats
     * @param output       input_height floats, updated in place
     * @param input_height number of matrix rows
     * @param input_width  number of matrix columns
     */
    void
    matmul3(const float* matrix,
            const float* vector,
            float*       output,
            uint32_t     input_height,
            uint32_t     input_width) {
      for (uint32_t y = 0; y < input_height; y++) {
        for (uint32_t x = 0; x < input_width; x++) {
          output[y] += *(++matrix) * vector[x];
        }
      }
    }

    /**
     * Times matmul1, matmul2 and matmul3 on the same data and prints the total
     * and the average wall-clock time of each in milliseconds.
     *
     * output is never reset between calls, so the values in it keep growing;
     * only the elapsed time is reported, not the numerical result.
     */
    int
    main() {
      //--- prepare some test data ---//
      // Compile-time dimensions, so the arrays below are ordinary fixed-size
      // arrays rather than compiler-specific variable-length arrays.
      constexpr uint32_t height = 100;
      constexpr uint32_t width  = 200;
      constexpr uint32_t size   = height * width;
      // One extra trailing element keeps matmul3's final read (matrix[size]) in bounds.
      float              matrix[size + 1];
      float              vector[size];
      float              output[size];

      for (uint32_t i = 0; i < size; ++i) {
        matrix[i] = i;
        vector[i] = i * i;
        output[i] = 0.0;
      }
      matrix[size] = 0.0;

      //--- test timings ---//
      constexpr int repeats = 10000;
      double        time1   = 0.0;
      double        time2   = 0.0;
      double        time3   = 0.0;
      for (int run = 0; run < repeats; ++run) {
        //--- version 1
        auto t1 = high_resolution_clock::now();
        matmul1(matrix, vector, output, height, width);
        auto t2 = high_resolution_clock::now();

        duration<double, std::milli> ms_double = t2 - t1;
        time1 += ms_double.count();

        //--- version 2
        t1 = high_resolution_clock::now();
        matmul2(matrix, vector, output, height, width);
        t2 = high_resolution_clock::now();

        ms_double = t2 - t1;
        time2 += ms_double.count();

        //--- version 3
        t1 = high_resolution_clock::now();
        matmul3(matrix, vector, output, height, width);
        t2 = high_resolution_clock::now();

        ms_double = t2 - t1;
        time3 += ms_double.count();
      }
      std::cout << "total time 1:   " << time1 << "ms\n";
      std::cout << "total time 2:   " << time2 << "ms\n";
      std::cout << "total time 3:   " << time3 << "ms\n" << endl;

      time1 /= repeats;
      time2 /= repeats;
      time3 /= repeats;

      cout << "average time 1:   " << time1 << "ms\n";
      cout << "average time 2:   " << time2 << "ms\n";
      cout << "average time 3:   " << time3 << "ms" << endl;
      return 0;
    }

完全优化 (-O3) 的时间几乎相同。