如何解决AVX2 代码不能比 gcc 基础优化更快
我正在通过使用内联汇编编写 AVX 代码来学习 AVX。在这种情况下,我尝试在一个简单的函数中实现 AVX。我做的函数名是lower_all_chars_base
。
它的行为是:对 std::string
中的每个字符与 0x20
应用逻辑 OR。
- 让
c
成为输入中的每个字符。 - 假设输入仅包含此集合
'A' <= c && c <= 'Z'
中的字符。
所以函数会使字符变成小写。
我试过做AVX版的函数,store指令没有对齐,一点加速都没有。
然后我想,如果内存访问是对齐的,那么它一定会更快。之后,我尝试制作带有对齐存储的 AVX 版本,但 gcc
基础优化 -O3
基础优化 -O3 仍然击败了我手动矢量化的代码。我在这里做错了什么?
功能总结
-
lower_all_chars_base
简单的函数。 -
lower_all_chars_avx_aligned
AVX2 对齐移动版本:
- 使用基本操作处理第一个未对齐的内存。
- 然后使用 AVX2 处理对齐的内存部分并对齐移动。
- 剩下的就是基本操作了。
-
lower_all_chars_avx_unaligned
AVX2 未对齐移动版本:
- 使用 AVX2 和未对齐移动处理数据
- 接下来是基本操作。
问题
- 为什么
gcc
基础优化-O3
击败了我的优化? - 我在这里做错了什么?
- 执行此操作的正确 AVX 操作是什么?
基准测试结果
root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min = 0.00662300
Max = 0.00793100
Avg = 0.00717280
Total = 0.07172800
lower_all_chars_avx_aligned
Min = 0.00650200
Max = 0.00785100
Avg = 0.00726220
Total = 0.07262200
lower_all_chars_avx_unaligned
Min = 0.00623600
Max = 0.00835000
Avg = 0.00701360
Total = 0.07013600
代码
编辑:N - 1
用于 memset。
Godbolt 链接:https://godbolt.org/z/a16cGK
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void lower_all_chars_avx_unaligned(string &str);
void do_benchmark(std::string &x,void (*fx)(string &));
void mem_flush(const void *p,unsigned int allocation_size);
#define N (size_t)(1024u * 1024 * 40)
#define BENCHMARK(STR,FX) do { \
puts(#FX); \
do_benchmark(STR,FX); \
} while(0)
/* Benchmark driver: builds three identical strings of 'A' and runs each
   lowercase implementation on its own copy, then cross-checks that all
   three implementations produce the same (all-'a') result. */
int main() {
    static char x[N];
    /* N - 1 so the last byte stays '\0' and x is a valid C string. */
    memset(x,'A',N - 1);
    string a(x),b(x),c(x);
    BENCHMARK(a,lower_all_chars_base);
    BENCHMARK(b,lower_all_chars_avx_aligned);
    BENCHMARK(c,lower_all_chars_avx_unaligned);
    assert(a == b);
    assert(b == c);
    /* Build the expected lowercase image in x and compare. */
    memset(x,'a',N - 1);
    assert(memcmp(c.c_str(),x,N - 1) == 0);
}
/*
 * Run @fx against @x `n` times and print the min / max / average /
 * total wall time (clock()-based, CLOCKS_PER_SEC resolution).
 * The cache is flushed after every call so the next iteration starts
 * from cold memory.
 */
void do_benchmark(std::string &x,void (*fx)(string &)) {
    const size_t n = 10;
    /* Initialize min/max (they are really set on i == 0, but this keeps
       the compiler from warning about a possibly-uninitialized read and
       matches the later revision of this function). */
    double min = 0,max = 0,avg,c,total = 0;
    for (size_t i = 0; i < n; i++) {
        clock_t time0 = clock();
        fx(x);
        clock_t time1 = clock();
        c = (double)(time1 - time0) / CLOCKS_PER_SEC;
        total += c;
        if (i == 0) {
            min = max = c;
        } else {
            if (c > max) max = c;
            if (c < min) min = c;
        }
        mem_flush(x.c_str(),x.size());
    }
    avg = total / (double)n;
    printf("Min = %.8f\n",min);
    printf("Max = %.8f\n",max);
    printf("Avg = %.8f\n",avg);
    printf("Total = %.8f\n\n",total);
}
/* Scalar reference implementation: OR every byte with 0x20, which maps
   'A'..'Z' onto 'a'..'z' (input is assumed to be uppercase letters). */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    for (size_t remaining = str.size(); remaining != 0; remaining--) {
        *p = (char)(*p | 0x20);
        p++;
    }
}
/* 32-byte OR mask: every byte is 0x20.  Four 8-byte elements so that
   the 32-byte `vmovdqa` load below stays inside the array (the scraped
   listing had only three elements, which would read 8 bytes past the
   end). */
static const uint64_t mask[] __attribute__((aligned(32))) = {
    0x2020202020202020ull,0x2020202020202020ull,
    0x2020202020202020ull,0x2020202020202020ull
};

/*
 * AVX2 lowercase with aligned stores:
 *   - scalar-process the unaligned head so `cs` becomes 32-byte aligned,
 *   - main loop: 9 ymm registers = 288 bytes per iteration, aligned
 *     loads/stores (`vmovdqa`),
 *   - scalar tail for the remainder.
 *
 * NOTE(review): the `vpor` lines of this asm were damaged by the page
 * extraction (the `%[j]),%%ymm0,` part was dropped); they are
 * reconstructed here from the one intact line -- confirm against the
 * original Godbolt link.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(std::string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than 4K. */
    if (len > 4096) {
        /* Handle unaligned data from the head. */
        uint8_t n = (uintptr_t)cs & 0b11111u;
        for (uint8_t i = 0; i < n; i++) {
            *cs++ |= 0x20;
        }
        len -= n;
        /* Stop the vector loop at least 288 bytes early so it never
           touches memory beyond the array. */
        size_t vlen = len - 288;
        size_t j;
        /* Load the mask once into ymm0. */
        asm volatile("vmovdqa %[mask],%%ymm0"::[mask]"m"(mask):"ymm0");
        for (j = 0; j < vlen; j += 288) {
            asm volatile(
                "vpor\t(%[cs],%[j]),%%ymm0,%%ymm1\n\t"
                "vpor\t32(%[cs],%[j]),%%ymm0,%%ymm2\n\t"
                "vpor\t64(%[cs],%[j]),%%ymm0,%%ymm3\n\t"
                "vpor\t96(%[cs],%[j]),%%ymm0,%%ymm4\n\t"
                "vpor\t128(%[cs],%[j]),%%ymm0,%%ymm5\n\t"
                "vpor\t160(%[cs],%[j]),%%ymm0,%%ymm6\n\t"
                "vpor\t192(%[cs],%[j]),%%ymm0,%%ymm7\n\t"
                "vpor\t224(%[cs],%[j]),%%ymm0,%%ymm8\n\t"
                "vpor\t256(%[cs],%[j]),%%ymm0,%%ymm9\n\t"
                "vmovdqa\t%%ymm1,(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm2,32(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm3,64(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm4,96(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm5,128(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm6,160(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm7,192(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm8,224(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm9,256(%[cs],%[j])"
                :
                : [cs]"p"(cs),[j]"r"(j)
                : "memory","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9"
            );
        }
        /* Leave AVX state so later SSE code pays no transition penalty. */
        asm volatile("vzeroupper":::
            "ymm0","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
        );
        cs += j;
        len -= j;
    }
    /* Scalar tail for whatever the vector loop did not cover. */
    for (size_t i = 0; i < len; i++) {
        *cs++ |= 0x20;
    }
}
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
size_t j;
size_t vlen = len - 288;
asm volatile("vmovdqa %[mask],%%ymm9\n\t"
"vmovdqu\t%%ymm1,%[j])\n\t"
"vmovdqu\t%%ymm2,%[j])\n\t"
"vmovdqu\t%%ymm3,%[j])\n\t"
"vmovdqu\t%%ymm4,%[j])\n\t"
"vmovdqu\t%%ymm5,%[j])\n\t"
"vmovdqu\t%%ymm6,%[j])\n\t"
"vmovdqu\t%%ymm7,%[j])\n\t"
"vmovdqu\t%%ymm8,%[j])\n\t"
"vmovdqu\t%%ymm9,"ymm15"
);
cs += j;
len -= j;
}
/* Backup remaining elements from the AVX operation. */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
/* Evict the buffer from all cache levels: clflush one 64-byte line at
   a time, then sfence so the flushes complete before returning. */
void mem_flush(const void *p,unsigned int allocation_size) {
    /* https://stackoverflow.com/a/43694725/7275114 */
    const char *line = (const char *)p;
    const char *stop;
    if (line == NULL || allocation_size == 0)
        return;
    stop = line + allocation_size;
    while (line < stop) {
        asm volatile("clflush (%0)"::"r"(line):"memory");
        line += 64; /* cache-line size */
    }
    asm volatile("sfence"::: "memory");
}
解决方法
我尝试在评论中应用一些建议。
然而,我不使用内在函数。现在手工编码的程序集版本比使用 -O3 -mavx2
标志的 gcc 优化快大约 1.02 倍。这不是一个显着的加速。但是我学到了很多关于内联汇编的知识。我还在等待其他答案,希望有比这个更好的答案。
lower_all_chars_avx_aligned
函数更改摘要:
- 使用
vbroadcastsd
加载掩码,就像clang
一样。 - 使用虚拟内存输出替换内联汇编中的
volatile
和"memory"
clobber。 - 将 AVX2 阈值降低到 1024 字节。
- 使用 AVX2 处理未对齐的头部/尾部。
- AVX2 循环完全用内联汇编编写。
- 利用
ymm0
到ymm15
寄存器。 - 添加
__AVX__
和__AVX2__
常量检查以防止vzeroupper
发出两次。
基准更改摘要:
- 要处理的数据长度为 3.5 GB(之前只有 40 MB)。
- 每个函数运行 30 次(之前只有 10 次)。
- 添加了
-mavx2
以改进 gcc 基础优化。 -
lower_all_chars_avx_unaligned
已删除。 - 使用来自
malloc
的堆而不是static char[]
变量来处理更大的数据。 - 在 Skylake 微架构上运行(之前在 Broadwell 上运行)。
基准信息:
-
Min
是 30 次函数调用的最短时间。 -
Max
是 30 次函数调用的最长时间。 -
Avg
是 30 次函数调用的平均时间。 -
Total
是 30 次函数调用的总时间。
基准测试结果:
- CPU:Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz
- 微架构:Skylake
root@yukii-hpc2:/tmp# g++ -Wall -Wextra -std=c++2a -O3 -mavx2 test.cpp -o test
root@yukii-hpc2:/tmp# nice -n -20 ./test
lower_all_chars_avx_aligned
Min = 0.31086600
Max = 0.31319800
Avg = 0.31159833
Total = 9.34795000
lower_all_chars_base
Min = 0.31823400
Max = 0.32902100
Avg = 0.31904893
Total = 9.57146800
root@yukii-hpc2:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation,Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@yukii-hpc2:/tmp#
代码
- Godbolt 链接:https://godbolt.org/z/ox7hjT
/*
https://stackoverflow.com/questions/65404362/avx2-code-cannot-be-faster-than-gcc-base-optmization
*/
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdlib>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void do_benchmark(string &x,void (*fx)(string &));
void mem_flush(const void *p,unsigned int allocation_size);
#define _M(N) (size_t)(1024ull * 1024 * (N))
#define _G(N) (size_t)(1024ull * 1024 * 1024 * (N))
#define N (_G(3) + _M(512) + 1234) /* 3.5 G + 1234 */
/*
1234 is just to make it odd,so it likely will jump to the tail of AVX aligned loop.
*/
#define BENCHMARK(STR,FX) do { \
puts(#FX); \
do_benchmark(STR,FX); \
} while(0)
/* Benchmark driver for the revised code: 3.5 GB + 1234 bytes of 'A',
   allocated on the heap.  Each function runs on a fresh copy and the
   result is verified against an all-'a' image.
   NOTE(review): the two inner blocks were garbled by the page
   extraction (memset/BENCHMARK/memcmp arguments dropped); they are
   reconstructed here from the first block's intact pattern. */
int main() {
    char *x = (char *)malloc(N + 1);
    /* 3.5 GB allocation can realistically fail -- bail out cleanly. */
    if (x == NULL)
        return 1;
    memset(x,'A',N);
    x[N] = '\0';
    {
        string a(x);
        /* Precompute the expected lowercase image in x. */
        memset(x,'a',N);
        BENCHMARK(a,lower_all_chars_avx_aligned);
        assert(memcmp(a.c_str(),x,N) == 0);
    }
    /* Restore value for the next benchmark. */
    memset(x,'A',N);
    {
        string a(x);
        memset(x,'a',N);
        BENCHMARK(a,lower_all_chars_base);
        assert(memcmp(a.c_str(),x,N) == 0);
    }
    free(x);
}
/* Scalar lowercase for short buffers (< 1024 bytes); left to the
   compiler's own optimizer/vectorizer. */
inline static void lower_all_chars_b1024(char *cs,uint16_t len) {
    for (uint16_t i = 0; i < len; i++) {
        cs[i] |= 0x20;
    }
}
/* Aligned memory for mask for performance.  One 8-byte element is
   enough: `vbroadcastsd` replicates it across the whole ymm register
   (every byte 0x20). */
static const uint64_t mask[] __attribute__((aligned(32))) = {
    0x2020202020202020ull
};

/*
 * AVX2 lowercase with aligned stores (revised version):
 *   - one unaligned 32-byte vpor/vmovdqu handles the head, then `cs`
 *     is rounded down to a 32-byte boundary (ORing with 0x20 is
 *     idempotent, so the overlap is harmless),
 *   - main loop: 15 ymm registers = 0x1e0 (480) bytes per iteration,
 *     aligned loads/stores,
 *   - tail: back `cs` off so the remainder is a multiple of 32 and
 *     finish with single-register unaligned iterations (again relying
 *     on idempotence for the overlap),
 *   - `vzeroupper` only when the compiler is not already in VEX mode.
 *
 * NOTE(review): the `vpor` lines (missing %%ymm0 source operand) and
 * the tail loop at label 2 were damaged by the page extraction; they
 * are reconstructed here -- confirm against the original Godbolt link.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than or equal to 1K. */
    if (len >= 1024) {
        size_t avx_size = 0x1e0; /* Bytes per AVX main iteration. */
        char *end = &(cs[len]);
        /* End of aligned process iteration. */
        char *end_avx = &(end[-avx_size]);
        /* Dummy variable,to let the compiler choose the best GP register. */
        uintptr_t rem_bytes;
        asm(
            /* Prepare %[rem_bytes] initial value. */
            "movq\t%[end],%[rem_bytes]\n\t"
            /* Load the mask. */
            "vbroadcastsd\t%[mask],%%ymm0\n\t"
            /* Handle unaligned memory from the head. */
            "vpor\t(%[cs]),%%ymm0,%%ymm1\n\t"
            "vmovdqu\t%%ymm1,(%[cs])\n\t"
            "addq\t$0x20,%[cs]\n\t" /* Move to the next 32 bytes. */
            /* Handle the aligned part.  ORing is idempotent, so %[cs]
               may partially overlap the vmovdqu above. */
            "andq\t$~0b11111ull,%[cs]\n\t" /* Clear 5-bit LSB. */
            "1:\n\t"
            "vpor\t0x000(%[cs]),%%ymm0,%%ymm1\n\t"
            "vpor\t0x020(%[cs]),%%ymm0,%%ymm2\n\t"
            "vpor\t0x040(%[cs]),%%ymm0,%%ymm3\n\t"
            "vpor\t0x060(%[cs]),%%ymm0,%%ymm4\n\t"
            "vpor\t0x080(%[cs]),%%ymm0,%%ymm5\n\t"
            "vpor\t0x0a0(%[cs]),%%ymm0,%%ymm6\n\t"
            "vpor\t0x0c0(%[cs]),%%ymm0,%%ymm7\n\t"
            "vpor\t0x0e0(%[cs]),%%ymm0,%%ymm8\n\t"
            "vpor\t0x100(%[cs]),%%ymm0,%%ymm9\n\t"
            "vpor\t0x120(%[cs]),%%ymm0,%%ymm10\n\t"
            "vpor\t0x140(%[cs]),%%ymm0,%%ymm11\n\t"
            "vpor\t0x160(%[cs]),%%ymm0,%%ymm12\n\t"
            "vpor\t0x180(%[cs]),%%ymm0,%%ymm13\n\t"
            "vpor\t0x1a0(%[cs]),%%ymm0,%%ymm14\n\t"
            "vpor\t0x1c0(%[cs]),%%ymm0,%%ymm15\n\t"
            /* Plug the result to aligned memory. */
            "vmovdqa\t%%ymm1,0x000(%[cs])\n\t"
            "vmovdqa\t%%ymm2,0x020(%[cs])\n\t"
            "vmovdqa\t%%ymm3,0x040(%[cs])\n\t"
            "vmovdqa\t%%ymm4,0x060(%[cs])\n\t"
            "vmovdqa\t%%ymm5,0x080(%[cs])\n\t"
            "vmovdqa\t%%ymm6,0x0a0(%[cs])\n\t"
            "vmovdqa\t%%ymm7,0x0c0(%[cs])\n\t"
            "vmovdqa\t%%ymm8,0x0e0(%[cs])\n\t"
            "vmovdqa\t%%ymm9,0x100(%[cs])\n\t"
            "vmovdqa\t%%ymm10,0x120(%[cs])\n\t"
            "vmovdqa\t%%ymm11,0x140(%[cs])\n\t"
            "vmovdqa\t%%ymm12,0x160(%[cs])\n\t"
            "vmovdqa\t%%ymm13,0x180(%[cs])\n\t"
            "vmovdqa\t%%ymm14,0x1a0(%[cs])\n\t"
            "vmovdqa\t%%ymm15,0x1c0(%[cs])\n\t"
            "addq\t%[avx_size],%[cs]\n\t"
            "cmpq\t%[end_avx],%[cs]\n\t"
            "jb\t1b\n\t"
            "subq\t%[cs],%[rem_bytes]\n\t"
            /* Now,%[rem_bytes] contains the remaining bytes. */
            "testq\t%[rem_bytes],%[rem_bytes]\n\t"
            "jz\t3f\n\t"
            /* There's no remaining bytes if `jz` is taken. */
            /* Handle the tail: back off so the remainder is a
               multiple of 32 bytes. */
            "leaq\t0b11111(%[rem_bytes]),%[dec_avx]\n\t"
            "andq\t$~0b11111ull,%[dec_avx]\n\t"
            "subq\t%[rem_bytes],%[dec_avx]\n\t"
            "subq\t%[dec_avx],%[cs]\n\t"
            "2:\n\t"
            "vpor\t(%[cs]),%%ymm0,%%ymm1\n\t"
            "vmovdqu\t%%ymm1,(%[cs])\n\t"
            "addq\t$0x20,%[cs]\n\t"
            "cmpq\t%[end],%[cs]\n\t"
            "jb\t2b\n\t"
            "3:\n\t"
#if !defined(__AVX__) && !defined(__AVX2__)
            "vzeroupper"
#endif
            /* Output */
            : [cs]"+r"(cs),[end]"+r"(end),[end_avx]"+r"(end_avx),[dec_avx]"=r"(end_avx),/* May reuse end_avx if needed. */
              [rem_bytes]"=r"(rem_bytes),/* Tell the compiler that this inline assembly is
                                            going to read/write `len` bytes from `cs`. */
              [dummy_mem_output]"+m"(*(char (*)[len])cs)
            /* Input */
            : [mask]"m"(mask),[avx_size]"n"(avx_size)
            /* Clobbers */
            : "ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
        );
    } else {
        /* Let the compiler use its own optimization here. */
        lower_all_chars_b1024(cs,len);
    }
}
/* Baseline scalar implementation: byte-wise OR with 0x20 turns
   'A'..'Z' into 'a'..'z'. */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    char *stop = p + str.size();
    while (p != stop) {
        *p++ |= 0x20;
    }
}
void do_benchmark(string &x,void (*fx)(string &)) {
const size_t n = 30;
double min = 0,max = 0,avg,c,total = 0;
for (size_t i = 0; i < n; i++) {
mem_flush(x.c_str(),x.size());
clock_t time0 = clock();
fx(x);
clock_t time1 = clock();
c = (double)(time1 - time0) / CLOCKS_PER_SEC;
total += c;
if (i == 0) {
min = max = c;
} else {
if (c > max) max = c;
if (c < min) min = c;
}
}
avg = total / (double)n;
printf("Min = %.8f\n",min);
printf("Max = %.8f\n",max);
printf("Avg = %.8f\n",avg);
printf("Total = %.8f\n\n",total);
}
/* Force the buffer out of the CPU caches: clflush each 64-byte line,
   then sfence so all flushes are ordered before we return. */
void mem_flush(const void *p,unsigned int allocation_size) {
    /* https://stackoverflow.com/a/43694725/7275114 */
    const char *cp = (const char *)p;
    size_t offset;
    if (cp == NULL || allocation_size == 0)
        return;
    for (offset = 0; offset < allocation_size; offset += 64) {
        asm volatile("clflush (%0)"::"r"(cp + offset):"memory");
    }
    asm volatile("sfence"::: "memory");
}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。