如何解决AVX2 代码不能比 gcc 基础优化更快
我正在通过使用内联汇编编写 AVX 代码来学习 AVX。在这种情况下,我尝试在一个简单的函数中实现 AVX。我做的函数名是lower_all_chars_base
。
它的行为是:对 std::string
中的每个字符与 0x20
应用逻辑 OR。
- 让
c
成为输入中的每个字符。 - 假设输入仅包含此集合
'A' <= c && c <= 'Z'
中的字符。
所以函数会使字符变成小写。
我试过做AVX版的函数,store指令没有对齐,一点加速都没有。
然后我想,如果内存访问是对齐的,那么它一定会更快。之后,我尝试制作带有对齐存储的 AVX 版本,但 gcc
基础优化 -O3
基础优化 -O3 仍然击败了我手动矢量化的代码。我在这里做错了什么?
功能总结
-
lower_all_chars_base
简单的函数。 -
lower_all_chars_avx_aligned
AVX2 对齐移动版本:
- 使用基本操作处理第一个未对齐的内存。
- 然后使用 AVX2 处理对齐的内存部分并对齐移动。
- 剩下的就是基本操作了。
-
lower_all_chars_avx_unaligned
AVX2 未对齐移动版本:
- 使用 AVX2 和未对齐移动处理数据
- 接下来是基本操作。
问题
- 为什么
gcc
基础优化-O3
击败了我的优化? - 我在这里做错了什么?
- 执行此操作的正确 AVX 操作是什么?
基准测试结果
root@esteh:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@esteh:/tmp# g++ -Wall -Wextra -std=c++2a -O3 test.cpp -o test
root@esteh:/tmp# nice -n -20 ./test
lower_all_chars_base
Min = 0.00662300
Max = 0.00793100
Avg = 0.00717280
Total = 0.07172800
lower_all_chars_avx_aligned
Min = 0.00650200
Max = 0.00785100
Avg = 0.00726220
Total = 0.07262200
lower_all_chars_avx_unaligned
Min = 0.00623600
Max = 0.00835000
Avg = 0.00701360
Total = 0.07013600
代码
编辑:N - 1
用于 memset。
Godbolt 链接:https://godbolt.org/z/a16cGK
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void lower_all_chars_avx_unaligned(string &str);
void do_benchmark(std::string &x,void (*fx)(string &));
void mem_flush(const void *p,unsigned int allocation_size);
#define N (size_t)(1024u * 1024 * 40)
#define BENCHMARK(STR,FX) do { \
puts(#FX); \
do_benchmark(STR,FX); \
} while(0)
/* Benchmark driver: builds three identical strings of 'A' and runs each
   lowercase implementation on its own copy, then cross-checks that all
   three implementations produce the same (all-'a') result. */
int main() {
    static char x[N];
    /* N - 1 so the last byte stays '\0' and x is a valid C string. */
    memset(x,'A',N - 1);
    string a(x),b(x),c(x);
    BENCHMARK(a,lower_all_chars_base);
    BENCHMARK(b,lower_all_chars_avx_aligned);
    BENCHMARK(c,lower_all_chars_avx_unaligned);
    assert(a == b);
    assert(b == c);
    /* Build the expected lowercase image in x and compare. */
    memset(x,'a',N - 1);
    assert(memcmp(c.c_str(),x,N - 1) == 0);
}
/*
 * Run @fx against @x `n` times and print the min / max / average /
 * total wall time (clock()-based, CLOCKS_PER_SEC resolution).
 * The cache is flushed after every call so the next iteration starts
 * from cold memory.
 */
void do_benchmark(std::string &x,void (*fx)(string &)) {
    const size_t n = 10;
    /* Initialize min/max (they are really set on i == 0, but this keeps
       the compiler from warning about a possibly-uninitialized read and
       matches the later revision of this function). */
    double min = 0,max = 0,avg,c,total = 0;
    for (size_t i = 0; i < n; i++) {
        clock_t time0 = clock();
        fx(x);
        clock_t time1 = clock();
        c = (double)(time1 - time0) / CLOCKS_PER_SEC;
        total += c;
        if (i == 0) {
            min = max = c;
        } else {
            if (c > max) max = c;
            if (c < min) min = c;
        }
        mem_flush(x.c_str(),x.size());
    }
    avg = total / (double)n;
    printf("Min = %.8f\n",min);
    printf("Max = %.8f\n",max);
    printf("Avg = %.8f\n",avg);
    printf("Total = %.8f\n\n",total);
}
/* Scalar reference implementation: OR every byte with 0x20, which maps
   'A'..'Z' onto 'a'..'z' (input is assumed to be uppercase letters). */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    for (size_t remaining = str.size(); remaining != 0; remaining--) {
        *p = (char)(*p | 0x20);
        p++;
    }
}
/* 32-byte OR mask: every byte is 0x20.  Four 8-byte elements so that
   the 32-byte `vmovdqa` load below stays inside the array (the scraped
   listing had only three elements, which would read 8 bytes past the
   end). */
static const uint64_t mask[] __attribute__((aligned(32))) = {
    0x2020202020202020ull,0x2020202020202020ull,
    0x2020202020202020ull,0x2020202020202020ull
};

/*
 * AVX2 lowercase with aligned stores:
 *   - scalar-process the unaligned head so `cs` becomes 32-byte aligned,
 *   - main loop: 9 ymm registers = 288 bytes per iteration, aligned
 *     loads/stores (`vmovdqa`),
 *   - scalar tail for the remainder.
 *
 * NOTE(review): the `vpor` lines of this asm were damaged by the page
 * extraction (the `%[j]),%%ymm0,` part was dropped); they are
 * reconstructed here from the one intact line -- confirm against the
 * original Godbolt link.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(std::string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than 4K. */
    if (len > 4096) {
        /* Handle unaligned data from the head. */
        uint8_t n = (uintptr_t)cs & 0b11111u;
        for (uint8_t i = 0; i < n; i++) {
            *cs++ |= 0x20;
        }
        len -= n;
        /* Stop the vector loop at least 288 bytes early so it never
           touches memory beyond the array. */
        size_t vlen = len - 288;
        size_t j;
        /* Load the mask once into ymm0. */
        asm volatile("vmovdqa %[mask],%%ymm0"::[mask]"m"(mask):"ymm0");
        for (j = 0; j < vlen; j += 288) {
            asm volatile(
                "vpor\t(%[cs],%[j]),%%ymm0,%%ymm1\n\t"
                "vpor\t32(%[cs],%[j]),%%ymm0,%%ymm2\n\t"
                "vpor\t64(%[cs],%[j]),%%ymm0,%%ymm3\n\t"
                "vpor\t96(%[cs],%[j]),%%ymm0,%%ymm4\n\t"
                "vpor\t128(%[cs],%[j]),%%ymm0,%%ymm5\n\t"
                "vpor\t160(%[cs],%[j]),%%ymm0,%%ymm6\n\t"
                "vpor\t192(%[cs],%[j]),%%ymm0,%%ymm7\n\t"
                "vpor\t224(%[cs],%[j]),%%ymm0,%%ymm8\n\t"
                "vpor\t256(%[cs],%[j]),%%ymm0,%%ymm9\n\t"
                "vmovdqa\t%%ymm1,(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm2,32(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm3,64(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm4,96(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm5,128(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm6,160(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm7,192(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm8,224(%[cs],%[j])\n\t"
                "vmovdqa\t%%ymm9,256(%[cs],%[j])"
                :
                : [cs]"p"(cs),[j]"r"(j)
                : "memory","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9"
            );
        }
        /* Leave AVX state so later SSE code pays no transition penalty. */
        asm volatile("vzeroupper":::
            "ymm0","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
        );
        cs += j;
        len -= j;
    }
    /* Scalar tail for whatever the vector loop did not cover. */
    for (size_t i = 0; i < len; i++) {
        *cs++ |= 0x20;
    }
}
__attribute__((noinline))
void lower_all_chars_avx_unaligned(string &str) {
char *cs = (char *)str.c_str();
size_t len = str.size();
/* Only use AVX for data bigger than 4K. */
if (len > 4096) {
size_t j;
size_t vlen = len - 288;
asm volatile("vmovdqa %[mask],%%ymm9\n\t"
"vmovdqu\t%%ymm1,%[j])\n\t"
"vmovdqu\t%%ymm2,%[j])\n\t"
"vmovdqu\t%%ymm3,%[j])\n\t"
"vmovdqu\t%%ymm4,%[j])\n\t"
"vmovdqu\t%%ymm5,%[j])\n\t"
"vmovdqu\t%%ymm6,%[j])\n\t"
"vmovdqu\t%%ymm7,%[j])\n\t"
"vmovdqu\t%%ymm8,%[j])\n\t"
"vmovdqu\t%%ymm9,"ymm15"
);
cs += j;
len -= j;
}
/* Backup remaining elements from the AVX operation. */
for (size_t i = 0; i < len; i++) {
*cs++ |= 0x20;
}
}
/* Evict the buffer from all cache levels: clflush one 64-byte line at
   a time, then sfence so the flushes complete before returning. */
void mem_flush(const void *p,unsigned int allocation_size) {
    /* https://stackoverflow.com/a/43694725/7275114 */
    const char *line = (const char *)p;
    const char *stop;
    if (line == NULL || allocation_size == 0)
        return;
    stop = line + allocation_size;
    while (line < stop) {
        asm volatile("clflush (%0)"::"r"(line):"memory");
        line += 64; /* cache-line size */
    }
    asm volatile("sfence"::: "memory");
}
解决方法
我尝试在评论中应用一些建议。
然而,我不使用内在函数。现在手工编码的程序集版本比使用 -O3 -mavx2
标志的 gcc 优化快大约 1.02 倍。这不是一个显着的加速。但是我学到了很多关于内联汇编的知识。我还在等待其他答案,希望有比这个更好的答案。
lower_all_chars_avx_aligned
函数更改摘要:
- 使用
vbroadcastsd
加载掩码,就像clang
一样。 - 使用虚拟内存输出替换内联汇编中的
volatile
和"memory"
clobber。 - 将 AVX2 阈值降低到 1024 字节。
- 使用 AVX2 处理未对齐的头部/尾部。
- AVX2 循环完全用内联汇编编写。
- 利用
ymm0
到ymm15
寄存器。 - 添加
__AVX__
和__AVX2__
常量检查以防止vzeroupper
发出两次。
基准更改摘要:
- 要处理的数据长度为 3.5 GB(之前只有 40 MB)。
- 每个函数运行 30 次(之前只有 10 次)。
- 添加了
-mavx2
以改进 gcc 基础优化。 -
lower_all_chars_avx_unaligned
已删除。 - 使用来自
malloc
的堆而不是static char[]
变量来处理更大的数据。 - 在 Skylake 微架构上运行(之前在 Broadwell 上运行)。
基准信息:
-
Min
是 30 次函数调用的最短时间。 -
Max
是 30 次函数调用的最长时间。 -
Avg
是 30 次函数调用的平均时间。 -
Total
是 30 次函数调用的总时间。
基准测试结果:
- CPU:Intel(R) Xeon(R) Gold 6140 CPU @ 2.30GHz
- 微架构:Skylake
root@yukii-hpc2:/tmp# g++ -Wall -Wextra -std=c++2a -O3 -mavx2 test.cpp -o test
root@yukii-hpc2:/tmp# nice -n -20 ./test
lower_all_chars_avx_aligned
Min = 0.31086600
Max = 0.31319800
Avg = 0.31159833
Total = 9.34795000
lower_all_chars_base
Min = 0.31823400
Max = 0.32902100
Avg = 0.31904893
Total = 9.57146800
root@yukii-hpc2:/tmp# g++ --version
g++ (Ubuntu 9.3.0-17ubuntu1~20.04) 9.3.0
Copyright (C) 2019 Free Software Foundation,Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
root@yukii-hpc2:/tmp#
代码
- Godbolt 链接:https://godbolt.org/z/ox7hjT
/*
https://stackoverflow.com/questions/65404362/avx2-code-cannot-be-faster-than-gcc-base-optmization
*/
#include <ctime>
#include <cstdio>
#include <cassert>
#include <cstring>
#include <cstdlib>
#include <iostream>
using std::string;
void lower_all_chars_base(string &str);
void lower_all_chars_avx_aligned(string &str);
void do_benchmark(string &x,void (*fx)(string &));
void mem_flush(const void *p,unsigned int allocation_size);
#define _M(N) (size_t)(1024ull * 1024 * (N))
#define _G(N) (size_t)(1024ull * 1024 * 1024 * (N))
#define N (_G(3) + _M(512) + 1234) /* 3.5 G + 1234 */
/*
1234 is just to make it odd,so it likely will jump to the tail of AVX aligned loop.
*/
#define BENCHMARK(STR,FX) do { \
puts(#FX); \
do_benchmark(STR,FX); \
} while(0)
/* Benchmark driver for the revised code: 3.5 GB + 1234 bytes of 'A',
   allocated on the heap.  Each function runs on a fresh copy and the
   result is verified against an all-'a' image.
   NOTE(review): the two inner blocks were garbled by the page
   extraction (memset/BENCHMARK/memcmp arguments dropped); they are
   reconstructed here from the first block's intact pattern. */
int main() {
    char *x = (char *)malloc(N + 1);
    /* 3.5 GB allocation can realistically fail -- bail out cleanly. */
    if (x == NULL)
        return 1;
    memset(x,'A',N);
    x[N] = '\0';
    {
        string a(x);
        /* Precompute the expected lowercase image in x. */
        memset(x,'a',N);
        BENCHMARK(a,lower_all_chars_avx_aligned);
        assert(memcmp(a.c_str(),x,N) == 0);
    }
    /* Restore value for the next benchmark. */
    memset(x,'A',N);
    {
        string a(x);
        memset(x,'a',N);
        BENCHMARK(a,lower_all_chars_base);
        assert(memcmp(a.c_str(),x,N) == 0);
    }
    free(x);
}
/* Scalar lowercase for short buffers (< 1024 bytes); left to the
   compiler's own optimizer/vectorizer. */
inline static void lower_all_chars_b1024(char *cs,uint16_t len) {
    for (uint16_t i = 0; i < len; i++) {
        cs[i] |= 0x20;
    }
}
/* Aligned memory for mask for performance.  One 8-byte element is
   enough: `vbroadcastsd` replicates it across the whole ymm register
   (every byte 0x20). */
static const uint64_t mask[] __attribute__((aligned(32))) = {
    0x2020202020202020ull
};

/*
 * AVX2 lowercase with aligned stores (revised version):
 *   - one unaligned 32-byte vpor/vmovdqu handles the head, then `cs`
 *     is rounded down to a 32-byte boundary (ORing with 0x20 is
 *     idempotent, so the overlap is harmless),
 *   - main loop: 15 ymm registers = 0x1e0 (480) bytes per iteration,
 *     aligned loads/stores,
 *   - tail: back `cs` off so the remainder is a multiple of 32 and
 *     finish with single-register unaligned iterations (again relying
 *     on idempotence for the overlap),
 *   - `vzeroupper` only when the compiler is not already in VEX mode.
 *
 * NOTE(review): the `vpor` lines (missing %%ymm0 source operand) and
 * the tail loop at label 2 were damaged by the page extraction; they
 * are reconstructed here -- confirm against the original Godbolt link.
 */
__attribute__((noinline))
void lower_all_chars_avx_aligned(string &str) {
    char *cs = (char *)str.c_str();
    size_t len = str.size();
    /* Only use AVX for data bigger than or equal to 1K. */
    if (len >= 1024) {
        size_t avx_size = 0x1e0; /* Bytes per AVX main iteration. */
        char *end = &(cs[len]);
        /* End of aligned process iteration. */
        char *end_avx = &(end[-avx_size]);
        /* Dummy variable,to let the compiler choose the best GP register. */
        uintptr_t rem_bytes;
        asm(
            /* Prepare %[rem_bytes] initial value. */
            "movq\t%[end],%[rem_bytes]\n\t"
            /* Load the mask. */
            "vbroadcastsd\t%[mask],%%ymm0\n\t"
            /* Handle unaligned memory from the head. */
            "vpor\t(%[cs]),%%ymm0,%%ymm1\n\t"
            "vmovdqu\t%%ymm1,(%[cs])\n\t"
            "addq\t$0x20,%[cs]\n\t" /* Move to the next 32 bytes. */
            /* Handle the aligned part.  ORing is idempotent, so %[cs]
               may partially overlap the vmovdqu above. */
            "andq\t$~0b11111ull,%[cs]\n\t" /* Clear 5-bit LSB. */
            "1:\n\t"
            "vpor\t0x000(%[cs]),%%ymm0,%%ymm1\n\t"
            "vpor\t0x020(%[cs]),%%ymm0,%%ymm2\n\t"
            "vpor\t0x040(%[cs]),%%ymm0,%%ymm3\n\t"
            "vpor\t0x060(%[cs]),%%ymm0,%%ymm4\n\t"
            "vpor\t0x080(%[cs]),%%ymm0,%%ymm5\n\t"
            "vpor\t0x0a0(%[cs]),%%ymm0,%%ymm6\n\t"
            "vpor\t0x0c0(%[cs]),%%ymm0,%%ymm7\n\t"
            "vpor\t0x0e0(%[cs]),%%ymm0,%%ymm8\n\t"
            "vpor\t0x100(%[cs]),%%ymm0,%%ymm9\n\t"
            "vpor\t0x120(%[cs]),%%ymm0,%%ymm10\n\t"
            "vpor\t0x140(%[cs]),%%ymm0,%%ymm11\n\t"
            "vpor\t0x160(%[cs]),%%ymm0,%%ymm12\n\t"
            "vpor\t0x180(%[cs]),%%ymm0,%%ymm13\n\t"
            "vpor\t0x1a0(%[cs]),%%ymm0,%%ymm14\n\t"
            "vpor\t0x1c0(%[cs]),%%ymm0,%%ymm15\n\t"
            /* Plug the result to aligned memory. */
            "vmovdqa\t%%ymm1,0x000(%[cs])\n\t"
            "vmovdqa\t%%ymm2,0x020(%[cs])\n\t"
            "vmovdqa\t%%ymm3,0x040(%[cs])\n\t"
            "vmovdqa\t%%ymm4,0x060(%[cs])\n\t"
            "vmovdqa\t%%ymm5,0x080(%[cs])\n\t"
            "vmovdqa\t%%ymm6,0x0a0(%[cs])\n\t"
            "vmovdqa\t%%ymm7,0x0c0(%[cs])\n\t"
            "vmovdqa\t%%ymm8,0x0e0(%[cs])\n\t"
            "vmovdqa\t%%ymm9,0x100(%[cs])\n\t"
            "vmovdqa\t%%ymm10,0x120(%[cs])\n\t"
            "vmovdqa\t%%ymm11,0x140(%[cs])\n\t"
            "vmovdqa\t%%ymm12,0x160(%[cs])\n\t"
            "vmovdqa\t%%ymm13,0x180(%[cs])\n\t"
            "vmovdqa\t%%ymm14,0x1a0(%[cs])\n\t"
            "vmovdqa\t%%ymm15,0x1c0(%[cs])\n\t"
            "addq\t%[avx_size],%[cs]\n\t"
            "cmpq\t%[end_avx],%[cs]\n\t"
            "jb\t1b\n\t"
            "subq\t%[cs],%[rem_bytes]\n\t"
            /* Now,%[rem_bytes] contains the remaining bytes. */
            "testq\t%[rem_bytes],%[rem_bytes]\n\t"
            "jz\t3f\n\t"
            /* There's no remaining bytes if `jz` is taken. */
            /* Handle the tail: back off so the remainder is a
               multiple of 32 bytes. */
            "leaq\t0b11111(%[rem_bytes]),%[dec_avx]\n\t"
            "andq\t$~0b11111ull,%[dec_avx]\n\t"
            "subq\t%[rem_bytes],%[dec_avx]\n\t"
            "subq\t%[dec_avx],%[cs]\n\t"
            "2:\n\t"
            "vpor\t(%[cs]),%%ymm0,%%ymm1\n\t"
            "vmovdqu\t%%ymm1,(%[cs])\n\t"
            "addq\t$0x20,%[cs]\n\t"
            "cmpq\t%[end],%[cs]\n\t"
            "jb\t2b\n\t"
            "3:\n\t"
#if !defined(__AVX__) && !defined(__AVX2__)
            "vzeroupper"
#endif
            /* Output */
            : [cs]"+r"(cs),[end]"+r"(end),[end_avx]"+r"(end_avx),[dec_avx]"=r"(end_avx),/* May reuse end_avx if needed. */
              [rem_bytes]"=r"(rem_bytes),/* Tell the compiler that this inline assembly is
                                            going to read/write `len` bytes from `cs`. */
              [dummy_mem_output]"+m"(*(char (*)[len])cs)
            /* Input */
            : [mask]"m"(mask),[avx_size]"n"(avx_size)
            /* Clobbers */
            : "ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14","ymm15"
        );
    } else {
        /* Let the compiler use its own optimization here. */
        lower_all_chars_b1024(cs,len);
    }
}
/* Baseline scalar implementation: byte-wise OR with 0x20 turns
   'A'..'Z' into 'a'..'z'. */
__attribute__((noinline))
void lower_all_chars_base(std::string &str) {
    char *p = (char *)str.c_str();
    char *stop = p + str.size();
    while (p != stop) {
        *p++ |= 0x20;
    }
}
void do_benchmark(string &x,void (*fx)(string &)) {
const size_t n = 30;
double min = 0,max = 0,avg,c,total = 0;
for (size_t i = 0; i < n; i++) {
mem_flush(x.c_str(),x.size());
clock_t time0 = clock();
fx(x);
clock_t time1 = clock();
c = (double)(time1 - time0) / CLOCKS_PER_SEC;
total += c;
if (i == 0) {
min = max = c;
} else {
if (c > max) max = c;
if (c < min) min = c;
}
}
avg = total / (double)n;
printf("Min = %.8f\n",min);
printf("Max = %.8f\n",max);
printf("Avg = %.8f\n",avg);
printf("Total = %.8f\n\n",total);
}
/* Force the buffer out of the CPU caches: clflush each 64-byte line,
   then sfence so all flushes are ordered before we return. */
void mem_flush(const void *p,unsigned int allocation_size) {
    /* https://stackoverflow.com/a/43694725/7275114 */
    const char *cp = (const char *)p;
    size_t offset;
    if (cp == NULL || allocation_size == 0)
        return;
    for (offset = 0; offset < allocation_size; offset += 64) {
        asm volatile("clflush (%0)"::"r"(cp + offset):"memory");
    }
    asm volatile("sfence"::: "memory");
}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。