如何将原生 C 矩阵乘法转换为 OpenCL SIMD 矩阵乘法
我目前正在优化 OpenCL 代码以提高性能。我已确定以下函数是性能瓶颈,想用等效的 OpenCL SIMD 实现来替换它:
/* 64-bit unsigned type used to hold the full result of a 32x32-bit product.
 * NOTE(review): relies on unsigned long being 64 bits wide (true in OpenCL C,
 * not on every C platform) -- confirm for the target. */
typedef unsigned long uint64_t;
/*
 * Schoolbook multiplication of two 8-limb unsigned integers.
 *
 * Each operand is an array of eight 32-bit limbs with index 0 holding the
 * most significant limb and index 7 the least significant one. The 16-limb
 * product is split into its upper half (out_high) and lower half (out_low),
 * both using the same limb order.
 *
 * x, y      input operands (read only)
 * out_high  receives limbs 0..7 of the product (most significant half)
 * out_low   receives limbs 8..15 of the product (least significant half)
 */
void multiply256(const unsigned int x[8],const unsigned int y[8],unsigned int out_high[8],unsigned int out_low[8])
{
    /* Zero-filled accumulator: lets the first row use the same
     * multiply-accumulate step as every other row. */
    unsigned int acc[16] = {0};

    /* Walk the limbs of x from least significant (7) to most significant (0). */
    for (int row = 7; row >= 0; row--) {
        unsigned int carry = 0;

        for (int col = 7; col >= 0; col--) {
            const int pos = row + col + 1;
            /* Cannot overflow 64 bits:
             * (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1. */
            const uint64_t sum = (uint64_t)x[row] * y[col] + acc[pos] + carry;

            acc[pos] = (unsigned int)sum;
            carry = (unsigned int)(sum >> 32);
        }

        /* Slot `row` has not been written by any earlier row, so the
         * final carry of this row is stored rather than added. */
        acc[row] = carry;
    }

    /* Hand the two halves back to the caller. */
    const unsigned int *upper = acc;
    const unsigned int *lower = acc + 8;
    for (int k = 0; k < 8; k++) {
        out_high[k] = upper[k];
        out_low[k] = lower[k];
    }
}
所以我想我可以像这样替换它:
/*
 * Attempted vectorised replacement for the scalar multiply256.
 *
 * NOTE(review): y and out_high are used in the body but are not declared in
 * the parameter list as shown here -- presumably the signature was meant to
 * match the scalar version (x, y, out_high, out_low); confirm.
 *
 * NOTE(review): assuming uint8 is the OpenCL C 8-component unsigned int
 * vector and mul_hi is the OpenCL integer built-in, the body computes eight
 * independent 32x32-bit products x[k] * y[k], one per lane. The scalar
 * version instead accumulates all 64 cross products x[i] * y[j] into limb
 * i + j + 1 and propagates carries between limbs, so the two functions
 * cannot produce the same result.
 *
 * NOTE(review): OpenCL C addresses vector components as .s0 ... .s7; verify
 * that the [] indexing used below is accepted by the target compiler.
 */
void multiply256(const unsigned int x[8],unsigned int out_low[8])
{
/* Copy the eight limbs of x into the lanes of a vector. */
uint8 x8;
x8[0] = x[0];
x8[1] = x[1];
x8[2] = x[2];
x8[3] = x[3];
x8[4] = x[4];
x8[5] = x[5];
x8[6] = x[6];
x8[7] = x[7];
/* Copy the eight limbs of y into the lanes of a second vector. */
uint8 y8;
y8[0] = y[0];
y8[1] = y[1];
y8[2] = y[2];
y8[3] = y[3];
y8[4] = y[4];
y8[5] = y[5];
y8[6] = y[6];
y8[7] = y[7];
/* Lane k holds the upper half of x8[k] * y8[k] (assuming OpenCL mul_hi semantics). */
uint8 high = mul_hi(x8,y8);
/* Lane k holds the lower half of x8[k] * y8[k]; no lane interacts with another. */
uint8 low = x8 * y8;
/* Store the per-lane upper halves. */
out_high[0] = high[0];
out_high[1] = high[1];
out_high[2] = high[2];
out_high[3] = high[3];
out_high[4] = high[4];
out_high[5] = high[5];
out_high[6] = high[6];
out_high[7] = high[7];
/* Store the per-lane lower halves. */
out_low[0] = low[0];
out_low[1] = low[1];
out_low[2] = low[2];
out_low[3] = low[3];
out_low[4] = low[4];
out_low[5] = low[5];
out_low[6] = low[6];
out_low[7] = low[7];
}
但它得到的结果与原函数不一致。我哪里做错了?
谢谢
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。