如何解决:是否可以在没有分支的情况下,仅用 64 位运算执行 128 位 / 64 位除法?
我正在使用 algorand 合约代码,该代码在其汇编代码中的可能操作范围非常有限 - 例如,无法控制代码流。可以使用基本的 64 位算术运算。
我需要做的是将某个 128 位整数的两个 64 位段(作为一个完整的 128 位数字)除以另一个 64 位整数,知道结果将适合 64 位整数。是否可以不控制代码流而仅使用 64 位算法?
解决方法
我不熟悉 Algorand 语言。完全可以只用 64 位操作创建无分支代码,用于将 128 位操作数除以 64 位操作数。最简单的方法是使用我们在小学都学过的长除法和乘法算法,只是以 2^32 为基数,而不是 10。一般来说,除了简单的算术之外,这还需要移位和逻辑运算,下面的 ISO-C99 代码假定这些运算可用。
可以使用现有的 64 位除法,通过将被除数的前两位“数字”除以除数的前导“数字”来生成下一个商“数字”的近似估计,其中“数字”应理解为以 2^32 为基数。在 TAOCP 中,Knuth 证明该误差为正且最多为 2,前提是除数已归一化,即其最高有效位已置位。我们通过反乘和减法计算余数,然后检查它是否为负。如果是,我们将(部分)商减一并重新计算余数。
由于不允许分支,因此需要一种机制来在两个可能的条件下从两个操作数中进行选择。这可以通过将谓词计算为取值为 0 或 1 的整数,并将其传递给一个 2:1 多路复用器函数来完成,该函数将一个源操作数与谓词相乘,另一个与谓词的补码相乘,并将两个乘积相加。在下面的代码中,此函数称为 mux_64()。
我测试了函数 my_div_128_64() 中的除法代码,将其与我的 64 位 Intel CPU 的 DIV 指令进行比较(通过内联汇编访问该指令)。代码使用英特尔编译器版本 13 构建。我实现了两种测试模式:一种基于纯随机操作数,另一种基于简单模式,后者有助于覆盖各种边界情况。到目前为止尚未发现不匹配。更精细的测试是可能的,但对于一个体面的“冒烟”测试来说,这应该足够了。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define PATTERN_TEST (1)
#define MSVC_STYLE (1)
#define GCC_STYLE (2)
#define ASM_STYLE (GCC_STYLE)
// our own 128-bit unsigned integer type
// (x holds the least significant 64 bits, y the most significant 64 bits;
// see the rem.y sign checks and the dvnd.y overflow guard below)
typedef struct
{
uint64_t x; // low 64 bits
uint64_t y; // high 64 bits
} ulonglong2;
// (a < b) ? 1 : 0, without branches or comparison operators.
// The expression computes the carry-out of ~a + b: since
// ~a + b = 2^64 - 1 - a + b, the carry is set exactly when a < b.
uint64_t ltu_64 (uint64_t a,uint64_t b)
{
    uint64_t not_a = ~a;
    // (x & y) + ((x ^ y) >> 1) equals (x + y) >> 1 with the carry-out of
    // x + y landing in the most significant bit
    uint64_t half_sum = (not_a & b) + ((not_a ^ b) >> 1);
    return half_sum >> 63;
}
// (a != b) ? 1 : 0, without branches or comparison operators
uint64_t neq_64 (uint64_t a,uint64_t b)
{
    uint64_t diff = a ^ b;
    // Setting the MSB before the decrement means the subtraction clears
    // bit 63 only when diff was zero; OR-ing diff back in covers the case
    // where diff itself already has the MSB set.
    uint64_t t = ((diff | (1ULL << 63)) - 1ULL) | diff;
    return t >> 63;
}
// (a >= b) ? 1 : 0, without branches or comparison operators.
// Computes the carry-out of a + ~b + 1 (i.e. the NOT of the borrow of
// a - b), which is set exactly when a >= b.
uint64_t geu_64 (uint64_t a,uint64_t b)
{
    uint64_t not_b = ~b;
    // (x | y) - ((x ^ y) >> 1) equals (x + y + 1) >> 1 with the carry-out
    // of x + y + 1 in the most significant bit
    uint64_t t = (a | not_b) - ((a ^ not_b) >> 1);
    return t >> 63;
}
// Multi-precision add/subtract building blocks (reconstructed: the scraped
// copy collapsed the parameter lists). ADDcc/SUBcc produce the low word and
// set cy to the carry/borrow via the branchless ltu_64(); ADDC/SUBC consume
// cy to produce the high word. t0 and t1 are caller-supplied scratch
// variables; each argument is evaluated exactly once.
#define ADDcc(a,b,cy,t0,t1) (t0=(b), t1=(a), t0=t0+t1, cy=ltu_64(t0,t1), t0)
#define ADDC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t0+t1)
#define SUBcc(a,b,cy,t0,t1) (t0=(b), t1=(a), cy=ltu_64(t1,t0), t1-t0)
#define SUBC(a,b,cy,t0,t1) (t0=(b)+cy, t1=(a), t1-t0)
// 128-bit subtraction: a - b (reconstructed: the scraped copy dropped the
// t0 scratch declaration and the cy/t0 macro arguments)
ulonglong2 sub_128 (ulonglong2 a,ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy,t0,t1;
    res.x = SUBcc (a.x, b.x, cy, t0, t1); // low word, records borrow in cy
    res.y = SUBC (a.y, b.y, cy, t0, t1); // high word, consumes borrow
    return res;
}
// 128-bit addition: a + b (reconstructed: the scraped copy fused the
// signature with the local declarations)
ulonglong2 add_128 (ulonglong2 a,ulonglong2 b)
{
    ulonglong2 res;
    uint64_t cy,t0,t1;
    res.x = ADDcc (a.x, b.x, cy, t0, t1); // low word, records carry in cy
    res.y = ADDC (a.y, b.y, cy, t0, t1); // high word, consumes carry
    return res;
}
// branch free implementation of ((cond) ? a : b). cond must be in {0,1}.
// Fix: parameter b was declared int64_t, inconsistent with a and with every
// call site; all operands are unsigned 64-bit values.
uint64_t mux_64 (uint64_t cond,uint64_t a,uint64_t b)
{
    // cond is 0 or 1, so exactly one of the two products survives
    return cond * a + (1 - cond) * b;
}
// count leading zeros; returns 64 for x == 0 (reconstructed: the scraped
// copy collapsed the mux_64/neq_64 argument lists). Branch-free binary
// search: at each step, if the upper half of the remaining window is
// non-zero, narrow to it and reduce the zero count accordingly.
uint64_t clz_64 (uint64_t x)
{
    uint64_t n = 64;
    uint64_t c,y;
    y = x >> 32; c = neq_64 (y,0); n = mux_64 (c,n - 32,n); x = mux_64 (c,y,x);
    y = x >> 16; c = neq_64 (y,0); n = mux_64 (c,n - 16,n); x = mux_64 (c,y,x);
    y = x >> 8;  c = neq_64 (y,0); n = mux_64 (c,n - 8,n);  x = mux_64 (c,y,x);
    y = x >> 4;  c = neq_64 (y,0); n = mux_64 (c,n - 4,n);  x = mux_64 (c,y,x);
    y = x >> 2;  c = neq_64 (y,0); n = mux_64 (c,n - 2,n);  x = mux_64 (c,y,x);
    // final step: x is now in {0,1,2,3}; n - 2 when bit 1 is set, else n - x
    y = x >> 1;  c = neq_64 (y,0); n = mux_64 (c,n - 2,n - x);
    return n;
}
// 64x64->128 bit unsigned multiplication: schoolbook method with four
// 32x32-bit partial products
ulonglong2 my_mul_64_128 (uint64_t a,uint64_t b)
{
    ulonglong2 res;
    const uint64_t mask_lo32 = 0xffffffffULL;
    uint64_t al = a & mask_lo32;
    uint64_t ah = a >> 32;
    uint64_t bl = b & mask_lo32;
    uint64_t bh = b >> 32;
    uint64_t ll = al * bl; // contributes to bits   0.. 63
    uint64_t lh = al * bh; // contributes to bits  32.. 95
    uint64_t hl = ah * bl; // contributes to bits  32.. 95
    uint64_t hh = ah * bh; // contributes to bits  64..127
    // carry out of the low 64-bit column when the three 32-bit-aligned
    // partial sums are combined
    uint64_t carry = ((ll >> 32) + (lh & mask_lo32) + (hl & mask_lo32)) >> 32;
    res.x = ll + (lh << 32) + (hl << 32);
    res.y = hh + (lh >> 32) + (hl >> 32) + carry;
    return res;
}
// 128/64->64 bit unsigned division (reconstructed: the scraped copy
// collapsed most argument lists). Requires dvsr != 0 and dvnd.y < dvsr so
// the quotient fits in 64 bits; main() below only generates such operands.
// Long division in base 2^32 per TAOCP Algorithm D: each estimated quotient
// "digit" may overshoot by at most 2, so two branchless correction rounds
// (decrement digit, add divisor back) follow each estimate.
uint64_t my_div_128_64 (ulonglong2 dvnd,uint64_t dvsr)
{
    ulonglong2 prod,rem;
    uint64_t cy,t0,t1;
    uint64_t lz,q,qh,ql,rem_msb,nrm_dvsr;
    // normalize divisor; adjust dividend accordingly (initial partial remainder)
    lz = clz_64 (dvsr);
    nrm_dvsr = dvsr << lz;
    // ((dvnd.x >> (63 - lz)) >> 1) equals dvnd.x >> (64 - lz) for lz > 0 and
    // avoids the undefined shift by 64 when lz == 0 (mux_64 evaluates both
    // arms, so the discarded arm must still be well-defined)
    rem.y = (dvnd.y << lz) | mux_64 (neq_64 (lz,0), (dvnd.x >> (63 - lz)) >> 1, 0);
    rem.x = dvnd.x << lz;
    // compute most significant quotient "digit"; TAOCP: may be off by 0,+1,+2.
    // Clamp the estimate to 2^32-1 when the leading "digits" are equal.
    qh = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffffULL, rem.y / (nrm_dvsr >> 32));
    // compute remainder; correct quotient "digit" (up to twice) while the
    // remainder is negative, adding back nrm_dvsr * 2^32 each time
    prod = my_mul_64_128 (qh << 32, nrm_dvsr);
    rem = sub_128 (rem, prod);
    rem_msb = rem.y >> 63;
    qh = mux_64 (rem_msb, qh - 1, qh);
    rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x);
    rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y);
    rem_msb = rem.y >> 63;
    qh = mux_64 (rem_msb, qh - 1, qh);
    rem.x = mux_64 (rem_msb, ADDcc (rem.x, nrm_dvsr << 32, cy, t0, t1), rem.x);
    rem.y = mux_64 (rem_msb, ADDC (rem.y, nrm_dvsr >> 32, cy, t0, t1), rem.y);
    // pull down next dividend "digit"
    rem.y = (rem.y << 32) | (rem.x >> 32);
    rem.x = rem.x << 32;
    // compute least significant quotient "digit"; again off by at most +2
    ql = mux_64 (geu_64 (rem.y >> 32, nrm_dvsr >> 32), 0xffffffffULL, rem.y / (nrm_dvsr >> 32));
    // combine partial results into complete quotient
    q = (qh << 32) + ql;
    // compute the remainder against the ORIGINAL operands; correct the
    // quotient (up to twice) while the remainder is negative
    prod = my_mul_64_128 (q, dvsr);
    rem = sub_128 (dvnd, prod);
    rem_msb = rem.y >> 63;
    q = mux_64 (rem_msb, q - 1, q);
    rem.x = mux_64 (rem_msb, ADDcc (rem.x, dvsr, cy, t0, t1), rem.x);
    rem.y = mux_64 (rem_msb, ADDC (rem.y, 0, cy, t0, t1), rem.y);
    rem_msb = rem.y >> 63;
    q = mux_64 (rem_msb, q - 1, q);
    return q;
}
// reference: 128/64->64 bit division via the hardware DIV instruction
// (x86-64 only). The caller must ensure the quotient fits in 64 bits --
// see the overflow guard in main() -- since DIV faults otherwise.
uint64_t ref_div_128_64 (ulonglong2 dvnd,uint64_t dvsr)
{
uint64_t quot;
#if ASM_STYLE == MSVC_STYLE
// MSVC-style inline asm: dividend low half in RAX, high half in RDX
__asm mov rax,qword ptr [dvnd.x];
__asm mov rdx,qword ptr [dvnd.y];
__asm div qword ptr [dvsr];
__asm mov qword ptr [quot],rax;
#else // ASM_STYLE
// GCC-style inline asm: same register protocol, quotient comes back in RAX.
// NOTE(review): the "r" inputs could in principle be allocated to rax/rdx,
// which the template overwrites before reading %2/%3 -- worked with the
// author's compiler, but fragile; confirm before reuse.
__asm__ (
"movq %1,%%rax\n\t"
"movq %2,%%rdx\n\t"
"divq %3\n\t"
"movq %%rax,%0\n\t"
: "=r" (quot)
: "r" (dvnd.x),"r" (dvnd.y),"r" (dvsr)
: "rax","rdx");
#endif // ASM_STYLE
return quot;
}
// reference: 64x64->128 bit multiplication via the hardware MUL instruction
// (x86-64 only; product is delivered in RDX:RAX)
ulonglong2 ref_mul_64_128 (uint64_t a,uint64_t b)
{
ulonglong2 prod;
#if ASM_STYLE == MSVC_STYLE
__asm mov rax,qword ptr [a];
__asm mul qword ptr [b];
__asm mov qword ptr [prod.x],rax;
__asm mov qword ptr [prod.y],rdx;
#else // ASM_STYLE
// GCC-style inline asm: low half to prod.x, high half to prod.y.
// NOTE(review): as in ref_div_128_64, the "r" inputs are not protected
// against allocation in rax/rdx -- confirm before reuse.
__asm__(
"movq %2,%%rax\n\t"
"mulq %3\n\t"
"movq %%rax,%0\n\t"
"movq %%rdx,%1\n\t"
: "=r" (prod.x),"=r" (prod.y)
: "r" (a),"r" (b)
: "rax","rdx");
#endif // ASM_STYLE
return prod;
}
/*
https://groups.google.com/forum/#!original/comp.lang.c/qFv18ql_WlU/IK8KGZZFJx4J
From: geo <gmars...@gmail.com>
Newsgroups: sci.math,comp.lang.c,comp.lang.fortran
Subject: 64-bit KISS RNGs
Date: Sat,28 Feb 2009 04:30:48 -0800 (PST)
This 64-bit KISS RNG has three components,each nearly
good enough to serve alone. The components are:
Multiply-With-Carry (MWC),period (2^121+2^63-1)
Xorshift (XSH),period 2^64-1
Congruential (CNG),period 2^64
*/
// state for Marsaglia's 64-bit KISS RNG (seeds from the posting quoted above)
static uint64_t kiss64_x = 1234567890987654321ULL;
static uint64_t kiss64_c = 123456123456123456ULL;
static uint64_t kiss64_y = 362436362436362436ULL;
static uint64_t kiss64_z = 1066149217761810ULL;
static uint64_t kiss64_t;
// multiply-with-carry component, period 2^121+2^63-1
#define MWC64 (kiss64_t = (kiss64_x << 58) + kiss64_c,\
kiss64_c = (kiss64_x >> 6),kiss64_x += kiss64_t,\
kiss64_c += (kiss64_x < kiss64_t),kiss64_x)
// xorshift component, period 2^64-1
#define XSH64 (kiss64_y ^= (kiss64_y << 13),kiss64_y ^= (kiss64_y >> 17),\
kiss64_y ^= (kiss64_y << 43))
// linear congruential component, period 2^64
#define CNG64 (kiss64_z = 6906969069ULL * kiss64_z + 1234567ULL)
// combined generator: one fresh 64-bit pseudo-random value per evaluation
#define KISS64 (MWC64 + XSH64 + CNG64)
// NOTE(review): PATTERN appears unused; PATTERN_TEST above selects the mode
#define PATTERN (1)
// endless smoke test: compare my_div_128_64() against the hardware DIV
// reference on either patterned or purely random operands. Fix: the scraped
// copy dropped most printf arguments on the two error paths; both format
// strings take seven values (dvnd.y/x, dvsr, res, ref, diff.y/x).
int main (void)
{
    ulonglong2 dvnd,prod,diff;
    uint64_t dvsr,ref,res,count = 0;
    do {
        count++;
        do {
#if PATTERN_TEST
            // operands built from sums/differences of powers of two,
            // useful for hitting corner cases
            ulonglong2 temp1,temp2;
            uint64_t shift,shift1,shift2;
            shift = KISS64 & 127;
            temp1.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp1.x = (shift > 63) ? 0ULL : (1ULL << shift);
            shift = KISS64 & 127;
            temp2.y = (shift > 63) ? (1ULL << (shift - 64)) : 0;
            temp2.x = (shift > 63) ? 0ULL : (1ULL << shift);
            dvnd = (KISS64 & 1) ? add_128 (temp1,temp2) : sub_128 (temp1,temp2);
            shift1 = KISS64 & 63;
            shift2 = KISS64 & 63;
            dvsr = (KISS64 & 1) ? ((1ULL << shift1) + (1ULL << shift2)) :
                                  ((1ULL << shift1) - (1ULL << shift2));
#else // PATTERN_TEST
            dvnd.y = KISS64;
            dvnd.x = KISS64;
            dvsr = KISS64;
#endif // PATTERN_TEST
        } while ((dvsr == 0ULL) || (dvnd.y >= dvsr)); // guard against overflow
        ref = ref_div_128_64 (dvnd,dvsr);
        res = my_div_128_64 (dvnd,dvsr);
        // check 1: dvnd - res*dvsr must be a valid remainder in [0, dvsr)
        prod = ref_mul_64_128 (res,dvsr);
        diff = sub_128 (dvnd,prod);
        if ((diff.y != 0) || (diff.x >= dvsr)) {
            printf ("error @ chk1: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n",
                    dvnd.y,dvnd.x,dvsr,res,ref,diff.y,diff.x);
            return EXIT_FAILURE;
        }
        // check 2: quotient must match the hardware DIV reference exactly
        if (res != ref) {
            printf ("error @ chk2: dvnd= %016llx_%016llx dvsr=%016llx res=%016llx ref=%016llx diff=%016llx_%016llx\n",
                    dvnd.y,dvnd.x,dvsr,res,ref,diff.y,diff.x);
            return EXIT_FAILURE;
        }
        if ((count & 0xffffff) == 0) printf ("\r%llu",count);
    } while (dvsr); // dvsr is always non-zero here: runs until interrupted
    return EXIT_SUCCESS;
}
如果编程语言受到更严格的限制,那么除了谓词函数 neq_64()、ltu_64() 和 geu_64() 中使用的那些之外,所有移位和所有逻辑运算都可以用算术代替。示例 ISO-C99 代码如下所示。可能存在特定于该编程语言的方法来实现这些谓词,而无需使用逻辑运算。
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#define PATTERN_TEST (0)
#define MSVC_STYLE (1)
#define GCC_STYLE (2)
#define ASM_STYLE (GCC_STYLE)
#define POW2_63 (0x8000000000000000ULL)
#define POW2_32 (0x0000000100000000ULL)
#define POW2_16 (0x0000000000010000ULL)
#define POW2_8 (0x0000000000000100ULL)
#define POW2_5 (0x0000000000000020ULL)
#define POW2_4 (0x0000000000000010ULL)
#define POW2_3 (0x0000000000000008ULL)
#define POW2_2 (0x0000000000000004ULL)
#define POW2_1 (0x0000000000000002ULL)
#define POW2_0 (0x0000000000000001ULL)
// our own 128-bit unsigned integer type
// (x holds the least significant 64 bits, y the most significant 64 bits)
typedef struct
{
uint64_t x; // low 64 bits
uint64_t y; // high 64 bits
} ulonglong2;
// (a < b) ? 1 : 0 -- arithmetic-flavored variant: division by a power of
// two stands in for the right shifts of the first listing. Computes the
// carry-out of ~a + b, set exactly when a < b.
uint64_t ltu_64 (uint64_t a,uint64_t b)
{
    uint64_t not_a = ~a;
    uint64_t half_sum = (not_a & b) + ((not_a ^ b) / POW2_1);
    return half_sum / POW2_63;
}
// (a >= b) ? 1 : 0 -- arithmetic-flavored variant: division by a power of
// two stands in for the right shifts. Computes the carry-out of a + ~b + 1,
// set exactly when a >= b.
uint64_t geu_64 (uint64_t a,uint64_t b)
{
    uint64_t not_b = ~b;
    uint64_t t = (a | not_b) - ((a ^ not_b) / POW2_1);
    return t / POW2_63;
}
// (a != b) ? 1 : 0 -- for a != b, the unsigned values (a-b) and (b-a) sum
// to 2^64, so at least one of them is >= 2^63 and their OR has the top bit
// set; dividing by 2^63 extracts that bit
uint64_t neq_64 (uint64_t a,uint64_t b)
{
    uint64_t fwd = a - b;
    uint64_t rev = b - a;
    return (fwd | rev) / POW2_63;
}
//#define ADDcc(a,int64_t b)
{
return cond * a + (1 - cond) * b;
}
// count leading zeros; returns 64 for x == 0 (reconstructed: the scraped
// copy collapsed the argument lists). Branch-free binary search as in the
// first listing, with division by powers of two replacing the right shifts.
uint64_t clz_64 (uint64_t x)
{
    uint64_t n = 64;
    uint64_t c,y;
    y = x / POW2_32; c = neq_64 (y,0); n = mux_64 (c,n - 32,n); x = mux_64 (c,y,x);
    y = x / POW2_16; c = neq_64 (y,0); n = mux_64 (c,n - 16,n); x = mux_64 (c,y,x);
    y = x / POW2_8;  c = neq_64 (y,0); n = mux_64 (c,n - 8,n);  x = mux_64 (c,y,x);
    y = x / POW2_4;  c = neq_64 (y,0); n = mux_64 (c,n - 4,n);  x = mux_64 (c,y,x);
    y = x / POW2_2;  c = neq_64 (y,0); n = mux_64 (c,n - 2,n);  x = mux_64 (c,y,x);
    // final step: x is now in {0,1,2,3}; n - 2 when bit 1 is set, else n - x
    y = x / POW2_1;  c = neq_64 (y,0); n = mux_64 (c,n - 2,n - x);
    return n;
}
// compute 2**i, 0 <= i <= 63, without shifts: decompose the exponent into
// its six bits and multiply in the corresponding power-of-two factor for
// each bit that is set
uint64_t pow2i (uint64_t i)
{
    uint64_t result = 1ULL;
    // (i / POW2_k) % 2 extracts bit k of the exponent
    result = mux_64 ((i / POW2_5) % 2, result * POW2_32, result);
    result = mux_64 ((i / POW2_4) % 2, result * POW2_16, result);
    result = mux_64 ((i / POW2_3) % 2, result * POW2_8, result);
    result = mux_64 ((i / POW2_2) % 2, result * POW2_4, result);
    result = mux_64 ((i / POW2_1) % 2, result * POW2_2, result);
    result = mux_64 ((i / POW2_0) % 2, result * POW2_1, result);
    return result;
}
// 64x64->128 bit unsigned multiplication: schoolbook method with four
// 32x32-bit partials; division/multiplication by 2^32 replace the shifts
// and % replaces the masking of the first listing
ulonglong2 my_mul_64_128 (uint64_t a,uint64_t b)
{
    ulonglong2 res;
    uint64_t hi_a = a / POW2_32;
    uint64_t lo_a = a % POW2_32;
    uint64_t hi_b = b / POW2_32;
    uint64_t lo_b = b % POW2_32;
    uint64_t ll = lo_a * lo_b;
    uint64_t lh = lo_a * hi_b;
    uint64_t hl = hi_a * lo_b;
    uint64_t hh = hi_a * hi_b;
    // carry out of the low 64-bit column
    uint64_t carry = ((ll / POW2_32) + (lh % POW2_32) + (hl % POW2_32)) / POW2_32;
    res.x = ll + (lh * POW2_32) + (hl * POW2_32);
    res.y = hh + (lh / POW2_32) + (hl / POW2_32) + carry;
    return res;
}
// 128/64->64 bit unsigned division
uint64_t my_div_128_64 (ulonglong2 dvnd,lz_nz,scal,shft,nrm_dvsr_hi,t1;
// normalize divisor; adjust dividend accordingly (initial partial remainder)
lz = clz_64 (dvsr);
lz_nz = neq_64 (lz,0);
scal = pow2i (lz);
shft = mux_64 (lz_nz,64 - lz,0);
nrm_dvsr = dvsr * scal;
rem.y = dvnd.y * scal + mux_64 (lz_nz,dvnd.x / pow2i (shft),0);
rem.x = dvnd.x * scal;
nrm_dvsr_hi = nrm_dvsr / POW2_32;
// compute most significant quotient "digit"; TAOCP: may be off by 0,+2
qh = mux_64 (geu_64 (rem.y / POW2_32,nrm_dvsr_hi),rem.y / (nrm_dvsr_hi));
// compute remainder; correct quotient "digit" if remainder negative
prod = my_mul_64_128 (qh * POW2_32,prod);
rem_msb= rem.y / POW2_63;
qh = mux_64 (rem_msb,nrm_dvsr * POW2_32,nrm_dvsr / POW2_32,rem.y);
rem_msb= rem.y / POW2_63;
qh = mux_64 (rem_msb,rem.y);
// pull down next dividend "digit"
rem.y = rem.y * POW2_32 + rem.x / POW2_32;
rem.x = rem.x * POW2_32;
// compute least significant quotient "digit"; TAOCP: may be off by 0,+2
ql = mux_64 (geu_64 (rem.y / POW2_32,rem.y / (nrm_dvsr_hi));
// combine partial results into complete quotient
q = qh * POW2_32 + ql;
// compute remainder; correct quotient "digit" if remainder negative
prod = my_mul_64_128 (q,prod);
rem_msb= rem.y / POW2_63;
q = mux_64 (rem_msb,rem.y);
rem_msb= rem.y / POW2_63;
q = mux_64 (rem_msb,count);
} while (dvsr);
return EXIT_SUCCESS;
}
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。