punpcklbw在 MMX/SSE/AVX 中交错有哪些用例?

如何解决punpcklbw在 MMX/SSE/AVX 中交错有哪些用例?

  1. 哪些类的算法可以使用 punpcklbw

  2. 特别是punpcklbw xmm0,xmm0在做什么?

  3. 然而,maskedPow2_Value 有什么用?

    maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val; // Val 是整数
    maskedPow2_Value = 0x101010101010101i64 * maskedValue;

(或 mov r9,101010101010101h; imul rdx,r9; 两次)

一个完整的例子(该函数被命名为 CompressPacket 但它可能会产生误导),作为 IDA 反编译的结果:

/*
 * IDA Hex-Rays pseudo-code of a byte-fill routine: it stores the low byte of
 * Val into Size bytes starting at Dst and returns Dst.
 *
 * The name CompressPacket was chosen by the analyst; the "CODE XREF: j_memset"
 * annotation in the disassembly listing suggests this is really memset.
 *
 * Strategy visible below:
 *   - Size 0..15: a switch jumps into chains of 8/4/2/1-byte stores that are
 *     addressed backwards from the end of the buffer.
 *   - Size >= 16: memset (rep stosb in the listing), unrolled 8-byte scalar
 *     stores, or unrolled 16-byte SSE stores, selected by bits 1 and 2 of
 *     dword_7FFFF4B237D8.
 *     NOTE(review): presumably a CPU-feature flag word - confirm.
 *
 * Parameters: Dst - destination buffer, Val - fill value (only the low byte
 * is used), Size - number of bytes to fill.
 * Returns: Dst.
 */
void *__cdecl CompressPacket(void *Dst,int Val,size_t Size)
{
  __int64 maskedPow2_Value; // rdx
  unsigned int v5; // ecx
  __int64 *bufferOut; // rcx
  size_t size_; // r9
  size_t i; // r9
  size_t size__; // r9
  size_t counter; // r8
  size_t j; // r9
  void *result; // rax
  __m128i v13; // xmm0
  __int64 lsb4; // rax
  size_t counter1; // r9
  size_t k; // r9
  size_t lsb4_; // r8
  __int64 maskedValue; // rdx

  // Keep only the low byte of Val, zero-extended (movzx edx,dl in the listing).
  *(_QWORD *)&Val = (unsigned __int8)Val;
  // Multiplying a single byte by 0x0101010101010101 replicates it into all
  // 8 bytes of the 64-bit value.
  maskedValue = 0x101010101010101i64 * *(_QWORD *)&Val;
  // One-past-the-end pointer; the small-size cases below store relative to it.
  bufferOut = (__int64 *)((char *)Dst + Size);
  result = Dst;
  switch ( Size )
  {
    // Sizes 0..15: each case writes exactly Size bytes using the widest stores
    // available, falling through / jumping to shared tails (e.g. 15 = 8+4+2+1).
    case 0ui64:
      return result;
    case 1ui64:
      goto LBL_1_F;
    case 2ui64:
      goto LBL_2_E;
    case 3ui64:
      goto LBL_3_F;
    case 4ui64:
      goto LBL_4_C;
    case 5ui64:
      goto LBL_5_D;
    case 6ui64:
      goto LBL_6_E;
    case 7ui64:
      goto LBL_7_F;
    case 8ui64:
      *(bufferOut - 1) = maskedValue;
      return result;
    case 9ui64:
      *(__int64 *)((char *)bufferOut - 9) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xAui64:
      *(__int64 *)((char *)bufferOut - 10) = maskedValue;
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xBui64:
      *(__int64 *)((char *)bufferOut - 11) = maskedValue;
      goto LBL_3_F;
    case 0xCui64:
      *(__int64 *)((char *)bufferOut - 12) = maskedValue;
LBL_4_C:
      *((_DWORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xDui64:
      *(__int64 *)((char *)bufferOut - 13) = maskedValue;
LBL_5_D:
      *(_DWORD *)((char *)bufferOut - 5) = maskedValue;
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    case 0xEui64:
      *(__int64 *)((char *)bufferOut - 14) = maskedValue;
LBL_6_E:
      *(_DWORD *)((char *)bufferOut - 6) = maskedValue;
LBL_2_E:
      *((_WORD *)bufferOut - 1) = maskedValue;
      return result;
    case 0xFui64:
      *(__int64 *)((char *)bufferOut - 15) = maskedValue;
LBL_7_F:
      *(_DWORD *)((char *)bufferOut - 7) = maskedValue;
LBL_3_F:
      *(_WORD *)((char *)bufferOut - 3) = maskedValue;
LBL_1_F:
      *((_BYTE *)bufferOut - 1) = maskedValue;
      return result;
    default:
      // Size >= 16.
      // NOTE(review): in the disassembly listing rcx is advanced by Size
      // (add rcx,r8) only on the SetBytes15 path; on this default path rcx
      // still holds Dst, so bufferOut below effectively means Dst. The
      // Dst + Size assignment above looks like a decompiler merge artifact.
      if ( _bittest(dword_7FFFF4B237D8,1u) )
      {
        // Bit 1 set: the listing shows this as an inline rep stosb.
        memset(bufferOut,maskedValue,Size);
        return Dst;
      }
      // NOTE(review): the listing performs a single imul on this path (mset05)
      // on the zero-extended byte, i.e. the result is the byte replicated
      // 8 times. The second multiplication shown here appears to be a
      // decompiler artifact rather than a real "squared" mask - confirm.
      maskedPow2_Value = 0x101010101010101i64 * maskedValue;
      if ( !_bittest(dword_7FFFF4B237D8,2u) )
      {
        // Bit 2 clear: scalar path using 8-byte stores.
        if ( Size >= 0x40 )
        {
          // Number of bytes needed to reach the next 8-byte boundary.
          v5 = -(int)bufferOut & 7;
          if ( v5 )
          {
            // Cover the unaligned head with one (unaligned) 8-byte store.
            Size -= v5;
            *(_QWORD *)Dst = maskedPow2_Value;
          }
          bufferOut = (__int64 *)((char *)Dst + v5);
          size_ = Size;
          Size &= 0x3Fu;
          // 64 bytes per iteration (eight 8-byte stores); the last store of
          // each iteration sits in the for-loop's increment expression.
          for ( i = size_ >> 6; i; *(bufferOut - 1) = maskedPow2_Value )
          {
            *bufferOut = maskedPow2_Value;
            bufferOut[1] = maskedPow2_Value;
            bufferOut[2] = maskedPow2_Value;
            bufferOut += 8;
            *(bufferOut - 5) = maskedPow2_Value;
            *(bufferOut - 4) = maskedPow2_Value;
            --i;
            *(bufferOut - 3) = maskedPow2_Value;
            *(bufferOut - 2) = maskedPow2_Value;
          }
        }
        size__ = Size;
        counter = Size & 7;
        // Remaining whole 8-byte words.
        for ( j = size__ >> 3; j; --j )
          *bufferOut++ = maskedPow2_Value;
        // Final 0..7 bytes, one byte at a time.
        for ( ; counter; --counter )
        {
          *(_BYTE *)bufferOut = maskedPow2_Value;
          bufferOut = (__int64 *)((char *)bufferOut + 1);
        }
        return Dst;
      }
      // Bit 2 set: SSE path. movq + punpcklbw xmm0,xmm0 in the listing:
      // interleaving the low 8 bytes with themselves widens the 8-byte pattern
      // to 16 bytes; with identical input bytes this yields 16 copies.
      v13 = _mm_unpacklo_epi8((__m128i)(unsigned __int64)maskedPow2_Value,(__m128i)(unsigned __int64)maskedPow2_Value);
      if ( ((unsigned __int8)bufferOut & 0xF) != 0 )
      {
        // Unaligned head: one unaligned 16-byte store (movups), then advance
        // to the next 16-byte boundary and shrink Size by the bytes skipped.
        *(__m128i *)bufferOut = v13;
        lsb4 = (unsigned __int8)bufferOut & 0xF;
        bufferOut = (__int64 *)((char *)bufferOut - lsb4 + 16);
        Size = lsb4 + Size - 16;
      }
      counter1 = Size >> 7;
      if ( Size >> 7 )
      {
        // 128 bytes per iteration: eight aligned 16-byte stores (movaps).
        // bufferOut is an __int64 *, so += 16 advances by 128 bytes.
        do
        {
          *(__m128i *)bufferOut = v13;
          *((__m128i *)bufferOut + 1) = v13;
          bufferOut += 16;
          *((__m128i *)bufferOut - 6) = v13;
          *((__m128i *)bufferOut - 5) = v13;
          --counter1;
          *((__m128i *)bufferOut - 4) = v13;
          *((__m128i *)bufferOut - 3) = v13;
          *((__m128i *)bufferOut - 2) = v13;
          *((__m128i *)bufferOut - 1) = v13;
        }
        while ( counter1 );
        Size &= 0x7Fu;
      }
      // Remaining whole 16-byte chunks (+= 2 on an __int64 * is 16 bytes).
      for ( k = Size >> 4; k; --k )
      {
        *(__m128i *)bufferOut = v13;
        bufferOut += 2;
      }
      lsb4_ = Size & 0xF;
      // Tail of 1..15 bytes: one unaligned 16-byte store that ends exactly at
      // the end of the buffer, overlapping bytes that were already written.
      if ( lsb4_ )
        *(__m128i *)((char *)bufferOut + lsb4_ - 16) = v13;
      return Dst;
  }
}

以及 IDA 的反汇编:

.text:00007FFFF4AF6440 ; void *__cdecl CompressPacket(void *Dst,size_t Size)
.text:00007FFFF4AF6440 CompressPacket  proc near               ; CODE XREF: j_memset↑j
.text:00007FFFF4AF6440                                         ; Concurrency::details::ResourceManager::CreateAllocatedNodeData(void)+49↑p ...
.text:00007FFFF4AF6440                 mov     r11,rcx
.text:00007FFFF4AF6443                 movzx   edx,dl         ; Move with Zero-Extend
.text:00007FFFF4AF6446                 cmp     r8,10h         ; switch 16 cases
.text:00007FFFF4AF644A                 jb      SetBytes15      ; Jump if Below (CF=1)
.text:00007FFFF4AF6450
.text:00007FFFF4AF6450 def_7FFFF4AF65D2:                       ; jumptable 00007FFFF4AF65D2 default case
.text:00007FFFF4AF6450                 bt      cs:dword_7FFFF4B237D8,1
.text:00007FFFF4AF6458                 jnb     short mset05    ; Jump if Not Below (CF=0)
.text:00007FFFF4AF645A                 push    rdi
.text:00007FFFF4AF645B                 mov     rdi,rcx
.text:00007FFFF4AF645E                 mov     eax,edx
.text:00007FFFF4AF6460                 mov     rcx,r8
.text:00007FFFF4AF6463                 rep stosb               ; Store String
.text:00007FFFF4AF6465                 pop     rdi
.text:00007FFFF4AF6466                 jmp     short mset60    ; Jump
.text:00007FFFF4AF6468 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6468
.text:00007FFFF4AF6468 mset05:                                 ; CODE XREF: CompressPacket+18↑j
.text:00007FFFF4AF6468                 mov     r9,101010101010101h
.text:00007FFFF4AF6472                 imul    rdx,r9         ; Signed Multiply
.text:00007FFFF4AF6476                 bt      cs:dword_7FFFF4B237D8,2 ; Bit Test
.text:00007FFFF4AF647E                 jb      msetxmm10       ; Jump if Below (CF=1)
.text:00007FFFF4AF6484                 cmp     r8,40h ; '@'   ; Compare Two Operands
.text:00007FFFF4AF6488                 jb      short mset20    ; Jump if Below (CF=1)
.text:00007FFFF4AF648A                 neg     rcx             ; Two's Complement Negation
.text:00007FFFF4AF648D                 and     ecx,7          ; Logical AND
.text:00007FFFF4AF6490                 jz      short mset10    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6492                 sub     r8,rcx         ; Integer Subtraction
.text:00007FFFF4AF6495                 mov     [r11],rdx
.text:00007FFFF4AF6498
.text:00007FFFF4AF6498 mset10:                                 ; CODE XREF: CompressPacket+50↑j
.text:00007FFFF4AF6498                 add     rcx,r11        ; Add
.text:00007FFFF4AF649B                 mov     r9,r8
.text:00007FFFF4AF649E                 and     r8,3Fh         ; Logical AND
.text:00007FFFF4AF64A2                 shr     r9,6           ; Shift Logical Right
.text:00007FFFF4AF64A6                 jnz     short mset80    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64A8
.text:00007FFFF4AF64A8 mset20:                                 ; CODE XREF: CompressPacket+48↑j
.text:00007FFFF4AF64A8                                         ; CompressPacket+CF↓j
.text:00007FFFF4AF64A8                 mov     r9,r8
.text:00007FFFF4AF64AB                 and     r8,7           ; Logical AND
.text:00007FFFF4AF64AF                 shr     r9,3           ; Shift Logical Right
.text:00007FFFF4AF64B3                 jz      short mset40    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64B5                 db      66h,66h
.text:00007FFFF4AF64B5                 xchg    ax,ax          ; Exchange Register/Memory with Register
.text:00007FFFF4AF64B9                 nop                     ; No Operation
.text:00007FFFF4AF64BA
.text:00007FFFF4AF64BA mset30:                                 ; CODE XREF: CompressPacket+84↓j
.text:00007FFFF4AF64BA                 mov     [rcx],rdx
.text:00007FFFF4AF64BD                 add     rcx,8          ; Add
.text:00007FFFF4AF64C1                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF64C4                 jnz     short mset30    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64C6
.text:00007FFFF4AF64C6 mset40:                                 ; CODE XREF: CompressPacket+73↑j
.text:00007FFFF4AF64C6                 test    r8,r8          ; Logical Compare
.text:00007FFFF4AF64C9                 jz      short mset60    ; Jump if Zero (ZF=1)
.text:00007FFFF4AF64CB
.text:00007FFFF4AF64CB mset50:                                 ; CODE XREF: CompressPacket+93↓j
.text:00007FFFF4AF64CB                 mov     [rcx],dl
.text:00007FFFF4AF64CD                 inc     rcx             ; Increment by 1
.text:00007FFFF4AF64D0                 dec     r8              ; Decrement by 1
.text:00007FFFF4AF64D3                 jnz     short mset50    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF64D5
.text:00007FFFF4AF64D5 mset60:                                 ; CODE XREF: CompressPacket+26↑j
.text:00007FFFF4AF64D5                                         ; CompressPacket+89↑j
.text:00007FFFF4AF64D5                 mov     rax,r11
.text:00007FFFF4AF64D8                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF64D8 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64D9                 db 0Fh,1Fh,80h,4 dup(0)
.text:00007FFFF4AF64E0                 db 3 dup(66h),90h
.text:00007FFFF4AF64E4                 db 2 dup(66h),90h
.text:00007FFFF4AF64E7 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF64E7
.text:00007FFFF4AF64E7 mset80:                                 ; CODE XREF: CompressPacket+66↑j
.text:00007FFFF4AF64E7                                         ; CompressPacket+CD↓j
.text:00007FFFF4AF64E7                 mov     [rcx],rdx
.text:00007FFFF4AF64EA                 mov     [rcx+8],rdx
.text:00007FFFF4AF64EE                 mov     [rcx+10h],rdx
.text:00007FFFF4AF64F2                 add     rcx,40h ; '@'  ; Add
.text:00007FFFF4AF64F6                 mov     [rcx-28h],rdx
.text:00007FFFF4AF64FA                 mov     [rcx-20h],rdx
.text:00007FFFF4AF64FE                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF6501                 mov     [rcx-18h],rdx
.text:00007FFFF4AF6505                 mov     [rcx-10h],rdx
.text:00007FFFF4AF6509                 mov     [rcx-8],rdx
.text:00007FFFF4AF650D                 jnz     short mset80    ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF650F                 jmp     short mset20    ; Jump
.text:00007FFFF4AF650F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6511                 align 20h
.text:00007FFFF4AF6520
.text:00007FFFF4AF6520 msetxmm10:                              ; CODE XREF: CompressPacket+3E↑j
.text:00007FFFF4AF6520                 movq    xmm0,rdx       ; Move 64 bits
.text:00007FFFF4AF6525                 punpcklbw xmm0,xmm0    ; Unpack Low Packed Data (Byte->Word)
.text:00007FFFF4AF6529                 test    cl,0Fh         ; Logical Compare
.text:00007FFFF4AF652C                 jz      short msetxmm20 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF652E                 movups  xmmword ptr [rcx],xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF6531                 mov     rax,rcx
.text:00007FFFF4AF6534                 and     rax,0Fh        ; Logical AND
.text:00007FFFF4AF6538                 add     rcx,10h        ; Add
.text:00007FFFF4AF653C                 sub     rcx,rax        ; Integer Subtraction
.text:00007FFFF4AF653F                 lea     r8,[rax+r8-10h] ; Load Effective Address
.text:00007FFFF4AF6544
.text:00007FFFF4AF6544 msetxmm20:                              ; CODE XREF: CompressPacket+EC↑j
.text:00007FFFF4AF6544                 mov     r9,r8
.text:00007FFFF4AF6547                 shr     r9,7           ; Shift Logical Right
.text:00007FFFF4AF654B                 jz      short msetxmm40 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF654D                 jmp     short msetxmm30 ; Jump
.text:00007FFFF4AF654D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF654F                 align 10h
.text:00007FFFF4AF6550
.text:00007FFFF4AF6550 msetxmm30:                              ; CODE XREF: CompressPacket+10D↑j
.text:00007FFFF4AF6550                                         ; CompressPacket+139↓j
.text:00007FFFF4AF6550                 movaps  xmmword ptr [rcx],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6553                 movaps  xmmword ptr [rcx+10h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6557                 add     rcx,80h ; '€'  ; Add
.text:00007FFFF4AF655E                 movaps  xmmword ptr [rcx-60h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6562                 movaps  xmmword ptr [rcx-50h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6566                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF6569                 movaps  xmmword ptr [rcx-40h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF656D                 movaps  xmmword ptr [rcx-30h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6571                 movaps  xmmword ptr [rcx-20h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6575                 movaps  xmmword ptr [rcx-10h],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6579                 jnz     short msetxmm30 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF657B                 and     r8,7Fh         ; Logical AND
.text:00007FFFF4AF657F
.text:00007FFFF4AF657F msetxmm40:                              ; CODE XREF: CompressPacket+10B↑j
.text:00007FFFF4AF657F                 mov     r9,r8
.text:00007FFFF4AF6582                 shr     r9,4           ; Shift Logical Right
.text:00007FFFF4AF6586                 jz      short msetxmm60 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF6588                 nop     dword ptr [rax+rax+00000000h] ; No Operation
.text:00007FFFF4AF6590
.text:00007FFFF4AF6590 msetxmm50:                              ; CODE XREF: CompressPacket+15A↓j
.text:00007FFFF4AF6590                 movaps  xmmword ptr [rcx],xmm0 ; Move Aligned Four Packed Single-FP
.text:00007FFFF4AF6593                 add     rcx,10h        ; Add
.text:00007FFFF4AF6597                 dec     r9              ; Decrement by 1
.text:00007FFFF4AF659A                 jnz     short msetxmm50 ; Jump if Not Zero (ZF=0)
.text:00007FFFF4AF659C
.text:00007FFFF4AF659C msetxmm60:                              ; CODE XREF: CompressPacket+146↑j
.text:00007FFFF4AF659C                 and     r8,0Fh         ; Logical AND
.text:00007FFFF4AF65A0                 jz      short msetxmm70 ; Jump if Zero (ZF=1)
.text:00007FFFF4AF65A2                 movups  xmmword ptr [r8+rcx-10h],xmm0 ; Move Unaligned Four Packed Single-FP
.text:00007FFFF4AF65A8
.text:00007FFFF4AF65A8 msetxmm70:                              ; CODE XREF: CompressPacket+160↑j
.text:00007FFFF4AF65A8                 mov     rax,r11
.text:00007FFFF4AF65AB                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF65AC ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65AC
.text:00007FFFF4AF65AC SetBytes15:                             ; CODE XREF: CompressPacket+A↑j
.text:00007FFFF4AF65AC                 mov     r9,101010101010101h
.text:00007FFFF4AF65B6                 imul    rdx,r9         ; Signed Multiply
.text:00007FFFF4AF65BA                 lea     r9,cs:7FFFF4AB0000h ; Load Effective Address
.text:00007FFFF4AF65C1                 mov     eax,ds:(jpt_7FFFF4AF65D2 - 7FFFF4AB0000h)[r9+r8*4]
.text:00007FFFF4AF65C9                 add     r9,rax         ; Add
.text:00007FFFF4AF65CC                 add     rcx,r8         ; Add
.text:00007FFFF4AF65CF                 mov     rax,r11
.text:00007FFFF4AF65D2                 jmp     r9              ; switch jump
.text:00007FFFF4AF65D2 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF65D5 jpt_7FFFF4AF65D2 dd offset msetTab00 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                                         ; DATA XREF: CompressPacket+181↑r
.text:00007FFFF4AF65D5                 dd offset msetTab01 - 7FFFF4AB0000h ; jump table for switch statement
.text:00007FFFF4AF65D5                 dd offset msetTab02 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab03 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab04 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab05 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab06 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab07 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab08 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab09 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab10 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab11 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab12 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab13 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab14 - 7FFFF4AB0000h
.text:00007FFFF4AF65D5                 dd offset msetTab15 - 7FFFF4AB0000h
.text:00007FFFF4AF6615                 align 20h
.text:00007FFFF4AF6620
.text:00007FFFF4AF6620 msetTab15:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6620                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6620                 mov     [rcx-0Fh],rdx  ; jumptable 00007FFFF4AF65D2 case 15
.text:00007FFFF4AF6624
.text:00007FFFF4AF6624 msetTab07:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6624                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6624                 mov     [rcx-7],edx    ; jumptable 00007FFFF4AF65D2 case 7
.text:00007FFFF4AF6627
.text:00007FFFF4AF6627 msetTab03:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6627                                         ; CompressPacket+1F3↓j
.text:00007FFFF4AF6627                                         ; DATA XREF: ...
.text:00007FFFF4AF6627                 mov     [rcx-3],dx     ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF662B
.text:00007FFFF4AF662B msetTab01:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662B                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662B                 mov     [rcx-1],dl     ; jumptable 00007FFFF4AF65D2 case 1
.text:00007FFFF4AF662E
.text:00007FFFF4AF662E msetTab00:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662E                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662E                 retn                    ; jumptable 00007FFFF4AF65D2 case 0
.text:00007FFFF4AF662F ; ---------------------------------------------------------------------------
.text:00007FFFF4AF662F
.text:00007FFFF4AF662F msetTab11:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF662F                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF662F                 mov     [rcx-0Bh],rdx  ; jumptable 00007FFFF4AF65D2 case 11
.text:00007FFFF4AF6633                 jmp     short msetTab03 ; jumptable 00007FFFF4AF65D2 case 3
.text:00007FFFF4AF6635 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6635
.text:00007FFFF4AF6635 msetTab14:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6635                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6635                 mov     [rcx-0Eh],rdx  ; jumptable 00007FFFF4AF65D2 case 14
.text:00007FFFF4AF6639
.text:00007FFFF4AF6639 msetTab06:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6639                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6639                 mov     [rcx-6],edx    ; jumptable 00007FFFF4AF65D2 case 6
.text:00007FFFF4AF663C
.text:00007FFFF4AF663C msetTab02:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF663C                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF663C                 mov     [rcx-2],dx     ; jumptable 00007FFFF4AF65D2 case 2
.text:00007FFFF4AF6640                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6641 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6641
.text:00007FFFF4AF6641 msetTab13:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6641                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6641                 mov     [rcx-0Dh],rdx  ; jumptable 00007FFFF4AF65D2 case 13
.text:00007FFFF4AF6645
.text:00007FFFF4AF6645 msetTab05:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6645                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6645                 mov     [rcx-5],edx    ; jumptable 00007FFFF4AF65D2 case 5
.text:00007FFFF4AF6648                 mov     [rcx-1],dl
.text:00007FFFF4AF664B                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF664C ; ---------------------------------------------------------------------------
.text:00007FFFF4AF664C
.text:00007FFFF4AF664C msetTab12:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF664C                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF664C                 mov     [rcx-0Ch],rdx  ; jumptable 00007FFFF4AF65D2 case 12
.text:00007FFFF4AF6650
.text:00007FFFF4AF6650 msetTab04:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6650                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6650                 mov     [rcx-4],edx    ; jumptable 00007FFFF4AF65D2 case 4
.text:00007FFFF4AF6653                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6654 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6654
.text:00007FFFF4AF6654 msetTab10:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6654                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6654                 mov     [rcx-0Ah],rdx  ; jumptable 00007FFFF4AF65D2 case 10
.text:00007FFFF4AF6658                 mov     [rcx-2],dx
.text:00007FFFF4AF665C                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF665D ; ---------------------------------------------------------------------------
.text:00007FFFF4AF665D
.text:00007FFFF4AF665D msetTab09:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF665D                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF665D                 mov     [rcx-9],rdx    ; jumptable 00007FFFF4AF65D2 case 9
.text:00007FFFF4AF6661                 mov     [rcx-1],dl
.text:00007FFFF4AF6664                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6665 ; ---------------------------------------------------------------------------
.text:00007FFFF4AF6665
.text:00007FFFF4AF6665 msetTab08:                              ; CODE XREF: CompressPacket+192↑j
.text:00007FFFF4AF6665                                         ; DATA XREF: CompressPacket:jpt_7FFFF4AF65D2↑o
.text:00007FFFF4AF6665                 mov     [rcx-8],rdx    ; jumptable 00007FFFF4AF65D2 case 8
.text:00007FFFF4AF6669                 retn                    ; Return Near from Procedure
.text:00007FFFF4AF6669 CompressPacket  endp

解决方法

一个常见的用例是与全零向量进行解包,将 8 位数字零扩展为 16 位,作用类似 SSE4.1 的 pmovzxbw。特别是分别解包 16 字节寄存器的低半部分和高半部分,得到两个各含 8 个 16 位元素的向量。这是“解包”这个名称唯一说得通的用例,而 packuswb 是它的逆操作,将 2 个寄存器合并为 1 个。(或用 packsswb 进行有符号饱和。)

“unpack”这个名字很奇怪;它只是一个从两个寄存器中交错元素的洗牌。 ARM NEON 有一个类似的洗牌指令,其助记符是 "zip"。


就您而言,它是将一个字节广播到整个 XMM 寄存器的步骤之一,属于 memset 实现的一部分。即它是 _mm_set1_epi8(x) 的一部分。

乘以 0x0101010101010101 会在 64 位整数中把一个字节重复 8 次。这样就可以用标量整数存储来处理零散的 8 字节(长度不是 16 的倍数时),例如 mov [r11],rdx 这条存储指令。

将这个 8 字节广播结果作为输入(通过 movq 移入 XMM 寄存器),只需要一次 SIMD 洗牌。用 punpcklqdq 复制低 8 字节是我的选择,因为 8 字节粒度的洗牌在 Core 2 这样的老式 CPU 上效率更高。但是将字节与自身交错是等效的,因为这些字节本来就都相同,结果都是一个包含 16 个相同字节副本的 XMM 寄存器。

事实上,SSE2 可以用一条指令广播一个双字:pshufd xmm0,xmm0,0,所以如果不是想要一个 8 字节的标量,它可以只使用 imul edx,r9d,0x01010101

使用 8 字节 mov 和 16 字节 movups 存储实现 memset 当然需要将其作为输入,如果它使用该策略而不是 rep stosb 策略。

使用 SSSE3,您可以直接广播单个字节:用一条以全零向量作为控制掩码的 pshufb(无需先做乘法),为目标的每个元素选择源的第 0 个元素。或者使用 AVX2 vpbroadcastb。这样就可以跳过整数乘法步骤;8 字节存储可以改用来自 xmm0 的 movq [mem],xmm0,而不是来自 RDX 的存储。

如果 xmm 寄存器的最低位置有一个字节,而其他元素中是垃圾数据(即如果你没有使用 imul),2x punpcklbw + pshufd 可以只用 SSE2 完成广播。或者当然也可以用 punpcklbw xmm0,xmm0 / punpcklwd xmm0,xmm0 作为前 2 次洗牌。或 punpcklbw xmm0,xmm0 / pshuflw xmm0,xmm0,0 / punpcklqdq xmm0,xmm0

版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。

相关推荐


使用本地python环境可以成功执行 import pandas as pd import matplotlib.pyplot as plt # 设置字体 plt.rcParams['font.sans-serif'] = ['SimHei'] # 能正确显示负号 p
错误1:Request method ‘DELETE‘ not supported 错误还原:controller层有一个接口,访问该接口时报错:Request method ‘DELETE‘ not supported 错误原因:没有接收到前端传入的参数,修改为如下 参考 错误2:cannot r
错误1:启动docker镜像时报错:Error response from daemon: driver failed programming external connectivity on endpoint quirky_allen 解决方法:重启docker -> systemctl r
错误1:private field ‘xxx‘ is never assigned 按Alt+Enter快捷键,选择第2项 参考:https://blog.csdn.net/shi_hong_fei_hei/article/details/88814070 错误2:启动时报错,不能找到主启动类 #
报错如下,通过源不能下载,最后警告pip需升级版本 Requirement already satisfied: pip in c:\users\ychen\appdata\local\programs\python\python310\lib\site-packages (22.0.4) Coll
错误1:maven打包报错 错误还原:使用maven打包项目时报错如下 [ERROR] Failed to execute goal org.apache.maven.plugins:maven-resources-plugin:3.2.0:resources (default-resources)
错误1:服务调用时报错 服务消费者模块assess通过openFeign调用服务提供者模块hires 如下为服务提供者模块hires的控制层接口 @RestController @RequestMapping("/hires") public class FeignControl
错误1:运行项目后报如下错误 解决方案 报错2:Failed to execute goal org.apache.maven.plugins:maven-compiler-plugin:3.8.1:compile (default-compile) on project sb 解决方案:在pom.
参考 错误原因 过滤器或拦截器在生效时,redisTemplate还没有注入 解决方案:在注入容器时就生效 @Component //项目运行时就注入Spring容器 public class RedisBean { @Resource private RedisTemplate<String
使用vite构建项目报错 C:\Users\ychen\work>npm init @vitejs/app @vitejs/create-app is deprecated, use npm init vite instead C:\Users\ychen\AppData\Local\npm-
参考1 参考2 解决方案 # 点击安装源 协议选择 http:// 路径填写 mirrors.aliyun.com/centos/8.3.2011/BaseOS/x86_64/os URL类型 软件库URL 其他路径 # 版本 7 mirrors.aliyun.com/centos/7/os/x86
报错1 [root@slave1 data_mocker]# kafka-console-consumer.sh --bootstrap-server slave1:9092 --topic topic_db [2023-12-19 18:31:12,770] WARN [Consumer clie
错误1 # 重写数据 hive (edu)> insert overwrite table dwd_trade_cart_add_inc > select data.id, > data.user_id, > data.course_id, > date_format(
错误1 hive (edu)> insert into huanhuan values(1,'haoge'); Query ID = root_20240110071417_fe1517ad-3607-41f4-bdcf-d00b98ac443e Total jobs = 1
报错1:执行到如下就不执行了,没有显示Successfully registered new MBean. [root@slave1 bin]# /usr/local/software/flume-1.9.0/bin/flume-ng agent -n a1 -c /usr/local/softwa
虚拟机没有启动任何服务器查看jps会显示jps,如果没有显示任何东西 [root@slave2 ~]# jps 9647 Jps 解决方案 # 进入/tmp查看 [root@slave1 dfs]# cd /tmp [root@slave1 tmp]# ll 总用量 48 drwxr-xr-x. 2
报错1 hive> show databases; OK Failed with exception java.io.IOException:java.lang.RuntimeException: Error in configuring object Time taken: 0.474 se
报错1 [root@localhost ~]# vim -bash: vim: 未找到命令 安装vim yum -y install vim* # 查看是否安装成功 [root@hadoop01 hadoop]# rpm -qa |grep vim vim-X11-7.4.629-8.el7_9.x
修改hadoop配置 vi /usr/local/software/hadoop-2.9.2/etc/hadoop/yarn-site.xml # 添加如下 <configuration> <property> <name>yarn.nodemanager.res