解析 x86 机器代码指令的过程

如何解决解析 x86 机器代码指令的过程

我一直在思考反汇编器（disassembler），以及如何把机器代码转换回汇编（或者转换成某种可以像在虚拟机中那样求值的中间形式）。这让我找到了 xed，这是个不错的项目，但它极其复杂、难以理解。我在其中找到了大致符合我需要的一段代码，它基本上可以归结为：

// Pseudo-code outline of a decoder entry point: it runs six scanner stages in a
// fixed order, passing the same decoder object to each one.
// NOTE(review): the stage names suggest the fields prefix, opcode, ModR/M, SIB,
// displacement and immediate; what each scanner actually reads or stores is not
// shown here - confirm against the scanner implementations.
decode(decoder) {
  prefix_scanner(decoder)
  opcode_scanner(decoder)
  modrm_scanner(decoder)
  sib_scanner(decoder)
  disp_scanner(decoder)
  imm_scanner(decoder)
}

这段代码提示了把输入的指令位解析为某种结构或对象的基本过程。

另一个项目甚至可能更好，不过它不太活跃，尽管它自称全面而完整。其中有这样一个函数（代码的实现方式比较特别，所有东西都是全局变量……看起来更像是一个演示，而不是一个模块）：

function DecodeInstruction()
{
  //Reset Prefix adjustments,and vector setting adjustments.

  Reset();

  var out = ""; //The instruction code that will be returned back from this function.

  //Record the starting position.

  InstructionPos = GetPosition();

  //First read any opcodes (prefix) that act as adjustments to the main three operand decode functions ^DecodeRegValue()^,//^Decode_ModRM_SIB_Address()^,and ^DecodeImmediate()^.

  DecodePrefixAdjustments();

  //Only continue if an invalid opcode is not read by DecodePrefixAdjustments() for cpu bit mode setting.

  if( !InvalidOp )
  {
    //Decode the instruction.

    DecodeOpcode();

    //-------------------------------------------------------------------------------------------------------------------------
    //Intel Larrabee CCCCC condition codes.
    //-------------------------------------------------------------------------------------------------------------------------

    if( Opcode >= 0x700 && Instruction.slice(-1) === "," )
    {
      Instruction = Instruction.split(",");

      //CMP conditions.

      if( Opcode >= 0x720 && Opcode <= 0x72F )
      {
        IMMValue = VectorRegister >> 2;

        if( Float || ( IMMValue !== 3 && IMMValue !== 7 ) )
        {
          Instruction = Instruction[0] + ConditionCodes[IMMValue] + Instruction[1];
        }
        else { Instruction = Instruction[0] + Instruction[1]; }

        IMMValue = 0; VectorRegister &= 0x03;
      }

      //Else High/Low.

      else
      {
        Instruction = Instruction[0] + ( ( ( VectorRegister & 1 ) === 1 ) ? "H" : "L" ) + Instruction[1];
      }
    }

    //Setup the X86 Decoder for which operands the instruction uses.

    DecodeOperandString();

    //Now only some instructions can vector extend,and that is only if the instruction is an SIMD Vector format instruction.

    if( !Vect && Extension > 0 && Opcode <= 0x400 ) { InvalidOp = true; }

    //The Width Bit setting must match the vector numbers size otherwise this create an invalid operation code in MVEX/EVEX unless the Width bit is ignored.

    if( Vect && !IgnoresWidthbit && Extension >= 2 )
    {
      InvalidOp = ( ( SIMD & 1 ) !== ( WidthBit & 1 ) ); //Note use,and ignore width bit pastern EVEX.
    }
    if( Opcode >= 0x700 ) { WidthBit ^= IgnoresWidthbit; } //L1OM Width bit invert.
  }

  //If the instruction is invalid then set the instruction to "???"

  if( InvalidOp )
  {
    out = "???" //set the returned instruction to invalid
  }

  //Else finish decoding the valid instruction.

  else
  {
    //Decode each operand along the Decoder array in order,and deactivate them.

    DecodeOperands();

    /*-------------------------------------------------------------------------------------------------------------------------
    3DNow Instruction name is encoded by the next byte after the ModR/M,and Reg operands.
    -------------------------------------------------------------------------------------------------------------------------*/

    if( Opcode === 0x10F )
    {
      //Lookup operation code.

      Instruction = M3DNow[ BinCode[CodePos] ]; NextByte();

      //If Invalid instruction.

      if( Instruction === "" || Instruction == null )
      {
        Instruction = "???"; InsOperands = "";
      }
    }

    /*-------------------------------------------------------------------------------------------------------------------------
    Synthetic virtual machine operation codes.
    -------------------------------------------------------------------------------------------------------------------------*/

    else if( Instruction === "SSS" )
    {
      //The Next two bytes after the static opcode is the select synthetic virtual machine operation code.

      var Code1 = BinCode[CodePos]; NextByte();
      var Code2 = BinCode[CodePos]; NextByte();

      //No operations exist past 4 in value for both bytes that combine to the operation code.

      if( Code1 >= 5 || Code2 >= 5 ) { Instruction = "???"; }

      //Else calculate the operation code in the 5x5 map.

      else
      {
        Instruction = MSynthetic[ ( Code1 * 5 ) + Code2 ];

        //If Invalid instruction.

        if( Instruction === "" || Instruction == null )
        {
          Instruction = "???";
        }
      }
    }

    //32/16 bit instructions 9A,and EA use Segment,and offset with Immediate format.

    if( Opcode === 0x9A || Opcode === 0xEA )
    {
      var t = InsOperands.split(",");
      InsOperands = t[1] + ":" +t[0];
    }

    //**Depending on the operation different prefixes replace others for  HLE,or MPX,and branch prediction.
    //if REP prefix,and LOCK prefix are used together,and the current decoded operation allows HLE XRELEASE.

    if(PrefixG1 === Mnemonics[0xF3] && PrefixG2 === Mnemonics[0xF0] && XRelease)
    {
      PrefixG1 = "XRELEASE"; //Then change REP to XRELEASE.
    }

    //if REPNE prefix,and the current decoded operation allows HLE XACQUIRE.

    if(PrefixG1 === Mnemonics[0xF2] && PrefixG2 === Mnemonics[0xF0] && XAcquire)
    {
      PrefixG1 = "XACQUIRE"; //Then change REP to XACQUIRE
    }

    //Depending on the order that the Repeat prefix,and Lock prefix is used flip Prefix G1,and G2 if HLEFlipG1G2 it is true.

    if((PrefixG1 === "XRELEASE" || PrefixG1 === "XACQUIRE") && HLEFlipG1G2)
    {
      t = PrefixG1; PrefixG1 = PrefixG2; PrefixG2 = t;
    }

    //if HT is active then it is a jump instruction check and adjust for the HT,and HNT prefix.

    if(HT)
    {
      if (SegOverride === Mnemonics[0x2E])
      {
        PrefixG1 = "HNT";
      }
      else if (SegOverride === Mnemonics[0x3E])
      {
        PrefixG1 = "HT";
      }
    }

    //else if Prefix is REPNE switch it to BND if operation is a MPX instruction.

    if(PrefixG1 === Mnemonics[0xF2] && BND)
    {
      PrefixG1 = "BND";
    }

    //Before the Instruction is put together check the length of the instruction if it is longer than 15 bytes the instruction is undefined.

    if ( InstructionHex.length > 30 )
    {
      //Calculate how many bytes over.

      var Dif32 = ( ( InstructionHex.length - 30 ) >> 1 );

      //Limit the instruction hex output to 15 bytes.

      InstructionHex = InstructionHex.substring( 0,30 );

      //Calculate the Difference between the Disassembler current position.

      Dif32 = Pos32 - Dif32;

      //Convert Dif to unsignified numbers.

      if( Dif32 < 0 ) { Dif32 += 0x100000000; }

      //Convert to strings.

      for (var S32 = Dif32.toString(16) ; S32.length < 8; S32 = "0" + S32);
      for (var S64 = Pos64.toString(16) ; S64.length < 8; S64 = "0" + S64);

      //Go to the Calculated address right after the Instruction UD.

      GotoPosition( S64 + S32 );

      //Set prefixes,and operands to empty strings,and set Instruction to UD.

      PrefixG1 = "";PrefixG2 = ""; Instruction = "???"; InsOperands = "";
    }

    //Put the Instruction sequence together.

    out = PrefixG1 + " " + PrefixG2 + " " + Instruction + " " + InsOperands;

    //Remove any trailing spaces because of unused prefixes.

    out = out.replace(/^[ ]+|[ ]+$/g,'');

    //Add error suppression if used.

    if( Opcode >= 0x700 || RoundMode !== 0 )
    {
      out += RoundModes[ RoundMode ];
    }

    //Return the instruction.
  }

  return( out );
}

我的第一个问题是，他们是怎么知道该这样做的？我在 Intel 手册中没有看到任何用于实现反汇编器的算法。Intel 手册似乎也没有提供可以直接用来实现各种表格等代码的数据。所以在我看来，你必须先做大量工作，把 Intel 手册提炼出本质、弄清它的各个方面，然后才能开始考虑实现反汇编器。有没有更容易的方法？我想了解如何实现反汇编器，但首先想找到一个关于指令和操作数等内容的、干净好用的机器可读（computer readable）数据来源，来帮助完成这个过程。这正是我对 xed 及其数据文件感兴趣的原因。但我觉得 xed 行不通，它看起来太乱了。

不过，主要问题是：实现指令反汇编（把机器代码转换为其他任何形式——某种数据结构，甚至只是输出 token 之类）的伪代码过程（如果是 JavaScript 过程就更好了）是什么样的？上面的 JavaScript 是否足够好，还是有更直接的实现？知道它用伪代码写出来是什么样子，可能会让事情更容易理解。

我问这个问题主要有两个原因。首先，我想实现一个反汇编器和一个生成器。其次，我不知道如何高效地解析字节（以及字节中的位），同时又不必到处来回扫描才能确定长度为 1–15 字节的指令的边界。我还想不明白如何轻松判断一条指令的起止位置，因此用伪代码演示这一过程可以让它变得直观，也更容易思考如何编写真正的解析器。

解析 x86 机器代码指令的过程

如何解决解析 x86 机器代码指令的过程

相关推荐