Commit 3c275ce1 by John Porto

Subzero. x8664. Resurrects the Target.

After a hiatus while x32 was not available in nacl's llvm, x8664 is being revived. Rejoice! The Target is now back to where it was before: the crosstests pass, and SPEC2k builds and verifies. Makefile.standalone still has the crosstests for x8664 disabled while we wait for all the plumbing that's needed for x32 support on nacl's toolchain to be available.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4077
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1537703002 .
parent e1d6a80f
...@@ -402,7 +402,7 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime ...@@ -402,7 +402,7 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
-i x8632,sandbox,sse4.1,Om1 \ -i x8632,sandbox,sse4.1,Om1 \
-e x8664,native,sse2 \ -e x8664,native,sse2 \
-e x8664,native,sse4.1,test_vector_ops \ -e x8664,native,sse4.1,test_vector_ops \
-e x8664,native,sse2,test_global \ -e x8664,sandbox,sse4.1,Om1 \
-i arm32,neon \ -i arm32,neon \
-e arm32,neon,test_vector_ops \ -e arm32,neon,test_vector_ops \
-e arm32,neon,test_select -e arm32,neon,test_select
...@@ -426,6 +426,11 @@ ifeq ($(TARGET),x8632) ...@@ -426,6 +426,11 @@ ifeq ($(TARGET),x8632)
SETUP=SetupGccX8632Opt SETUP=SetupGccX8632Opt
SPEC := -O2 --filetype=obj SPEC := -O2 --filetype=obj
endif endif
ifeq ($(TARGET),x8664)
TARGETFLAG=x8664
SETUP=SetupGccX8664Opt
SPEC := -O2 --filetype=obj
endif
ifeq ($(TARGET),arm32) ifeq ($(TARGET),arm32)
TARGETFLAG=arm32 TARGETFLAG=arm32
SETUP=SetupGccArmOpt SETUP=SetupGccArmOpt
......
...@@ -135,6 +135,7 @@ def main(): ...@@ -135,6 +135,7 @@ def main():
asm_sz = os.path.join(args.dir, base_sz + '.sz.s') asm_sz = os.path.join(args.dir, base_sz + '.sz.s')
obj_sz = os.path.join(args.dir, base_sz + '.sz.o') obj_sz = os.path.join(args.dir, base_sz + '.sz.o')
obj_llc = os.path.join(args.dir, base_sz + '.llc.o') obj_llc = os.path.join(args.dir, base_sz + '.llc.o')
shellcmd(['{path}/pnacl-sz'.format(path=os.path.dirname(mypath)), shellcmd(['{path}/pnacl-sz'.format(path=os.path.dirname(mypath)),
] + args.sz_args + [ ] + args.sz_args + [
'-O' + args.optlevel, '-O' + args.optlevel,
...@@ -153,6 +154,7 @@ def main(): ...@@ -153,6 +154,7 @@ def main():
'-filetype=obj', '-filetype=obj',
'-o=' + obj_sz, '-o=' + obj_sz,
asm_sz]) asm_sz])
# Each separately translated Subzero object file contains its own # Each separately translated Subzero object file contains its own
# definition of the __Sz_block_profile_info profiling symbol. Avoid # definition of the __Sz_block_profile_info profiling symbol. Avoid
# linker errors (multiply defined symbol) by making all copies weak. # linker errors (multiply defined symbol) by making all copies weak.
...@@ -182,10 +184,8 @@ def main(): ...@@ -182,10 +184,8 @@ def main():
).format(root=nacl_root, sb='sb' if args.sandbox else 'native')) ).format(root=nacl_root, sb='sb' if args.sandbox else 'native'))
pure_c = os.path.splitext(args.driver)[1] == '.c' pure_c = os.path.splitext(args.driver)[1] == '.c'
# TargetX8664 is ilp32, but pnacl-clang does not currently support such # TODO(jpp): clean up stack hack related code.
# configuration. In order to run the crosstests we play nasty, dangerous needs_stack_hack = False
# tricks with the stack pointer.
needs_stack_hack = (args.target == 'x8664')
target_params = [] target_params = []
if needs_stack_hack: if needs_stack_hack:
shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c ' shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c '
......
...@@ -89,7 +89,7 @@ def AddOptionalArgs(argparser): ...@@ -89,7 +89,7 @@ def AddOptionalArgs(argparser):
dest='enable_block_profile', action='store_true', dest='enable_block_profile', action='store_true',
help='Enable basic block profiling.') help='Enable basic block profiling.')
argparser.add_argument('--target', default='x8632', dest='target', argparser.add_argument('--target', default='x8632', dest='target',
choices=['arm32', 'x8632'], choices=['arm32', 'x8632', 'x8664'],
help='Generate code for specified target.') help='Generate code for specified target.')
argparser.add_argument('--verbose', '-v', dest='verbose', argparser.add_argument('--verbose', '-v', dest='verbose',
action='store_true', action='store_true',
...@@ -190,6 +190,7 @@ def ProcessPexe(args, pexe, exe): ...@@ -190,6 +190,7 @@ def ProcessPexe(args, pexe, exe):
arch = { arch = {
'arm32': 'armv7' if args.sandbox else 'arm-nonsfi', 'arm32': 'armv7' if args.sandbox else 'arm-nonsfi',
'x8632': 'x86-32' if args.sandbox else 'x86-32-linux', 'x8632': 'x86-32' if args.sandbox else 'x86-32-linux',
'x8664': 'x86-64' if args.sandbox else 'x86-64-linux',
}[args.target] }[args.target]
# Only run pnacl-translate in hybrid mode. # Only run pnacl-translate in hybrid mode.
...@@ -240,6 +241,7 @@ def ProcessPexe(args, pexe, exe): ...@@ -240,6 +241,7 @@ def ProcessPexe(args, pexe, exe):
triple = { triple = {
'arm32': 'arm-nacl' if args.sandbox else 'arm', 'arm32': 'arm-nacl' if args.sandbox else 'arm',
'x8632': 'i686-nacl' if args.sandbox else 'i686', 'x8632': 'i686-nacl' if args.sandbox else 'i686',
'x8664': 'x86_64-nacl' if args.sandbox else 'x86_64-linux-gnux32',
}[args.target] }[args.target]
shellcmd(( shellcmd((
...@@ -295,10 +297,12 @@ def ProcessPexe(args, pexe, exe): ...@@ -295,10 +297,12 @@ def ProcessPexe(args, pexe, exe):
ld = { ld = {
'arm32': 'arm-linux-gnueabihf-ld', 'arm32': 'arm-linux-gnueabihf-ld',
'x8632': 'ld', 'x8632': 'ld',
'x8664': 'ld',
}[args.target] }[args.target]
emulation = { emulation = {
'arm32': 'armelf_linux_eabi', 'arm32': 'armelf_linux_eabi',
'x8632': 'elf_i386', 'x8632': 'elf_i386',
'x8664': 'elf32_x86_64',
}[args.target] }[args.target]
shellcmd(( shellcmd((
'{ld} -r -m {emulation} -o {partial} {sz} {llc}' '{ld} -r -m {emulation} -o {partial} {sz} {llc}'
...@@ -345,17 +349,21 @@ def ProcessPexe(args, pexe, exe): ...@@ -345,17 +349,21 @@ def ProcessPexe(args, pexe, exe):
linker = { linker = {
'arm32': '/usr/bin/arm-linux-gnueabihf-g++', 'arm32': '/usr/bin/arm-linux-gnueabihf-g++',
'x8632': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang' 'x8632': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
).format(root=nacl_root),
'x8664': ('{root}/../third_party/llvm-build/Release+Asserts/bin/clang'
).format(root=nacl_root) ).format(root=nacl_root)
}[args.target] }[args.target]
extra_linker_args = ' '.join({ extra_linker_args = ' '.join({
'arm32': ['-mcpu=cortex-a9'], 'arm32': ['-mcpu=cortex-a9'],
'x8632': ['-m32'] 'x8632': ['-m32'],
'x8664': ['-mx32']
}[args.target]) }[args.target])
lib_dir = { lib_dir = {
'arm32': 'arm-linux', 'arm32': 'arm-linux',
'x8632': 'x86-32-linux', 'x8632': 'x86-32-linux',
'x8664': 'x86-64-linux',
}[args.target] }[args.target]
shellcmd(( shellcmd((
......
...@@ -63,7 +63,8 @@ def main(): ...@@ -63,7 +63,8 @@ def main():
os.chdir('{root}/tests/spec2k'.format(root=FindBaseNaCl())) os.chdir('{root}/tests/spec2k'.format(root=FindBaseNaCl()))
setup = 'SetupGcc' + { setup = 'SetupGcc' + {
'arm32': 'Arm', 'arm32': 'Arm',
'x8632': 'X8632'}[args.target] + 'Opt' 'x8632': 'X8632',
'x8664': 'X8664'}[args.target] + 'Opt'
shellcmd(['./run_all.sh', shellcmd(['./run_all.sh',
'RunTimedBenchmarks', 'RunTimedBenchmarks',
setup, setup,
......
...@@ -29,9 +29,9 @@ X8632Target = TargetInfo(target='x8632', ...@@ -29,9 +29,9 @@ X8632Target = TargetInfo(target='x8632',
X8664Target = TargetInfo(target='x8664', X8664Target = TargetInfo(target='x8664',
compiler_arch='x8664', compiler_arch='x8664',
triple='x86_64-none-linux', triple='x86_64-none-linux-gnux32',
llc_flags=['-mcpu=x86-64'], llc_flags=['-mcpu=x86-64'],
ld_emu='elf_x86_64_nacl', ld_emu='elf32_x86_64_nacl',
cross_headers=[]) cross_headers=[])
ARM32Target = TargetInfo(target='arm32', ARM32Target = TargetInfo(target='arm32',
......
...@@ -918,6 +918,14 @@ private: ...@@ -918,6 +918,14 @@ private:
Label *getOrCreateLabel(SizeT Number, LabelVector &Labels); Label *getOrCreateLabel(SizeT Number, LabelVector &Labels);
/// Emits the 0x67 address-size override prefix byte when targeting x86-64.
/// On 32-bit targets this is a no-op: the prefix is only needed to force
/// 32-bit addressing in 64-bit mode (the x32 ILP32 model).
void emitAddrSizeOverridePrefix() {
  if (Traits::Is64Bit) {
    // 0x67 is the legacy address-size override prefix.
    static constexpr uint8_t AddrSizeOverridePrefix = 0x67;
    emitUint8(AddrSizeOverridePrefix);
  }
}
// The arith_int() methods factor out the commonality between the encodings // The arith_int() methods factor out the commonality between the encodings
// of add(), Or(), adc(), sbb(), And(), sub(), Xor(), and cmp(). The Tag // of add(), Or(), adc(), sbb(), And(), sub(), Xor(), and cmp(). The Tag
// parameter is statically asserted to be less than 8. // parameter is statically asserted to be less than 8.
...@@ -965,8 +973,17 @@ private: ...@@ -965,8 +973,17 @@ private:
std::is_same<typename std::decay<RegType>::type, std::is_same<typename std::decay<RegType>::type,
typename Traits::GPRRegister>::value; typename Traits::GPRRegister>::value;
// At this point in the assembler, we have encoded regs, so it is not
// possible to distinguish between the "new" low byte registers introduced
in x86-64 and the legacy [abcd]h registers. Because this is x86, we may still
// ah (div) in the assembler, so we whitelist it here.
//
// The "local" uint32_t Encoded_Reg_ah is needed because RegType is an enum
that is not necessarily the same type as
// Traits::RegisterSet::Encoded_Reg_ah.
constexpr uint32_t Encoded_Reg_ah = Traits::RegisterSet::Encoded_Reg_ah;
return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 && return IsGPR && (Reg & 0x04) != 0 && (Reg & 0x08) == 0 &&
isByteSizedType(Ty); isByteSizedType(Ty) && (Reg != Encoded_Reg_ah);
} }
// assembleAndEmitRex is used for determining which (if any) rex prefix // assembleAndEmitRex is used for determining which (if any) rex prefix
......
...@@ -89,19 +89,50 @@ MachineTraits<TargetX8664>::X86OperandMem::X86OperandMem(Cfg *Func, Type Ty, ...@@ -89,19 +89,50 @@ MachineTraits<TargetX8664>::X86OperandMem::X86OperandMem(Cfg *Func, Type Ty,
} }
} }
namespace {
/// Computes the displacement to substitute for a rematerializable base
/// variable: its stack offset, plus the fixed-alloca adjustment when the
/// variable lives in the frame register. Any register other than the frame
/// or stack register is a fatal error.
static int32_t getRematerializableOffset(Variable *Var,
                                         const Ice::TargetX8664 *Target) {
  const SizeT RegNum = static_cast<SizeT>(Var->getRegNum());
  int32_t Offset = Var->getStackOffset();
  if (RegNum == Target->getFrameReg()) {
    // Frame-based variables are offset past the fixed alloca area.
    Offset += Target->getFrameFixedAllocaOffset();
  } else if (RegNum != Target->getStackReg()) {
    llvm::report_fatal_error("Unexpected rematerializable register type");
  }
  return Offset;
}
} // end of anonymous namespace
void MachineTraits<TargetX8664>::X86OperandMem::emit(const Cfg *Func) const { void MachineTraits<TargetX8664>::X86OperandMem::emit(const Cfg *Func) const {
if (!BuildDefs::dump()) if (!BuildDefs::dump())
return; return;
const auto *Target = static_cast<const Ice::TargetX8664 *>(Func->getTarget());
// If the base is rematerializable, we need to replace it with the correct
// physical register (stack or base pointer), and update the Offset.
int32_t Disp = 0;
if (getBase() && getBase()->isRematerializable()) {
Disp += getRematerializableOffset(getBase(), Target);
}
// The index should never be rematerializable. But if we ever allow it, then
// we should make sure the rematerialization offset is shifted by the Shift
// value.
if (getIndex())
assert(!getIndex()->isRematerializable());
Ostream &Str = Func->getContext()->getStrEmit(); Ostream &Str = Func->getContext()->getStrEmit();
// Emit as Offset(Base,Index,1<<Shift). Offset is emitted without the leading // Emit as Offset(Base,Index,1<<Shift). Offset is emitted without the leading
// '$'. Omit the (Base,Index,1<<Shift) part if Base==nullptr. // '$'. Omit the (Base,Index,1<<Shift) part if Base==nullptr.
if (!Offset) { if (getOffset() == nullptr && Disp == 0) {
// No offset, emit nothing. // No offset, emit nothing.
} else if (getOffset() == nullptr && Disp != 0) {
Str << Disp;
} else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(Offset)) { } else if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(Offset)) {
if (Base == nullptr || CI->getValue()) if (Base == nullptr || CI->getValue() || Disp != 0)
// Emit a non-zero offset without a leading '$'. // Emit a non-zero offset without a leading '$'.
Str << CI->getValue(); Str << CI->getValue() + Disp;
} else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Offset)) { } else if (const auto *CR = llvm::dyn_cast<ConstantRelocatable>(Offset)) {
// TODO(sehr): ConstantRelocatable still needs updating for
// rematerializable base/index and Disp.
assert(Disp == 0);
CR->emitWithoutPrefix(Func->getTarget()); CR->emitWithoutPrefix(Func->getTarget());
} else { } else {
llvm_unreachable("Invalid offset type for x86 mem operand"); llvm_unreachable("Invalid offset type for x86 mem operand");
...@@ -127,6 +158,11 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func, ...@@ -127,6 +158,11 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func,
return; return;
bool Dumped = false; bool Dumped = false;
Str << "["; Str << "[";
int32_t Disp = 0;
const auto *Target = static_cast<const Ice::TargetX8664 *>(Func->getTarget());
if (getBase() && getBase()->isRematerializable()) {
Disp += getRematerializableOffset(getBase(), Target);
}
if (Base) { if (Base) {
if (Func) if (Func)
Base->dump(Func); Base->dump(Func);
...@@ -145,6 +181,12 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func, ...@@ -145,6 +181,12 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func,
Index->dump(Str); Index->dump(Str);
Dumped = true; Dumped = true;
} }
if (Disp) {
if (Disp > 0)
Str << "+";
Str << Disp;
Dumped = true;
}
// Pretty-print the Offset. // Pretty-print the Offset.
bool OffsetIsZero = false; bool OffsetIsZero = false;
bool OffsetIsNegative = false; bool OffsetIsNegative = false;
...@@ -172,23 +214,23 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func, ...@@ -172,23 +214,23 @@ void MachineTraits<TargetX8664>::X86OperandMem::dump(const Cfg *Func,
MachineTraits<TargetX8664>::Address MachineTraits<TargetX8664>::Address
MachineTraits<TargetX8664>::X86OperandMem::toAsmAddress( MachineTraits<TargetX8664>::X86OperandMem::toAsmAddress(
MachineTraits<TargetX8664>::Assembler *Asm, MachineTraits<TargetX8664>::Assembler *Asm,
const Ice::TargetLowering *Target) const { const Ice::TargetLowering *TargetLowering) const {
// TODO(sehr): handle rematerializable base/index. const auto *Target = static_cast<const Ice::TargetX8664 *>(TargetLowering);
(void)Target; int32_t Disp = 0;
if (getBase()) if (getBase() && getBase()->isRematerializable()) {
assert(!getBase()->isRematerializable()); Disp += getRematerializableOffset(getBase(), Target);
}
if (getIndex()) if (getIndex())
assert(!getIndex()->isRematerializable()); assert(!getIndex()->isRematerializable());
int32_t Disp = 0;
AssemblerFixup *Fixup = nullptr; AssemblerFixup *Fixup = nullptr;
// Determine the offset (is it relocatable?) // Determine the offset (is it relocatable?)
if (getOffset()) { if (getOffset() != nullptr) {
if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) { if (const auto *CI = llvm::dyn_cast<ConstantInteger32>(getOffset())) {
Disp = static_cast<int32_t>(CI->getValue()); Disp += static_cast<int32_t>(CI->getValue());
} else if (const auto CR = } else if (const auto CR =
llvm::dyn_cast<ConstantRelocatable>(getOffset())) { llvm::dyn_cast<ConstantRelocatable>(getOffset())) {
Disp = CR->getOffset() - 4; Disp += CR->getOffset();
Fixup = Asm->createFixup(PcRelFixup, CR); Fixup = Asm->createFixup(RelFixup, CR);
} else { } else {
llvm_unreachable("Unexpected offset type"); llvm_unreachable("Unexpected offset type");
} }
......
...@@ -990,10 +990,10 @@ public: ...@@ -990,10 +990,10 @@ public:
// mov cl, ecx ==> redundant // mov cl, ecx ==> redundant
// mov ch, ecx ==> not redundant due to different encodings // mov ch, ecx ==> not redundant due to different encodings
// mov ch, ebp ==> not redundant due to different base registers // mov ch, ebp ==> not redundant due to different base registers
// TODO(stichnot): Don't consider "mov eax, eax" to be redundant when // mov ecx, ecx ==> redundant, and dangerous in x86-64. i64 zexting
// used in 64-bit mode to clear the upper half of rax. // is handled by Inst86Zext.
int32_t SrcReg = SrcVar->getRegNum(); const int32_t SrcReg = SrcVar->getRegNum();
int32_t DestReg = this->Dest->getRegNum(); const int32_t DestReg = this->Dest->getRegNum();
return (InstX86Base<Machine>::Traits::getEncoding(SrcReg) == return (InstX86Base<Machine>::Traits::getEncoding(SrcReg) ==
InstX86Base<Machine>::Traits::getEncoding(DestReg)) && InstX86Base<Machine>::Traits::getEncoding(DestReg)) &&
(InstX86Base<Machine>::Traits::getBaseReg(SrcReg) == (InstX86Base<Machine>::Traits::getBaseReg(SrcReg) ==
...@@ -1197,6 +1197,9 @@ class InstX86Mov ...@@ -1197,6 +1197,9 @@ class InstX86Mov
: public InstX86BaseMovlike<Machine, InstX86Base<Machine>::Mov> { : public InstX86BaseMovlike<Machine, InstX86Base<Machine>::Mov> {
public: public:
static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) { static InstX86Mov *create(Cfg *Func, Variable *Dest, Operand *Source) {
assert(!isScalarIntegerType(Dest->getType()) ||
(typeWidthInBytes(Dest->getType()) <=
typeWidthInBytes(Source->getType())));
return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source); return new (Func->allocate<InstX86Mov>()) InstX86Mov(Func, Dest, Source);
} }
......
...@@ -198,8 +198,10 @@ InstX86Cmpxchg<Machine>::InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr, ...@@ -198,8 +198,10 @@ InstX86Cmpxchg<Machine>::InstX86Cmpxchg(Cfg *Func, Operand *DestOrAddr,
: InstX86BaseLockable<Machine>(Func, InstX86Base<Machine>::Cmpxchg, 3, : InstX86BaseLockable<Machine>(Func, InstX86Base<Machine>::Cmpxchg, 3,
llvm::dyn_cast<Variable>(DestOrAddr), llvm::dyn_cast<Variable>(DestOrAddr),
Locked) { Locked) {
assert(InstX86Base<Machine>::Traits::getBaseReg(Eax->getRegNum()) == constexpr uint16_t Encoded_rAX = 0;
InstX86Base<Machine>::Traits::RegisterSet::Reg_eax); (void)Encoded_rAX;
assert(InstX86Base<Machine>::Traits::getEncodedGPR(Eax->getRegNum()) ==
Encoded_rAX);
this->addSource(DestOrAddr); this->addSource(DestOrAddr);
this->addSource(Eax); this->addSource(Eax);
this->addSource(Desired); this->addSource(Desired);
...@@ -1302,7 +1304,9 @@ void InstX86Cbwdq<Machine>::emitIAS(const Cfg *Func) const { ...@@ -1302,7 +1304,9 @@ void InstX86Cbwdq<Machine>::emitIAS(const Cfg *Func) const {
Asm->cdq(); Asm->cdq();
break; break;
case IceType_i64: case IceType_i64:
assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx); assert(InstX86Base<Machine>::Traits::Is64Bit);
assert(SrcReg == InstX86Base<Machine>::Traits::getRaxOrDie());
assert(DestReg == InstX86Base<Machine>::Traits::getRdxOrDie());
Asm->cqo(); Asm->cqo();
break; break;
} }
...@@ -2261,49 +2265,58 @@ void InstX86Movd<Machine>::emitIAS(const Cfg *Func) const { ...@@ -2261,49 +2265,58 @@ void InstX86Movd<Machine>::emitIAS(const Cfg *Func) const {
Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>(); Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
assert(this->getSrcSize() == 1); assert(this->getSrcSize() == 1);
const Variable *Dest = this->getDest(); const Variable *Dest = this->getDest();
const auto *SrcVar = llvm::cast<Variable>(this->getSrc(0));
auto *Target = InstX86Base<Machine>::getTarget(Func); auto *Target = InstX86Base<Machine>::getTarget(Func);
// For insert/extract element (one of Src/Dest is an Xmm vector and the other // For insert/extract element (one of Src/Dest is an Xmm vector and the other
// is an int type). // is an int type).
if (SrcVar->getType() == IceType_i32 || if (const auto *SrcVar = llvm::dyn_cast<Variable>(this->getSrc(0))) {
(InstX86Base<Machine>::Traits::Is64Bit && if (SrcVar->getType() == IceType_i32 ||
SrcVar->getType() == IceType_i64)) { (InstX86Base<Machine>::Traits::Is64Bit &&
assert(isVectorType(Dest->getType()) || SrcVar->getType() == IceType_i64)) {
(isScalarFloatingType(Dest->getType()) && assert(isVectorType(Dest->getType()) ||
typeWidthInBytes(SrcVar->getType()) == (isScalarFloatingType(Dest->getType()) &&
typeWidthInBytes(Dest->getType()))); typeWidthInBytes(SrcVar->getType()) ==
assert(Dest->hasReg()); typeWidthInBytes(Dest->getType())));
typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister DestReg = assert(Dest->hasReg());
InstX86Base<Machine>::Traits::getEncodedXmm(Dest->getRegNum()); typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister DestReg =
if (SrcVar->hasReg()) { InstX86Base<Machine>::Traits::getEncodedXmm(Dest->getRegNum());
Asm->movd( if (SrcVar->hasReg()) {
SrcVar->getType(), DestReg, Asm->movd(
InstX86Base<Machine>::Traits::getEncodedGPR(SrcVar->getRegNum())); SrcVar->getType(), DestReg,
InstX86Base<Machine>::Traits::getEncodedGPR(SrcVar->getRegNum()));
} else {
typename InstX86Base<Machine>::Traits::Address StackAddr(
Target->stackVarToAsmOperand(SrcVar));
Asm->movd(SrcVar->getType(), DestReg, StackAddr);
}
} else { } else {
typename InstX86Base<Machine>::Traits::Address StackAddr( assert(isVectorType(SrcVar->getType()) ||
Target->stackVarToAsmOperand(SrcVar)); (isScalarFloatingType(SrcVar->getType()) &&
Asm->movd(SrcVar->getType(), DestReg, StackAddr); typeWidthInBytes(SrcVar->getType()) ==
typeWidthInBytes(Dest->getType())));
assert(SrcVar->hasReg());
assert(Dest->getType() == IceType_i32 ||
(InstX86Base<Machine>::Traits::Is64Bit &&
Dest->getType() == IceType_i64));
typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister SrcReg =
InstX86Base<Machine>::Traits::getEncodedXmm(SrcVar->getRegNum());
if (Dest->hasReg()) {
Asm->movd(Dest->getType(), InstX86Base<Machine>::Traits::getEncodedGPR(
Dest->getRegNum()),
SrcReg);
} else {
typename InstX86Base<Machine>::Traits::Address StackAddr(
Target->stackVarToAsmOperand(Dest));
Asm->movd(Dest->getType(), StackAddr, SrcReg);
}
} }
} else { } else {
assert(isVectorType(SrcVar->getType()) || assert(Dest->hasReg());
(isScalarFloatingType(SrcVar->getType()) && typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister DestReg =
typeWidthInBytes(SrcVar->getType()) == InstX86Base<Machine>::Traits::getEncodedXmm(Dest->getRegNum());
typeWidthInBytes(Dest->getType()))); auto *Mem =
assert(SrcVar->hasReg()); llvm::cast<typename InstX86Base<Machine>::Traits::X86OperandMem>(
assert(Dest->getType() == IceType_i32 || this->getSrc(0));
(InstX86Base<Machine>::Traits::Is64Bit && Asm->movd(Mem->getType(), DestReg, Mem->toAsmAddress(Asm, Target));
Dest->getType() == IceType_i64));
typename InstX86Base<Machine>::Traits::RegisterSet::XmmRegister SrcReg =
InstX86Base<Machine>::Traits::getEncodedXmm(SrcVar->getRegNum());
if (Dest->hasReg()) {
Asm->movd(Dest->getType(),
InstX86Base<Machine>::Traits::getEncodedGPR(Dest->getRegNum()),
SrcReg);
} else {
typename InstX86Base<Machine>::Traits::Address StackAddr(
Target->stackVarToAsmOperand(Dest));
Asm->movd(Dest->getType(), StackAddr, SrcReg);
}
} }
} }
......
...@@ -838,6 +838,7 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull, ...@@ -838,6 +838,7 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull,
Iter.Cur = Unhandled.back(); Iter.Cur = Unhandled.back();
Unhandled.pop_back(); Unhandled.pop_back();
dumpLiveRangeTrace("\nConsidering ", Iter.Cur); dumpLiveRangeTrace("\nConsidering ", Iter.Cur);
assert(Target->getRegistersForVariable(Iter.Cur).any());
Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur); Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur);
KillsRange.trim(Iter.Cur->getLiveRange().getStart()); KillsRange.trim(Iter.Cur->getLiveRange().getStart());
......
...@@ -68,6 +68,8 @@ template <> struct MachineTraits<TargetX8632> { ...@@ -68,6 +68,8 @@ template <> struct MachineTraits<TargetX8632> {
using Cond = ::Ice::CondX86; using Cond = ::Ice::CondX86;
using RegisterSet = ::Ice::RegX8632; using RegisterSet = ::Ice::RegX8632;
static const SizeT StackPtr = RegX8632::Reg_esp;
static const SizeT FramePtr = RegX8632::Reg_ebp;
static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax; static const GPRRegister Encoded_Reg_Accumulator = RegX8632::Encoded_Reg_eax;
static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx; static const GPRRegister Encoded_Reg_Counter = RegX8632::Encoded_Reg_ecx;
static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32; static const FixupKind PcRelFixup = llvm::ELF::R_386_PC32;
...@@ -577,6 +579,14 @@ template <> struct MachineTraits<TargetX8632> { ...@@ -577,6 +579,14 @@ template <> struct MachineTraits<TargetX8632> {
} }
} }
static int32_t getRaxOrDie() {
llvm::report_fatal_error("no rax in non-64-bit mode.");
}
static int32_t getRdxOrDie() {
llvm::report_fatal_error("no rdx in non-64-bit mode.");
}
/// The maximum number of arguments to pass in XMM registers /// The maximum number of arguments to pass in XMM registers
static const uint32_t X86_MAX_XMM_ARGS = 4; static const uint32_t X86_MAX_XMM_ARGS = 4;
/// The maximum number of arguments to pass in GPR registers /// The maximum number of arguments to pass in GPR registers
......
...@@ -193,7 +193,7 @@ void TargetX8664::lowerCall(const InstCall *Instr) { ...@@ -193,7 +193,7 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
ParameterAreaSizeBytes = ParameterAreaSizeBytes =
Traits::applyStackAlignment(ParameterAreaSizeBytes); Traits::applyStackAlignment(ParameterAreaSizeBytes);
} }
Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_rsp);
Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes); Constant *Loc = Ctx->getConstantInt32(ParameterAreaSizeBytes);
StackArgLocations.push_back( StackArgLocations.push_back(
Traits::X86OperandMem::create(Func, Ty, esp, Loc)); Traits::X86OperandMem::create(Func, Ty, esp, Loc));
...@@ -276,14 +276,6 @@ void TargetX8664::lowerCall(const InstCall *Instr) { ...@@ -276,14 +276,6 @@ void TargetX8664::lowerCall(const InstCall *Instr) {
llvm_unreachable("X86-64 Sandboxing codegen not implemented."); llvm_unreachable("X86-64 Sandboxing codegen not implemented.");
} }
// Add the appropriate offset to esp. The call instruction takes care of
// resetting the stack offset during emission.
if (ParameterAreaSizeBytes) {
Variable *Esp =
Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
_add(Esp, Ctx->getConstantInt32(ParameterAreaSizeBytes));
}
// Insert a register-kill pseudo instruction. // Insert a register-kill pseudo instruction.
Context.insert<InstFakeKill>(NewCall); Context.insert<InstFakeKill>(NewCall);
...@@ -465,12 +457,20 @@ void TargetX8664::addProlog(CfgNode *Node) { ...@@ -465,12 +457,20 @@ void TargetX8664::addProlog(CfgNode *Node) {
// Add push instructions for preserved registers. // Add push instructions for preserved registers.
uint32_t NumCallee = 0; uint32_t NumCallee = 0;
size_t PreservedRegsSizeBytes = 0; size_t PreservedRegsSizeBytes = 0;
llvm::SmallBitVector Pushed(CalleeSaves.size());
for (SizeT i = 0; i < CalleeSaves.size(); ++i) { for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
if (CalleeSaves[i] && RegsUsed[i]) { const int32_t Canonical = Traits::getBaseReg(i);
++NumCallee; assert(Canonical == Traits::getBaseReg(Canonical));
PreservedRegsSizeBytes += typeWidthInBytes(IceType_i64); if (CalleeSaves[i] && RegsUsed[i])
_push(getPhysicalRegister(i)); Pushed[Canonical] = true;
} }
for (SizeT i = 0; i < Pushed.size(); ++i) {
if (!Pushed[i])
continue;
assert(static_cast<int32_t>(i) == Traits::getBaseReg(i));
++NumCallee;
PreservedRegsSizeBytes += typeWidthInBytes(IceType_i64);
_push(getPhysicalRegister(i, IceType_i64));
} }
Ctx->statsUpdateRegistersSaved(NumCallee); Ctx->statsUpdateRegistersSaved(NumCallee);
...@@ -479,8 +479,8 @@ void TargetX8664::addProlog(CfgNode *Node) { ...@@ -479,8 +479,8 @@ void TargetX8664::addProlog(CfgNode *Node) {
assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None)) assert((RegsUsed & getRegisterSet(RegSet_FramePointer, RegSet_None))
.count() == 0); .count() == 0);
PreservedRegsSizeBytes += typeWidthInBytes(IceType_i64); PreservedRegsSizeBytes += typeWidthInBytes(IceType_i64);
Variable *ebp = getPhysicalRegister(Traits::RegisterSet::Reg_ebp); Variable *ebp = getPhysicalRegister(Traits::RegisterSet::Reg_rbp);
Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_rsp);
_push(ebp); _push(ebp);
_mov(ebp, esp); _mov(ebp, esp);
// Keep ebp live for late-stage liveness analysis (e.g. asm-verbose mode). // Keep ebp live for late-stage liveness analysis (e.g. asm-verbose mode).
...@@ -521,14 +521,14 @@ void TargetX8664::addProlog(CfgNode *Node) { ...@@ -521,14 +521,14 @@ void TargetX8664::addProlog(CfgNode *Node) {
SpillAreaSizeBytes += FixedAllocaSizeBytes; SpillAreaSizeBytes += FixedAllocaSizeBytes;
// Generate "sub esp, SpillAreaSizeBytes" // Generate "sub esp, SpillAreaSizeBytes"
if (SpillAreaSizeBytes) { if (SpillAreaSizeBytes) {
_sub(getPhysicalRegister(Traits::RegisterSet::Reg_esp), _sub(getPhysicalRegister(getStackReg(), IceType_i64),
Ctx->getConstantInt32(SpillAreaSizeBytes)); Ctx->getConstantInt32(SpillAreaSizeBytes));
// If the fixed allocas are aligned more than the stack frame, align the // If the fixed allocas are aligned more than the stack frame, align the
// stack pointer accordingly. // stack pointer accordingly.
if (PrologEmitsFixedAllocas && if (PrologEmitsFixedAllocas &&
FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) { FixedAllocaAlignBytes > Traits::X86_STACK_ALIGNMENT_BYTES) {
assert(IsEbpBasedFrame); assert(IsEbpBasedFrame);
_and(getPhysicalRegister(Traits::RegisterSet::Reg_esp), _and(getPhysicalRegister(Traits::RegisterSet::Reg_rsp),
Ctx->getConstantInt32(-FixedAllocaAlignBytes)); Ctx->getConstantInt32(-FixedAllocaAlignBytes));
} }
} }
...@@ -637,9 +637,9 @@ void TargetX8664::addEpilog(CfgNode *Node) { ...@@ -637,9 +637,9 @@ void TargetX8664::addEpilog(CfgNode *Node) {
Context.init(Node); Context.init(Node);
Context.setInsertPoint(InsertPoint); Context.setInsertPoint(InsertPoint);
Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_rsp);
if (IsEbpBasedFrame) { if (IsEbpBasedFrame) {
Variable *ebp = getPhysicalRegister(Traits::RegisterSet::Reg_ebp); Variable *ebp = getPhysicalRegister(Traits::RegisterSet::Reg_rbp);
// For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake // For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
// use of esp before the assignment of esp=ebp keeps previous esp // use of esp before the assignment of esp=ebp keeps previous esp
// adjustments from being dead-code eliminated. // adjustments from being dead-code eliminated.
...@@ -655,13 +655,19 @@ void TargetX8664::addEpilog(CfgNode *Node) { ...@@ -655,13 +655,19 @@ void TargetX8664::addEpilog(CfgNode *Node) {
// Add pop instructions for preserved registers. // Add pop instructions for preserved registers.
llvm::SmallBitVector CalleeSaves = llvm::SmallBitVector CalleeSaves =
getRegisterSet(RegSet_CalleeSave, RegSet_None); getRegisterSet(RegSet_CalleeSave, RegSet_None);
for (SizeT i = 0; i < CalleeSaves.size(); ++i) { llvm::SmallBitVector Popped(CalleeSaves.size());
SizeT j = CalleeSaves.size() - i - 1; for (int32_t i = CalleeSaves.size() - 1; i >= 0; --i) {
if (j == Traits::RegisterSet::Reg_ebp && IsEbpBasedFrame) if (i == Traits::RegisterSet::Reg_rbp && IsEbpBasedFrame)
continue; continue;
if (CalleeSaves[j] && RegsUsed[j]) { const SizeT Canonical = Traits::getBaseReg(i);
_pop(getPhysicalRegister(j)); if (CalleeSaves[i] && RegsUsed[i])
} Popped[Canonical] = true;
}
for (int32_t i = Popped.size() - 1; i >= 0; --i) {
if (!Popped[i])
continue;
assert(i == Traits::getBaseReg(i));
_pop(getPhysicalRegister(i, IceType_i64));
} }
if (Ctx->getFlags().getUseSandboxing()) { if (Ctx->getFlags().getUseSandboxing()) {
......
...@@ -67,10 +67,12 @@ template <> struct MachineTraits<TargetX8664> { ...@@ -67,10 +67,12 @@ template <> struct MachineTraits<TargetX8664> {
using Cond = ::Ice::CondX8664; using Cond = ::Ice::CondX8664;
using RegisterSet = ::Ice::RegX8664; using RegisterSet = ::Ice::RegX8664;
static const SizeT StackPtr = RegX8664::Reg_rsp;
static const SizeT FramePtr = RegX8664::Reg_rbp;
static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax; static const GPRRegister Encoded_Reg_Accumulator = RegX8664::Encoded_Reg_eax;
static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx; static const GPRRegister Encoded_Reg_Counter = RegX8664::Encoded_Reg_ecx;
static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32; static const FixupKind PcRelFixup = llvm::ELF::R_X86_64_PC32;
static const FixupKind RelFixup = llvm::ELF::R_X86_64_32S; static const FixupKind RelFixup = llvm::ELF::R_X86_64_32;
class Operand { class Operand {
public: public:
...@@ -255,7 +257,10 @@ template <> struct MachineTraits<TargetX8664> { ...@@ -255,7 +257,10 @@ template <> struct MachineTraits<TargetX8664> {
/// Generate a RIP-relative address expression on x86-64. /// Generate a RIP-relative address expression on x86-64.
Address(RelocOffsetT Offset, AssemblerFixup *Fixup) { Address(RelocOffsetT Offset, AssemblerFixup *Fixup) {
SetModRM(0, RegX8664::Encoded_Reg_ebp); SetModRM(0x0, RegX8664::Encoded_Reg_esp);
static constexpr ScaleFactor Scale = TIMES_1;
SetSIB(Scale, RegX8664::Encoded_Reg_esp, RegX8664::Encoded_Reg_ebp);
// Use the Offset in the displacement for now. If we decide to process // Use the Offset in the displacement for now. If we decide to process
// fixups later, we'll need to patch up the emitted displacement. // fixups later, we'll need to patch up the emitted displacement.
SetDisp32(Offset); SetDisp32(Offset);
...@@ -561,6 +566,10 @@ template <> struct MachineTraits<TargetX8664> { ...@@ -561,6 +566,10 @@ template <> struct MachineTraits<TargetX8664> {
} }
} }
static int32_t getRaxOrDie() { return RegisterSet::Reg_rax; }
static int32_t getRdxOrDie() { return RegisterSet::Reg_rdx; }
/// The maximum number of arguments to pass in XMM registers /// The maximum number of arguments to pass in XMM registers
static const uint32_t X86_MAX_XMM_ARGS = 8; static const uint32_t X86_MAX_XMM_ARGS = 8;
/// The maximum number of arguments to pass in GPR registers /// The maximum number of arguments to pass in GPR registers
......
...@@ -88,8 +88,8 @@ public: ...@@ -88,8 +88,8 @@ public:
bool hasFramePointer() const override { return IsEbpBasedFrame; } bool hasFramePointer() const override { return IsEbpBasedFrame; }
void setHasFramePointer() override { IsEbpBasedFrame = true; } void setHasFramePointer() override { IsEbpBasedFrame = true; }
SizeT getStackReg() const override { return Traits::RegisterSet::Reg_esp; } SizeT getStackReg() const override { return Traits::StackPtr; }
SizeT getFrameReg() const override { return Traits::RegisterSet::Reg_ebp; } SizeT getFrameReg() const override { return Traits::FramePtr; }
SizeT getFrameOrStackReg() const override { SizeT getFrameOrStackReg() const override {
return IsEbpBasedFrame ? getFrameReg() : getStackReg(); return IsEbpBasedFrame ? getFrameReg() : getStackReg();
} }
......
...@@ -187,7 +187,7 @@ bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) { ...@@ -187,7 +187,7 @@ bool BoolFolding<MachineTraits>::hasComplexLowering(const Inst *Instr) {
default: default:
return false; return false;
case PK_Icmp64: case PK_Icmp64:
return true; return !MachineTraits::Is64Bit;
case PK_Fcmp: case PK_Fcmp:
return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()] return MachineTraits::TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()]
.C2 != MachineTraits::Cond::Br_None; .C2 != MachineTraits::Cond::Br_None;
...@@ -765,10 +765,6 @@ bool TargetX86Base<Machine>::doBranchOpt(Inst *I, const CfgNode *NextNode) { ...@@ -765,10 +765,6 @@ bool TargetX86Base<Machine>::doBranchOpt(Inst *I, const CfgNode *NextNode) {
template <class Machine> template <class Machine>
Variable *TargetX86Base<Machine>::getPhysicalRegister(SizeT RegNum, Type Ty) { Variable *TargetX86Base<Machine>::getPhysicalRegister(SizeT RegNum, Type Ty) {
// Special case: never allow partial reads/writes to/from %rBP and %rSP.
if (RegNum == Traits::RegisterSet::Reg_esp ||
RegNum == Traits::RegisterSet::Reg_ebp)
Ty = Traits::WordType;
if (Ty == IceType_void) if (Ty == IceType_void)
Ty = IceType_i32; Ty = IceType_i32;
if (PhysicalRegisters[Ty].empty()) if (PhysicalRegisters[Ty].empty())
...@@ -998,7 +994,7 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) { ...@@ -998,7 +994,7 @@ void TargetX86Base<Machine>::lowerAlloca(const InstAlloca *Inst) {
if (UseFramePointer) if (UseFramePointer)
setHasFramePointer(); setHasFramePointer();
Variable *esp = getPhysicalRegister(Traits::RegisterSet::Reg_esp); Variable *esp = getPhysicalRegister(getStackReg());
if (OverAligned) { if (OverAligned) {
_and(esp, Ctx->getConstantInt32(-Alignment)); _and(esp, Ctx->getConstantInt32(-Alignment));
} }
...@@ -1713,13 +1709,17 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1713,13 +1709,17 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
// div and idiv are the few arithmetic operators that do not allow // div and idiv are the few arithmetic operators that do not allow
// immediates as the operand. // immediates as the operand.
Src1 = legalize(Src1, Legal_Reg | Legal_Mem); Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
uint32_t Eax = Traits::RegisterSet::Reg_eax; uint32_t Eax;
uint32_t Edx = Traits::RegisterSet::Reg_edx; uint32_t Edx;
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for udiv"); llvm::report_fatal_error("Bad type for udiv");
// fallthrough case IceType_i64:
Eax = Traits::getRaxOrDie();
Edx = Traits::getRdxOrDie();
case IceType_i32: case IceType_i32:
Eax = Traits::RegisterSet::Reg_eax;
Edx = Traits::RegisterSet::Reg_edx;
break; break;
case IceType_i16: case IceType_i16:
Eax = Traits::RegisterSet::Reg_ax; Eax = Traits::RegisterSet::Reg_ax;
...@@ -1773,8 +1773,11 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1773,8 +1773,11 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
Src1 = legalize(Src1, Legal_Reg | Legal_Mem); Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for sdiv"); llvm::report_fatal_error("Bad type for sdiv");
// fallthrough case IceType_i64:
T_edx = makeReg(Ty, Traits::getRdxOrDie());
_mov(T, Src0, Traits::getRaxOrDie());
break;
case IceType_i32: case IceType_i32:
T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx); T_edx = makeReg(Ty, Traits::RegisterSet::Reg_edx);
_mov(T, Src0, Traits::RegisterSet::Reg_eax); _mov(T, Src0, Traits::RegisterSet::Reg_eax);
...@@ -1794,13 +1797,18 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1794,13 +1797,18 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
break; break;
case InstArithmetic::Urem: { case InstArithmetic::Urem: {
Src1 = legalize(Src1, Legal_Reg | Legal_Mem); Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
uint32_t Eax = Traits::RegisterSet::Reg_eax; uint32_t Eax;
uint32_t Edx = Traits::RegisterSet::Reg_edx; uint32_t Edx;
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for urem"); llvm::report_fatal_error("Bad type for urem");
// fallthrough case IceType_i64:
Eax = Traits::getRaxOrDie();
Edx = Traits::getRdxOrDie();
break;
case IceType_i32: case IceType_i32:
Eax = Traits::RegisterSet::Reg_eax;
Edx = Traits::RegisterSet::Reg_edx;
break; break;
case IceType_i16: case IceType_i16:
Eax = Traits::RegisterSet::Reg_ax; Eax = Traits::RegisterSet::Reg_ax;
...@@ -1858,13 +1866,18 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1858,13 +1866,18 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
} }
} }
Src1 = legalize(Src1, Legal_Reg | Legal_Mem); Src1 = legalize(Src1, Legal_Reg | Legal_Mem);
uint32_t Eax = Traits::RegisterSet::Reg_eax; uint32_t Eax;
uint32_t Edx = Traits::RegisterSet::Reg_edx; uint32_t Edx;
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for srem"); llvm::report_fatal_error("Bad type for srem");
// fallthrough case IceType_i64:
Eax = Traits::getRaxOrDie();
Edx = Traits::getRdxOrDie();
break;
case IceType_i32: case IceType_i32:
Eax = Traits::RegisterSet::Reg_eax;
Edx = Traits::RegisterSet::Reg_edx;
break; break;
case IceType_i16: case IceType_i16:
Eax = Traits::RegisterSet::Reg_ax; Eax = Traits::RegisterSet::Reg_ax;
...@@ -3538,15 +3551,13 @@ void TargetX86Base<Machine>::lowerIntrinsicCall( ...@@ -3538,15 +3551,13 @@ void TargetX86Base<Machine>::lowerIntrinsicCall(
return; return;
} }
case Intrinsics::Stacksave: { case Intrinsics::Stacksave: {
Variable *esp = Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg());
Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
Variable *Dest = Instr->getDest(); Variable *Dest = Instr->getDest();
_mov(Dest, esp); _mov(Dest, esp);
return; return;
} }
case Intrinsics::Stackrestore: { case Intrinsics::Stackrestore: {
Variable *esp = Variable *esp = Func->getTarget()->getPhysicalRegister(getStackReg());
Func->getTarget()->getPhysicalRegister(Traits::RegisterSet::Reg_esp);
_redefined(_mov(esp, Instr->getArg(0))); _redefined(_mov(esp, Instr->getArg(0)));
return; return;
} }
...@@ -3588,8 +3599,10 @@ void TargetX86Base<Machine>::lowerAtomicCmpxchg(Variable *DestPrev, ...@@ -3588,8 +3599,10 @@ void TargetX86Base<Machine>::lowerAtomicCmpxchg(Variable *DestPrev,
int32_t Eax; int32_t Eax;
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for cmpxchg"); llvm::report_fatal_error("Bad type for cmpxchg");
// fallthrough case IceType_i64:
Eax = Traits::getRaxOrDie();
break;
case IceType_i32: case IceType_i32:
Eax = Traits::RegisterSet::Reg_eax; Eax = Traits::RegisterSet::Reg_eax;
break; break;
...@@ -3860,8 +3873,10 @@ void TargetX86Base<Machine>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo, ...@@ -3860,8 +3873,10 @@ void TargetX86Base<Machine>::expandAtomicRMWAsCmpxchg(LowerBinOp Op_Lo,
int32_t Eax; int32_t Eax;
switch (Ty) { switch (Ty) {
default: default:
llvm_unreachable("Bad type for atomicRMW"); llvm::report_fatal_error("Bad type for atomicRMW");
// fallthrough case IceType_i64:
Eax = Traits::getRaxOrDie();
break;
case IceType_i32: case IceType_i32:
Eax = Traits::RegisterSet::Reg_eax; Eax = Traits::RegisterSet::Reg_eax;
break; break;
...@@ -3930,31 +3945,32 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, ...@@ -3930,31 +3945,32 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
// Cttz, is similar, but uses bsf instead, and doesn't require the xor // Cttz, is similar, but uses bsf instead, and doesn't require the xor
// bit position conversion, and the speculation is reversed. // bit position conversion, and the speculation is reversed.
assert(Ty == IceType_i32 || Ty == IceType_i64); assert(Ty == IceType_i32 || Ty == IceType_i64);
Variable *T = makeReg(IceType_i32); const Type DestTy = Traits::Is64Bit ? Dest->getType() : IceType_i32;
Variable *T = makeReg(DestTy);
Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg); Operand *FirstValRM = legalize(FirstVal, Legal_Mem | Legal_Reg);
if (Cttz) { if (Cttz) {
_bsf(T, FirstValRM); _bsf(T, FirstValRM);
} else { } else {
_bsr(T, FirstValRM); _bsr(T, FirstValRM);
} }
Variable *T_Dest = makeReg(IceType_i32); Variable *T_Dest = makeReg(DestTy);
Constant *ThirtyTwo = Ctx->getConstantInt32(32); Constant *_31 = Ctx->getConstantInt32(31);
Constant *ThirtyOne = Ctx->getConstantInt32(31); Constant *_32 = Ctx->getConstantInt(DestTy, 32);
if (Cttz) { if (Cttz) {
_mov(T_Dest, ThirtyTwo); _mov(T_Dest, _32);
} else { } else {
Constant *SixtyThree = Ctx->getConstantInt32(63); Constant *_63 = Ctx->getConstantInt(DestTy, 63);
_mov(T_Dest, SixtyThree); _mov(T_Dest, _63);
} }
_cmov(T_Dest, T, Traits::Cond::Br_ne); _cmov(T_Dest, T, Traits::Cond::Br_ne);
if (!Cttz) { if (!Cttz) {
_xor(T_Dest, ThirtyOne); _xor(T_Dest, _31);
} }
if (Traits::Is64Bit || Ty == IceType_i32) { if (Traits::Is64Bit || Ty == IceType_i32) {
_mov(Dest, T_Dest); _mov(Dest, T_Dest);
return; return;
} }
_add(T_Dest, ThirtyTwo); _add(T_Dest, _32);
auto *DestLo = llvm::cast<Variable>(loOperand(Dest)); auto *DestLo = llvm::cast<Variable>(loOperand(Dest));
auto *DestHi = llvm::cast<Variable>(hiOperand(Dest)); auto *DestHi = llvm::cast<Variable>(hiOperand(Dest));
// Will be using "test" on this, so we need a registerized variable. // Will be using "test" on this, so we need a registerized variable.
...@@ -3964,7 +3980,7 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, ...@@ -3964,7 +3980,7 @@ void TargetX86Base<Machine>::lowerCountZeros(bool Cttz, Type Ty, Variable *Dest,
_bsf(T_Dest2, SecondVar); _bsf(T_Dest2, SecondVar);
} else { } else {
_bsr(T_Dest2, SecondVar); _bsr(T_Dest2, SecondVar);
_xor(T_Dest2, ThirtyOne); _xor(T_Dest2, _31);
} }
_test(SecondVar, SecondVar); _test(SecondVar, SecondVar);
_cmov(T_Dest2, T_Dest, Traits::Cond::Br_e); _cmov(T_Dest2, T_Dest, Traits::Cond::Br_e);
...@@ -4178,6 +4194,7 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val, ...@@ -4178,6 +4194,7 @@ void TargetX86Base<Machine>::lowerMemset(Operand *Dest, Operand *Val,
assert(VecReg != nullptr); assert(VecReg != nullptr);
_storeq(VecReg, Mem); _storeq(VecReg, Mem);
} else { } else {
assert(Ty != IceType_i64);
_store(Ctx->getConstantInt(Ty, SpreadValue), Mem); _store(Ctx->getConstantInt(Ty, SpreadValue), Mem);
} }
}; };
...@@ -6135,8 +6152,7 @@ Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) { ...@@ -6135,8 +6152,7 @@ Variable *TargetX86Base<Machine>::makeReg(Type Type, int32_t RegNum) {
template <class Machine> template <class Machine>
const Type TargetX86Base<Machine>::TypeForSize[] = { const Type TargetX86Base<Machine>::TypeForSize[] = {
IceType_i8, IceType_i16, IceType_i32, IceType_i8, IceType_i16, IceType_i32, IceType_f64, IceType_v16i8};
(Traits::Is64Bit ? IceType_i64 : IceType_f64), IceType_v16i8};
template <class Machine> template <class Machine>
Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size, Type TargetX86Base<Machine>::largestTypeInSize(uint32_t Size,
uint32_t MaxSize) { uint32_t MaxSize) {
......
...@@ -22,7 +22,7 @@ ...@@ -22,7 +22,7 @@
#define TARGETARCH_TABLE \ #define TARGETARCH_TABLE \
/* enum value, printable string, is_elf64, e_machine, e_flags */ \ /* enum value, printable string, is_elf64, e_machine, e_flags */ \
X(Target_X8632, "x86-32", false, EM_386, 0) \ X(Target_X8632, "x86-32", false, EM_386, 0) \
X(Target_X8664, "x86-64", true, EM_X86_64, 0) \ X(Target_X8664, "x86-64", false, EM_X86_64, 0) \
X(Target_ARM32, "arm32", false, EM_ARM, EF_ARM_EABI_VER5) \ X(Target_ARM32, "arm32", false, EM_ARM, EF_ARM_EABI_VER5) \
X(Target_ARM64, "arm64", true, EM_AARCH64, 0) \ X(Target_ARM64, "arm64", true, EM_AARCH64, 0) \
X(Target_MIPS32,"mips32", false, EM_MIPS, 0) \ X(Target_MIPS32,"mips32", false, EM_MIPS, 0) \
......
...@@ -160,6 +160,7 @@ TEST_F(AssemblerX8664Test, CallAddr) { ...@@ -160,6 +160,7 @@ TEST_F(AssemblerX8664Test, CallAddr) {
do { \ do { \
const uint32_t T0 = allocateQword(); \ const uint32_t T0 = allocateQword(); \
const uint64_t V0 = 0xA0C0FFEEBEEFFEEFull; \ const uint64_t V0 = 0xA0C0FFEEBEEFFEEFull; \
const uint32_t T1 = allocateDword(); \
__ call(Immediate(16)); \ __ call(Immediate(16)); \
int CallTargetAddr = codeBytesSize() + 12; \ int CallTargetAddr = codeBytesSize() + 12; \
__ mov(IceType_i8, Encoded_GPR_##Dst##l(), Immediate(0xf4)); \ __ mov(IceType_i8, Encoded_GPR_##Dst##l(), Immediate(0xf4)); \
...@@ -168,9 +169,9 @@ TEST_F(AssemblerX8664Test, CallAddr) { ...@@ -168,9 +169,9 @@ TEST_F(AssemblerX8664Test, CallAddr) {
__ hlt(); \ __ hlt(); \
} \ } \
__ mov(IceType_i64, Encoded_GPR_##Dst##q(), dwordAddress(T0)); \ __ mov(IceType_i64, Encoded_GPR_##Dst##q(), dwordAddress(T0)); \
__ mov(IceType_i64, Encoded_GPR_##Src##q(), Encoded_GPR_rsp()); \
__ call(Address(Encoded_GPR_##Src##q(), 0, AssemblerFixup::NoFixup)); \
__ popl(Encoded_GPR_##Src##q()); \ __ popl(Encoded_GPR_##Src##q()); \
__ mov(IceType_i32, dwordAddress(T1), Encoded_GPR_##Src##d()); \
__ call(dwordAddress(T1)); \
\ \
AssembledTest test = assemble(); \ AssembledTest test = assemble(); \
test.setQwordTo(T0, V0); \ test.setQwordTo(T0, V0); \
......
...@@ -329,15 +329,18 @@ TEST_F(AssemblerX8664LowLevelTest, LeaAbsolute) { ...@@ -329,15 +329,18 @@ TEST_F(AssemblerX8664LowLevelTest, LeaAbsolute) {
static constexpr char TestString[] = "(" #Dst ", " #Value ")"; \ static constexpr char TestString[] = "(" #Dst ", " #Value ")"; \
__ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst, \ __ lea(IceType_i32, GPRRegister::Encoded_Reg_##Dst, \
Address(Value, AssemblerFixup::NoFixup)); \ Address(Value, AssemblerFixup::NoFixup)); \
static constexpr uint32_t ByteCount = 6; \ static constexpr uint32_t ByteCount = 8; \
ASSERT_EQ(ByteCount, codeBytesSize()) << TestString; \ ASSERT_EQ(ByteCount, codeBytesSize()) << TestString; \
static constexpr uint8_t Opcode = 0x8D; \ static constexpr uint8_t Opcode = 0x8D; \
static constexpr uint8_t ModRM = \ static constexpr uint8_t ModRM = \
/*mod=*/0x00 | /*reg*/ (GPRRegister::Encoded_Reg_##Dst << 3) | \ /*mod*/ 0x00 | /*reg*/ (GPRRegister::Encoded_Reg_##Dst << 3) | \
/*rm*/ GPRRegister::Encoded_Reg_ebp; \ /*rm*/ GPRRegister::Encoded_Reg_esp; \
static constexpr uint8_t SIB = \
/*Scale*/ 0x00 | /*Index*/ (GPRRegister::Encoded_Reg_esp << 3) | \
/*base*/ GPRRegister::Encoded_Reg_ebp; \
ASSERT_TRUE(verifyBytes<ByteCount>( \ ASSERT_TRUE(verifyBytes<ByteCount>( \
codeBytes(), Opcode, ModRM, (Value)&0xFF, (Value >> 8) & 0xFF, \ codeBytes(), 0x67, Opcode, ModRM, SIB, (Value)&0xFF, \
(Value >> 16) & 0xFF, (Value >> 24) & 0xFF)); \ (Value >> 8) & 0xFF, (Value >> 16) & 0xFF, (Value >> 24) & 0xFF)); \
reset(); \ reset(); \
} while (0) } while (0)
......
...@@ -206,55 +206,60 @@ TEST_F(AssemblerX8664LowLevelTest, Xadd) { ...@@ -206,55 +206,60 @@ TEST_F(AssemblerX8664LowLevelTest, Xadd) {
{ {
__ xadd(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked8 = 8; static constexpr uint8_t ByteCountNotLocked8 = 10;
ASSERT_EQ(ByteCountNotLocked8, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked8, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x44, 0x0F, 0xC0, ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x67, 0x44, 0x0F,
0x35, 0x00, 0xFF, 0x01, 0x00)); 0xC0, 0x34, 0x25, 0x00, 0xFF,
0x01, 0x00));
reset(); reset();
__ xadd(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8; static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8;
ASSERT_EQ(ByteCountLocked8, codeBytesSize()); ASSERT_EQ(ByteCountLocked8, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked8>( ASSERT_TRUE(verifyBytes<ByteCountLocked8>(codeBytes(), 0xF0, 0x67, 0x44,
codeBytes(), 0xF0, 0x44, 0x0F, 0xC0, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xC0, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
} }
{ {
__ xadd(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked16 = 9; static constexpr uint8_t ByteCountNotLocked16 = 11;
ASSERT_EQ(ByteCountNotLocked16, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked16, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>( ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>(codeBytes(), 0x66, 0x67, 0x44,
codeBytes(), 0x66, 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xC1, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
__ xadd(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16; static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16;
ASSERT_EQ(ByteCountLocked16, codeBytesSize()); ASSERT_EQ(ByteCountLocked16, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x44, ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x67,
0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x44, 0x0F, 0xC1, 0x34, 0x25,
0x01, 0x00)); 0x00, 0xFF, 0x01, 0x00));
reset(); reset();
} }
{ {
__ xadd(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked32 = 8; static constexpr uint8_t ByteCountNotLocked32 = 10;
ASSERT_EQ(ByteCountNotLocked32, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked32, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>( ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>(codeBytes(), 0x67, 0x44, 0x0F,
codeBytes(), 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0xC1, 0x34, 0x25, 0x00, 0xFF,
0x01, 0x00));
reset(); reset();
__ xadd(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup), __ xadd(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32; static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32;
ASSERT_EQ(ByteCountLocked32, codeBytesSize()); ASSERT_EQ(ByteCountLocked32, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked32>( ASSERT_TRUE(verifyBytes<ByteCountLocked32>(codeBytes(), 0xF0, 0x67, 0x44,
codeBytes(), 0xF0, 0x44, 0x0F, 0xC1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xC1, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
} }
} }
...@@ -333,17 +338,17 @@ TEST_F(AssemblerX8664LowLevelTest, Cmpxchg8b) { ...@@ -333,17 +338,17 @@ TEST_F(AssemblerX8664LowLevelTest, Cmpxchg8b) {
// Ensures that cmpxchg8b emits a lock prefix accordingly. // Ensures that cmpxchg8b emits a lock prefix accordingly.
__ cmpxchg8b(Address(0x1FF00, AssemblerFixup::NoFixup), NotLocked); __ cmpxchg8b(Address(0x1FF00, AssemblerFixup::NoFixup), NotLocked);
static constexpr uint8_t ByteCountNotLocked = 7; static constexpr uint8_t ByteCountNotLocked = 9;
ASSERT_EQ(ByteCountNotLocked, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked>(codeBytes(), 0x0F, 0xC7, 0x0D, ASSERT_TRUE(verifyBytes<ByteCountNotLocked>(
0x00, 0xFF, 0x01, 0x00)); codeBytes(), 0x67, 0x0F, 0xC7, 0x0C, 0x25, 0x00, 0xFF, 0x01, 0x00));
reset(); reset();
__ cmpxchg8b(Address(0x1FF00, AssemblerFixup::NoFixup), Locked); __ cmpxchg8b(Address(0x1FF00, AssemblerFixup::NoFixup), Locked);
static constexpr uint8_t ByteCountLocked = 1 + ByteCountNotLocked; static constexpr uint8_t ByteCountLocked = 1 + ByteCountNotLocked;
ASSERT_EQ(ByteCountLocked, codeBytesSize()); ASSERT_EQ(ByteCountLocked, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked>(codeBytes(), 0xF0, 0x0F, 0xC7, 0x0D, ASSERT_TRUE(verifyBytes<ByteCountLocked>(codeBytes(), 0xF0, 0x67, 0x0F, 0xC7,
0x00, 0xFF, 0x01, 0x00)); 0x0C, 0x25, 0x00, 0xFF, 0x01, 0x00));
reset(); reset();
} }
...@@ -441,55 +446,60 @@ TEST_F(AssemblerX8664LowLevelTest, Cmpxchg) { ...@@ -441,55 +446,60 @@ TEST_F(AssemblerX8664LowLevelTest, Cmpxchg) {
{ {
__ cmpxchg(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked8 = 8; static constexpr uint8_t ByteCountNotLocked8 = 10;
ASSERT_EQ(ByteCountNotLocked8, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked8, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x44, 0x0F, 0xB0, ASSERT_TRUE(verifyBytes<ByteCountNotLocked8>(codeBytes(), 0x67, 0x44, 0x0F,
0x35, 0x00, 0xFF, 0x01, 0x00)); 0xB0, 0x34, 0x25, 0x00, 0xFF,
0x01, 0x00));
reset(); reset();
__ cmpxchg(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i8, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8; static constexpr uint8_t ByteCountLocked8 = 1 + ByteCountNotLocked8;
ASSERT_EQ(ByteCountLocked8, codeBytesSize()); ASSERT_EQ(ByteCountLocked8, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked8>( ASSERT_TRUE(verifyBytes<ByteCountLocked8>(codeBytes(), 0xF0, 0x67, 0x44,
codeBytes(), 0xF0, 0x44, 0x0F, 0xB0, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xB0, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
} }
{ {
__ cmpxchg(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked16 = 9; static constexpr uint8_t ByteCountNotLocked16 = 11;
ASSERT_EQ(ByteCountNotLocked16, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked16, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>( ASSERT_TRUE(verifyBytes<ByteCountNotLocked16>(codeBytes(), 0x66, 0x67, 0x44,
codeBytes(), 0x66, 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xB1, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
__ cmpxchg(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i16, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16; static constexpr uint8_t ByteCountLocked16 = 1 + ByteCountNotLocked16;
ASSERT_EQ(ByteCountLocked16, codeBytesSize()); ASSERT_EQ(ByteCountLocked16, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x44, ASSERT_TRUE(verifyBytes<ByteCountLocked16>(codeBytes(), 0x66, 0xF0, 0x67,
0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x44, 0x0F, 0xB1, 0x34, 0x25,
0x01, 0x00)); 0x00, 0xFF, 0x01, 0x00));
reset(); reset();
} }
{ {
__ cmpxchg(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), NotLocked); Encoded_GPR_r14(), NotLocked);
static constexpr uint8_t ByteCountNotLocked32 = 8; static constexpr uint8_t ByteCountNotLocked32 = 10;
ASSERT_EQ(ByteCountNotLocked32, codeBytesSize()); ASSERT_EQ(ByteCountNotLocked32, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>( ASSERT_TRUE(verifyBytes<ByteCountNotLocked32>(codeBytes(), 0x67, 0x44, 0x0F,
codeBytes(), 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0xB1, 0x34, 0x25, 0x00, 0xFF,
0x01, 0x00));
reset(); reset();
__ cmpxchg(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup), __ cmpxchg(IceType_i32, Address(0x1FF00, AssemblerFixup::NoFixup),
Encoded_GPR_r14(), Locked); Encoded_GPR_r14(), Locked);
static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32; static constexpr uint8_t ByteCountLocked32 = 1 + ByteCountNotLocked32;
ASSERT_EQ(ByteCountLocked32, codeBytesSize()); ASSERT_EQ(ByteCountLocked32, codeBytesSize());
ASSERT_TRUE(verifyBytes<ByteCountLocked32>( ASSERT_TRUE(verifyBytes<ByteCountLocked32>(codeBytes(), 0xF0, 0x67, 0x44,
codeBytes(), 0xF0, 0x44, 0x0F, 0xB1, 0x35, 0x00, 0xFF, 0x01, 0x00)); 0x0F, 0xB1, 0x34, 0x25, 0x00,
0xFF, 0x01, 0x00));
reset(); reset();
} }
} }
......
...@@ -701,7 +701,7 @@ protected: ...@@ -701,7 +701,7 @@ protected:
EXPECT_LT(MySize, MaxCodeSize); EXPECT_LT(MySize, MaxCodeSize);
assert(MySize < MaximumCodeSize); assert(MySize < MaximumCodeSize);
ExecutableData = mmap(nullptr, Size, PROT_WRITE | PROT_READ | PROT_EXEC, ExecutableData = mmap(nullptr, Size, PROT_WRITE | PROT_READ | PROT_EXEC,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT, -1, 0);
EXPECT_NE(MAP_FAILED, ExecutableData) << strerror(errno); EXPECT_NE(MAP_FAILED, ExecutableData) << strerror(errno);
assert(MAP_FAILED != ExecutableData); assert(MAP_FAILED != ExecutableData);
std::memcpy(ExecutableData, Data, MySize); std::memcpy(ExecutableData, Data, MySize);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment