Subzero: Refactor x86 register representation to actively use aliases.

Sets up additional register attributes, plus the notion of register classes, to enable robust usage of the high 8-bit GPRs (ah/bh/ch/dh), for both x86-32 and x86-64. (Note that the x86-64 changes are currently untested.) We add a Register Class field to the Variable class. The default register class is a value corresponding to the variable's type, but the target can extend the set of register class values, and the target lowering can assign different register classes as needed. The register allocator uses the register class instead of the type to determine the set of registers to draw from. For x86-64, the high 8-bit registers are not included in the general register allocation pool, but there are explicit references to ah for lowering the div/rem instructions. The target lowering is modified as needed to make sure types are appropriate and register use in instructions is legalized. Some other fixes and cleanups are included in this CL: * Makefile.standalone changes. Source files are reordered so that the more expensive compiles are done earlier, speeding up parallel builds by decreasing fragmentation. A dependency error is fixed for check-spec. * A bug is fixed in advanced phi lowering. When a temporary is introduced to break a cycle, we were neglecting to update the predecessor count for one of the operands, leading to an assertion failure. (Applying that fix to master resulted in no changes to spec2k code generation.) A consistency check is added to help find future problems like this. Also, refactored iteration over the Phi descriptor array to use range-based for loops and avoid directly indexing the array. * Removed most of the "IceType_" prefixes in x-macro tables for brevity. * Fix a correctness TODO in the register allocator. This had no effect on spec2k code generation in master or in this CL, so we were probably just lucky. * Made some much-needed s/Dest->getType()/Ty/ changes for brevity, in the target lowering sections that needed other changes. 
BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4095 R=jpp@chromium.org Review URL: https://codereview.chromium.org/1427973003 .

Subzero: Refactor x86 register representation to actively use aliases.
c59288b3 · Jim Stichnoth · ea15bbe7 · c59288b3 · c59288b3 · c59288b3
Commit c59288b3 authored Nov 09, 2015 by Jim Stichnoth
28 changed files
--- a/Makefile.standalone
+++ b/Makefile.standalone
@@ -196,9 +196,20 @@ LDFLAGS := $(HOST_FLAGS) -L$(LIBCXX_INSTALL_PATH)/lib -Wl,--gc-sections \
 # Not specifying -Wl,--gc-sections but instead doing bitcode linking GC w/ LTO.
 SB_LDFLAGS := $(LINKOPTLEVEL) $(LD_EXTRA)

+# List the target-specific source files first, which generally take longer to
+# compile, in the hope of improving parallel build time.
 SRCS = \
-  IceAssembler.cpp \
  IceAssemblerARM32.cpp \
+  IceInstARM32.cpp \
+  IceInstMIPS32.cpp \
+  IceInstX8632.cpp \
+  IceInstX8664.cpp \
+  IceTargetLowering.cpp \
+  IceTargetLoweringARM32.cpp \
+  IceTargetLoweringMIPS32.cpp \
+  IceTargetLoweringX8632.cpp \
+  IceTargetLoweringX8664.cpp \
+  IceAssembler.cpp \
  IceBrowserCompileServer.cpp \
  IceCfg.cpp \
  IceCfgNode.cpp \
@@ -211,10 +222,6 @@ SRCS = \
  IceGlobalContext.cpp \
  IceGlobalInits.cpp \
  IceInst.cpp \
-  IceInstARM32.cpp \
-  IceInstMIPS32.cpp \
-  IceInstX8632.cpp \
-  IceInstX8664.cpp \
  IceIntrinsics.cpp \
  IceLiveness.cpp \
  IceLoopAnalyzer.cpp \
@@ -222,11 +229,6 @@ SRCS = \
  IceRegAlloc.cpp \
  IceRNG.cpp \
  IceSwitchLowering.cpp \
-  IceTargetLowering.cpp \
-  IceTargetLoweringARM32.cpp \
-  IceTargetLoweringMIPS32.cpp \
-  IceTargetLoweringX8632.cpp \
-  IceTargetLoweringX8664.cpp \
  IceThreading.cpp \
  IceTimerTree.cpp \
  IceTranslator.cpp \
@@ -397,9 +399,11 @@ endif
 check-unit: $(OBJDIR)/run_unittests
 	$(OBJDIR)/run_unittests

-ALLSPEC := 177.mesa 179.art 183.equake 188.ammp 164.gzip 175.vpr 176.gcc \
-           181.mcf 186.crafty 197.parser 253.perlbmk 254.gap 255.vortex \
-           256.bzip2 300.twolf 252.eon
+# List the spec2k components in roughly reverse order of runtime, to help with
+# parallel execution speed.
+ALLSPEC := 253.perlbmk 177.mesa 188.ammp 256.bzip2 164.gzip 179.art 183.equake \
+           175.vpr 176.gcc 181.mcf 186.crafty 197.parser 254.gap 255.vortex \
+           300.twolf 252.eon
 .PHONY: $(ALLSPEC)

 TARGET := x8632
@@ -414,12 +418,12 @@ ifeq ($(TARGET),arm32)
  SPEC := -O2 --filetype=asm
 endif

-%.spec2k: %
+%.spec2k: % $(OBJDIR)/pnacl-sz make_symlink runtime
 	./pydir/szbuild_spec2k.py -v --force --target=$(TARGETFLAG) $(SPEC) $<
 	( cd ../../../tests/spec2k; \
 	 ./run_all.sh RunTimedBenchmarks $(SETUP) train $< )

-check-spec: $(OBJDIR)/pnacl-sz make_symlink $(ALLSPEC:=.spec2k)
+check-spec: $(ALLSPEC:=.spec2k)

 check: check-lit check-unit check-xtest


--- a/src/IceCfg.cpp
+++ b/src/IceCfg.cpp
@@ -457,19 +457,17 @@ void Cfg::sortAllocas(CfgVector<Inst *> &Allocas, InstList &Insts,
    return;
  // Sort by decreasing alignment.  This does not really matter at the moment,
  // but will allow compacting stack allocation when we fuse to one alloca.
-  std::sort(Allocas.begin(), Allocas.end(),
-            [](Inst *I1, Inst *I2) {
-              auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
-              auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
-              return A1->getAlignInBytes() > A2->getAlignInBytes();
-            });
-  for (Inst *Instr: Allocas) {
+  std::sort(Allocas.begin(), Allocas.end(), [](Inst *I1, Inst *I2) {
+    auto *A1 = llvm::dyn_cast<InstAlloca>(I1);
+    auto *A2 = llvm::dyn_cast<InstAlloca>(I2);
+    return A1->getAlignInBytes() > A2->getAlignInBytes();
+  });
+  for (Inst *Instr : Allocas) {
    auto *Alloca = llvm::cast<InstAlloca>(Instr);
    // Move the alloca to its sorted position.
-    InstAlloca *NewAlloca = InstAlloca::create(this,
-                                               Alloca->getSizeInBytes(),
-                                               Alloca->getAlignInBytes(),
-                                               Alloca->getDest());
+    InstAlloca *NewAlloca =
+        InstAlloca::create(this, Alloca->getSizeInBytes(),
+                           Alloca->getAlignInBytes(), Alloca->getDest());
    if (IsKnownFrameOffset)
      NewAlloca->setKnownFrameOffset();
    Insts.push_front(NewAlloca);
@@ -506,8 +504,7 @@ void Cfg::processAllocas() {
        // Allocations aligned more than the stack require a frame pointer.
        RequiresFramePointer = true;
        AlignedAllocas.push_back(Alloca);
-      }
-      else
+      } else
        FixedAllocas.push_back(Alloca);
    }
  }

--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -304,6 +304,7 @@ class PhiDesc {
  PhiDesc() = delete;
  PhiDesc(const PhiDesc &) = delete;
  PhiDesc &operator=(const PhiDesc &) = delete;
+
 public:
  PhiDesc(InstPhi *Phi, Variable *Dest) : Phi(Phi), Dest(Dest) {}
  PhiDesc(PhiDesc &&) = default;
@@ -457,7 +458,7 @@ void CfgNode::advancedPhiLowering() {
        if (Item2.Processed)
          continue;
        // There shouldn't be two different Phis with the same Dest variable or
-          // register.
+        // register.
        assert((&Item == &Item2) || !sameVarOrReg(Target, Dest, Item2.Dest));
        if (sameVarOrReg(Target, Dest, Item2.Src))
          ++Item.NumPred;

--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -55,13 +55,13 @@
  X(Reg_bl, 3, "bl", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,1,1,                \
    REGLIST2(RegX8632, ebx, bx))                                               \
  /* High 8-bit registers */                                                   \
-  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ah, 4, "ah", Reg_eax, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
    REGLIST2(RegX8632, eax, ax))                                               \
-  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_ch, 5, "ch", Reg_ecx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
    REGLIST2(RegX8632, ecx, cx))                                               \
-  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_dh, 6, "dh", Reg_edx, 1,0,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
    REGLIST2(RegX8632, edx, dx))                                               \
-  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,0, 0, 0,0,0,0,1,                \
+  X(Reg_bh, 7, "bh", Reg_ebx, 0,1,0,0, 1,0,0,0,1, 0, 0,0,0,0,1,                \
    REGLIST2(RegX8632, ebx, bx))                                               \
  /* End of 8-bit register set */
 //#define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,
@@ -212,22 +212,22 @@
 //#define X(val, emit)

 #define ICETYPEX8632_TABLE                                                     \
-  /* tag, element type, cvt , sdss, pack, width, fld */                        \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)

 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
--- a/src/IceInstX8664.def
+++ b/src/IceInstX8664.def
@@ -292,22 +292,22 @@
 //#define X(val, emit)

 #define ICETYPEX8664_TABLE                                                     \
-  /* tag         , element type, cvt , sdss, pack, width, fld */               \
-  X(IceType_void,  IceType_void, "?",  "",   "",  "",  "")                     \
-  X(IceType_i1,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i8,    IceType_void, "si", "",   "",  "b", "")                     \
-  X(IceType_i16,   IceType_void, "si", "",   "",  "w", "")                     \
-  X(IceType_i32,   IceType_void, "si", "",   "",  "l", "")                     \
-  X(IceType_i64,   IceType_void, "si", "",   "",  "q", "")                     \
-  X(IceType_f32,   IceType_void, "ss", "ss", "d", "",  "s")                    \
-  X(IceType_f64,   IceType_void, "sd", "sd", "q", "",  "l")                    \
-  X(IceType_v4i1,  IceType_i32,  "?",  "",   "d", "",  "")                     \
-  X(IceType_v8i1,  IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v16i1, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v16i8, IceType_i8,   "?",  "",   "b", "",  "")                     \
-  X(IceType_v8i16, IceType_i16,  "?",  "",   "w", "",  "")                     \
-  X(IceType_v4i32, IceType_i32,  "dq", "",   "d", "",  "")                     \
-  X(IceType_v4f32, IceType_f32,  "ps", "",   "d", "",  "")
+  /* tag,  element type, cvt , sdss, pack, width, fld */                       \
+  X(void,  void,         "?",  "",   "",   "",    "")                          \
+  X(i1,    void,         "si", "",   "",   "b",   "")                          \
+  X(i8,    void,         "si", "",   "",   "b",   "")                          \
+  X(i16,   void,         "si", "",   "",   "w",   "")                          \
+  X(i32,   void,         "si", "",   "",   "l",   "")                          \
+  X(i64,   void,         "si", "",   "",   "q",   "")                          \
+  X(f32,   void,         "ss", "ss", "d",  "",    "s")                         \
+  X(f64,   void,         "sd", "sd", "q",  "",    "l")                         \
+  X(v4i1,  i32,          "?",  "",   "d",  "",    "")                          \
+  X(v8i1,  i16,          "?",  "",   "w",  "",    "")                          \
+  X(v16i1, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v16i8, i8,           "?",  "",   "b",  "",    "")                          \
+  X(v8i16, i16,          "?",  "",   "w",  "",    "")                          \
+  X(v4i32, i32,          "dq", "",   "d",  "",    "")                          \
+  X(v4f32, f32,          "ps", "",   "d",  "",    "")
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)

 #endif // SUBZERO_SRC_ICEINSTX8664_DEF
--- a/src/IceInstX86BaseImpl.h
+++ b/src/IceInstX86BaseImpl.h
@@ -1384,38 +1384,35 @@ void InstX86Cbwdq<Machine>::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(this->getSrcSize() == 1);
  Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
  switch (Src0->getType()) {
  default:
    llvm_unreachable("unexpected source type!");
    break;
  case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
    Str << "\t"
        << "cbtw";
    break;
  case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
    Str << "\t"
        << "cwtd";
    break;
  case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
    Str << "\t"
        << "cltd";
    break;
  case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
    Str << "\t"
        << "cdto";
    break;
@@ -1428,35 +1425,32 @@ void InstX86Cbwdq<Machine>::emitIAS(const Cfg *Func) const {
      Func->getAssembler<typename InstX86Base<Machine>::Traits::Assembler>();
  assert(this->getSrcSize() == 1);
  Operand *Src0 = this->getSrc(0);
-  assert(llvm::isa<Variable>(Src0));
+  int32_t DestReg = this->getDest()->getRegNum();
+  int32_t SrcReg = llvm::cast<Variable>(Src0)->getRegNum();
+  (void)DestReg;
+  (void)SrcReg;
  switch (Src0->getType()) {
  default:
    llvm_unreachable("unexpected source type!");
    break;
  case IceType_i8:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_al);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax ||
+           DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ah);
    Asm->cbw();
    break;
  case IceType_i16:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_ax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_dx);
    Asm->cwd();
    break;
  case IceType_i32:
-    assert(llvm::cast<Variable>(Src0)->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(SrcReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_eax);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
    Asm->cdq();
    break;
  case IceType_i64:
-    assert(this->getDest()->getRegNum() ==
-           InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
+    assert(DestReg == InstX86Base<Machine>::Traits::RegisterSet::Reg_edx);
    Asm->cqo();
    break;
  }
@@ -2278,32 +2272,29 @@ template <class Machine> void InstX86Mov<Machine>::emit(const Cfg *Func) const {
  } else {
    Str << "\tmov"
        << (!isScalarFloatingType(DestTy)
-                ? this->getWidthString(SrcTy)
+                ? this->getWidthString(DestTy)
                : InstX86Base<Machine>::Traits::TypeAttributes[DestTy]
                      .SdSsString) << "\t";
  }
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
  // TODO: This assert disallows usages such as copying a floating
  // point value between a vector and a scalar (which movss is used for). Clean
  // this up.
  assert(Func->getTarget()->typeWidthInBytesOnStack(DestTy) ==
         Func->getTarget()->typeWidthInBytesOnStack(SrcTy));
-  Src->emit(Func);
+  const Operand *NewSrc = Src;
+  if (auto *SrcVar = llvm::dyn_cast<Variable>(Src)) {
+    int32_t NewRegNum = Variable::NoRegister;
+    if (SrcVar->hasReg())
+      NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
+          DestTy, SrcVar->getRegNum());
+    if (SrcTy != DestTy)
+      NewSrc = SrcVar->asType(DestTy, NewRegNum);
+  }
+  NewSrc->emit(Func);
  Str << ", ";
-  int32_t NewRegNum = Variable::NoRegister;
-  if (this->getDest()->hasReg())
-    NewRegNum = InstX86Base<Machine>::Traits::getGprForType(
-        SrcTy, this->getDest()->getRegNum());
-  const Variable *NewDest = SrcTy == DestTy
-                                ? this->getDest()
-                                : this->getDest()->asType(SrcTy, NewRegNum);
-  NewDest->emit(Func);
+  this->getDest()->emit(Func);
 }

 template <class Machine>
@@ -2330,13 +2321,8 @@ void InstX86Mov<Machine>::emitIAS(const Cfg *Func) const {
      Machine>::Traits::Assembler::GPREmitterAddrOp GPRAddrEmitter = {
      &InstX86Base<Machine>::Traits::Assembler::mov,
      &InstX86Base<Machine>::Traits::Assembler::mov};
-  // For an integer truncation operation, src is wider than dest. Ideally, we
-  // use a mov instruction whose data width matches the narrower dest. This is
-  // a problem if e.g. src is a register like esi or si where there is no 8-bit
-  // version of the register. To be safe, we instead widen the dest to match
-  // src. This works even for stack-allocated dest variables because
-  // typeWidthOnStack() pads to a 4-byte boundary even if only a lower portion
-  // is used.
+  // For an integer truncation operation, src is wider than dest. In this case,
+  // we use a mov instruction whose data width matches the narrower dest.
  // TODO: This assert disallows usages such as copying a floating
  // point value between a vector and a scalar (which movss is used for). Clean
  // this up.
@@ -2366,7 +2352,7 @@ void InstX86Mov<Machine>::emitIAS(const Cfg *Func) const {
        return;
      }
      if (isScalarIntegerType(SrcTy)) {
-        DestTy = SrcTy;
+        SrcTy = DestTy;
      }
      emitIASRegOpTyGPR<Machine>(Func, DestTy, Dest, Src, GPRRegEmitter);
      return;

--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -428,6 +428,23 @@ private:

 Ostream &operator<<(Ostream &Str, const LiveRange &L);

+/// RegClass indicates the physical register class that a Variable may be
+/// register-allocated from.  By default, a variable's register class is
+/// directly associated with its type.  However, the target lowering may define
+/// additional target-specific register classes by extending the set of enum
+/// values.
+enum RegClass : uint8_t {
+// Define RC_void, RC_i1, RC_i8, etc.
+#define X(tag, sizeLog2, align, elts, elty, str) RC_##tag = IceType_##tag,
+  ICETYPE_TABLE
+#undef X
+      RC_Target,
+  // Leave plenty of space for target-specific values.
+  RC_Max = std::numeric_limits<uint8_t>::max()
+};
+static_assert(RC_Target == static_cast<RegClass>(IceType_NUM),
+              "Expected RC_Target and IceType_NUM to be the same");
+
 /// Variable represents an operand that is register-allocated or
 /// stack-allocated. If it is register-allocated, it will ultimately have a
 /// non-negative RegNum field.
@@ -493,6 +510,9 @@ public:
    return RegRequirement == RR_MustNotHaveRegister;
  }

+  void setRegClass(uint8_t RC) { RegisterClass = static_cast<RegClass>(RC); }
+  RegClass getRegClass() const { return RegisterClass; }
+
  LiveRange &getLiveRange() { return Live; }
  const LiveRange &getLiveRange() const { return Live; }
  void setLiveRange(const LiveRange &Range) { Live = Range; }
@@ -537,7 +557,8 @@ public:

 protected:
  Variable(OperandKind K, Type Ty, SizeT Index)
-      : Operand(K, Ty), Number(Index) {
+      : Operand(K, Ty), Number(Index),
+        RegisterClass(static_cast<RegClass>(Ty)) {
    Vars = VarsReal;
    Vars[0] = this;
    NumVars = 1;
@@ -553,6 +574,7 @@ protected:
  /// pointer and other physical registers specifically referenced by name.
  bool IgnoreLiveness = false;
  RegRequirement RegRequirement = RR_MayHaveRegister;
+  RegClass RegisterClass;
  /// RegNum is the allocated register, or NoRegister if it isn't
  /// register-allocated.
  int32_t RegNum = NoRegister;

--- a/src/IceRegAlloc.cpp
+++ b/src/IceRegAlloc.cpp
@@ -833,8 +833,7 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull,
    Iter.Cur = Unhandled.back();
    Unhandled.pop_back();
    dumpLiveRangeTrace("\nConsidering  ", Iter.Cur);
-    Iter.RegMask =
-        RegMaskFull & Target->getRegisterSetForType(Iter.Cur->getType());
+    Iter.RegMask = RegMaskFull & Target->getRegistersForVariable(Iter.Cur);
    KillsRange.trim(Iter.Cur->getLiveRange().getStart());

    // Check for pre-colored ranges. If Cur is pre-colored, it definitely gets
@@ -862,11 +861,10 @@ void LinearScan::scan(const llvm::SmallBitVector &RegMaskFull,
    // Disable AllowOverlap if an Active variable, which is not Prefer, shares
    // Prefer's register, and has a definition within Cur's live range.
    if (Iter.AllowOverlap) {
+      const llvm::SmallBitVector &Aliases = *RegAliases[Iter.PreferReg];
      for (const Variable *Item : Active) {
        int32_t RegNum = Item->getRegNumTmp();
-        // TODO(stichnot): Consider aliases of RegNum.  This is probably a
-        // correctness issue.
-        if (Item != Iter.Prefer && RegNum == Iter.PreferReg &&
+        if (Item != Iter.Prefer && Aliases[RegNum] &&
            overlapsDefs(Func, Iter.Cur, Item)) {
          Iter.AllowOverlap = false;
          dumpDisableOverlap(Func, Item, "Active");

--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -17,6 +17,7 @@

 #include "IceDefs.h"
 #include "IceInstARM32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"

 namespace Ice {
@@ -118,6 +119,9 @@ public:
  static const char *RegNames[];
 };

+// Extend enum RegClass with ARM32-specific register classes (if any).
+enum RegClassARM32 : uint8_t { RCARM32_NUM = RC_Target };
+
 } // end of namespace Ice

 #endif // SUBZERO_SRC_ICEREGISTERSARM32_H
--- a/src/IceRegistersMIPS32.h
+++ b/src/IceRegistersMIPS32.h
@@ -17,6 +17,7 @@

 #include "IceDefs.h"
 #include "IceInstMIPS32.def"
+#include "IceOperand.h" // RC_Target
 #include "IceTypes.h"

 namespace Ice {
@@ -59,6 +60,9 @@ static inline GPRRegister getEncodedGPR(int32_t RegNum) {

 } // end of namespace RegMIPS32

+// Extend enum RegClass with MIPS32-specific register classes (if any).
+enum RegClassMIPS32 : uint8_t { RCMIPS32_NUM = RC_Target };
+
 } // end of namespace Ice

 #endif // SUBZERO_SRC_ICEREGISTERSMIPS32_H
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -234,7 +234,8 @@ public:

  virtual llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                              RegSetMask Exclude) const = 0;
-  virtual const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const = 0;
+  virtual const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const = 0;
  virtual const llvm::SmallBitVector &getAliasesForRegister(SizeT) const = 0;

  void regAlloc(RegAllocKind Kind);

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -74,8 +74,11 @@ public:
  IceString getRegName(SizeT RegNum, Type Ty) const override;
  llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                      RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSet[RC];
  }
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
    return RegisterAliases[Reg];
@@ -554,7 +557,7 @@ protected:
  bool MaybeLeafFunc = true;
  size_t SpillAreaSizeBytes = 0;
  // TODO(jpp): std::array instead of array.
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCARM32_NUM];
  static llvm::SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
  static llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;

--- a/src/IceTargetLoweringMIPS32.h
+++ b/src/IceTargetLoweringMIPS32.h
@@ -42,8 +42,11 @@ public:
  IceString getRegName(SizeT RegNum, Type Ty) const override;
  llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                      RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass();
+    assert(RC < RC_Target);
+    return TypeToRegisterSet[RC];
  }
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
    return RegisterAliases[Reg];
@@ -231,7 +234,7 @@ protected:

  bool UsesFramePointer = false;
  bool NeedsStackAlignment = false;
-  static llvm::SmallBitVector TypeToRegisterSet[IceType_NUM];
+  static llvm::SmallBitVector TypeToRegisterSet[RCMIPS32_NUM];
  static llvm::SmallBitVector RegisterAliases[RegMIPS32::Reg_NUM];
  static llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -74,7 +74,7 @@ const size_t MachineTraits<TargetX8632>::TableIcmp64Size =
 const MachineTraits<TargetX8632>::TableTypeX8632AttributesType
    MachineTraits<TargetX8632>::TableTypeX8632Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
  ,
        ICETYPEX8632_TABLE
 #undef X
@@ -87,7 +87,7 @@ const uint32_t MachineTraits<TargetX8632>::X86_STACK_ALIGNMENT_BYTES = 16;
 const char *MachineTraits<TargetX8632>::TargetName = "X8632";

 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
    TargetX86Base<TargetX8632>::TypeToRegisterSet = {};

 template <>
@@ -957,7 +957,7 @@ enum _tmp_enum {
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the

--- a/src/IceTargetLoweringX8632Traits.h
+++ b/src/IceTargetLoweringX8632Traits.h
@@ -22,8 +22,9 @@
 #include "IceInstX8632.def"
 #include "IceOperand.h"
 #include "IceRegistersX8632.h"
-#include "IceTargetLoweringX8632.def"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX8632.def"
+#include "IceTargetLoweringX86RegClass.h"

 #include <array>

@@ -398,7 +399,7 @@ template <> struct MachineTraits<TargetX8632> {
  }

  static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
      std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
      llvm::SmallBitVector *ScratchRegs) {
    llvm::SmallBitVector IntegerRegistersI32(RegisterSet::Reg_NUM);
@@ -406,6 +407,11 @@ template <> struct MachineTraits<TargetX8632> {
    llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
    llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
    llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
    llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
    ScratchRegs->resize(RegisterSet::Reg_NUM);
 #define X(val, encode, name, base, scratch, preserved, stackptr, frameptr,     \
@@ -416,6 +422,11 @@ template <> struct MachineTraits<TargetX8632> {
  (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
  (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
  (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
  (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
  for (SizeT RegAlias : aliases) {                                             \
    assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -427,21 +438,26 @@ template <> struct MachineTraits<TargetX8632> {
    REGX8632_TABLE;
 #undef X

-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
  }

  static llvm::SmallBitVector
@@ -512,7 +528,12 @@ template <> struct MachineTraits<TargetX8632> {
    Index |= (is8 << (AttrKey++));                                             \
    Index |= (is16 << (AttrKey++));                                            \
    Index |= (is32 << (AttrKey++));                                            \
+    Index |= (is64 << (AttrKey++));                                            \
    Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
    /* val is assigned to an equivalence class based on its properties. */     \
    EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
  }

--- a/src/IceTargetLoweringX8664.cpp
+++ b/src/IceTargetLoweringX8664.cpp
@@ -74,7 +74,7 @@ const size_t MachineTraits<TargetX8664>::TableIcmp64Size =
 const MachineTraits<TargetX8664>::TableTypeX8664AttributesType
    MachineTraits<TargetX8664>::TableTypeX8664Attributes[] = {
 #define X(tag, elementty, cvt, sdss, pack, width, fld)                         \
-  { elementty }                                                                \
+  { IceType_##elementty }                                                      \
  ,
        ICETYPEX8664_TABLE
 #undef X
@@ -87,7 +87,7 @@ const uint32_t MachineTraits<TargetX8664>::X86_STACK_ALIGNMENT_BYTES = 16;
 const char *MachineTraits<TargetX8664>::TargetName = "X8664";

 template <>
-std::array<llvm::SmallBitVector, IceType_NUM>
+std::array<llvm::SmallBitVector, RCX86_NUM>
    TargetX86Base<TargetX8664>::TypeToRegisterSet = {};

 template <>
@@ -955,7 +955,7 @@ enum _tmp_enum {
 };
 // Define a set of constants based on high-level table entries.
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  static const int _table1_##tag = tag;
+  static const int _table1_##tag = IceType_##tag;
 ICETYPE_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the

--- a/src/IceTargetLoweringX8664Traits.h
+++ b/src/IceTargetLoweringX8664Traits.h
@@ -24,6 +24,7 @@
 #include "IceRegistersX8664.h"
 #include "IceTargetLowering.h"
 #include "IceTargetLoweringX8664.def"
+#include "IceTargetLoweringX86RegClass.h"

 #include <array>

@@ -379,7 +380,7 @@ template <> struct MachineTraits<TargetX8664> {
  static int32_t getGprForType(Type, int32_t RegNum) { return RegNum; }

  static void initRegisterSet(
-      std::array<llvm::SmallBitVector, IceType_NUM> *TypeToRegisterSet,
+      std::array<llvm::SmallBitVector, RCX86_NUM> *TypeToRegisterSet,
      std::array<llvm::SmallBitVector, RegisterSet::Reg_NUM> *RegisterAliases,
      llvm::SmallBitVector *ScratchRegs) {
    llvm::SmallBitVector IntegerRegistersI64(RegisterSet::Reg_NUM);
@@ -388,6 +389,11 @@ template <> struct MachineTraits<TargetX8664> {
    llvm::SmallBitVector IntegerRegistersI8(RegisterSet::Reg_NUM);
    llvm::SmallBitVector FloatRegisters(RegisterSet::Reg_NUM);
    llvm::SmallBitVector VectorRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc64To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc32To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc16To8Registers(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector Trunc8RcvrRegisters(RegisterSet::Reg_NUM);
+    llvm::SmallBitVector AhRcvrRegisters(RegisterSet::Reg_NUM);
    llvm::SmallBitVector InvalidRegisters(RegisterSet::Reg_NUM);
    ScratchRegs->resize(RegisterSet::Reg_NUM);

@@ -400,6 +406,11 @@ template <> struct MachineTraits<TargetX8664> {
  (IntegerRegistersI8)[RegisterSet::val] = is8;                                \
  (FloatRegisters)[RegisterSet::val] = isXmm;                                  \
  (VectorRegisters)[RegisterSet::val] = isXmm;                                 \
+  (Trunc64To8Registers)[RegisterSet::val] = is64To8;                           \
+  (Trunc32To8Registers)[RegisterSet::val] = is32To8;                           \
+  (Trunc16To8Registers)[RegisterSet::val] = is16To8;                           \
+  (Trunc8RcvrRegisters)[RegisterSet::val] = isTrunc8Rcvr;                      \
+  (AhRcvrRegisters)[RegisterSet::val] = isAhRcvr;                              \
  (*RegisterAliases)[RegisterSet::val].resize(RegisterSet::Reg_NUM);           \
  for (SizeT RegAlias : aliases) {                                             \
    assert(!(*RegisterAliases)[RegisterSet::val][RegAlias] &&                  \
@@ -411,21 +422,26 @@ template <> struct MachineTraits<TargetX8664> {
    REGX8664_TABLE;
 #undef X

-    (*TypeToRegisterSet)[IceType_void] = InvalidRegisters;
-    (*TypeToRegisterSet)[IceType_i1] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i8] = IntegerRegistersI8;
-    (*TypeToRegisterSet)[IceType_i16] = IntegerRegistersI16;
-    (*TypeToRegisterSet)[IceType_i32] = IntegerRegistersI32;
-    (*TypeToRegisterSet)[IceType_i64] = IntegerRegistersI64;
-    (*TypeToRegisterSet)[IceType_f32] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_f64] = FloatRegisters;
-    (*TypeToRegisterSet)[IceType_v4i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i1] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v16i8] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v8i16] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4i32] = VectorRegisters;
-    (*TypeToRegisterSet)[IceType_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_void] = InvalidRegisters;
+    (*TypeToRegisterSet)[RC_i1] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i8] = IntegerRegistersI8;
+    (*TypeToRegisterSet)[RC_i16] = IntegerRegistersI16;
+    (*TypeToRegisterSet)[RC_i32] = IntegerRegistersI32;
+    (*TypeToRegisterSet)[RC_i64] = IntegerRegistersI64;
+    (*TypeToRegisterSet)[RC_f32] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_f64] = FloatRegisters;
+    (*TypeToRegisterSet)[RC_v4i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i1] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v16i8] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v8i16] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4i32] = VectorRegisters;
+    (*TypeToRegisterSet)[RC_v4f32] = VectorRegisters;
+    (*TypeToRegisterSet)[RCX86_Is64To8] = Trunc64To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is32To8] = Trunc32To8Registers;
+    (*TypeToRegisterSet)[RCX86_Is16To8] = Trunc16To8Registers;
+    (*TypeToRegisterSet)[RCX86_IsTrunc8Rcvr] = Trunc8RcvrRegisters;
+    (*TypeToRegisterSet)[RCX86_IsAhRcvr] = AhRcvrRegisters;
  }

  static llvm::SmallBitVector
@@ -498,6 +514,10 @@ template <> struct MachineTraits<TargetX8664> {
    Index |= (is32 << (AttrKey++));                                            \
    Index |= (is64 << (AttrKey++));                                            \
    Index |= (isXmm << (AttrKey++));                                           \
+    Index |= (is16To8 << (AttrKey++));                                         \
+    Index |= (is32To8 << (AttrKey++));                                         \
+    Index |= (is64To8 << (AttrKey++));                                         \
+    Index |= (isTrunc8Rcvr << (AttrKey++));                                    \
    /* val is assigned to an equivalence class based on its properties. */     \
    EquivalenceClasses[Index].push_back(RegisterSet::val);                     \
  }

--- a/src/IceTargetLoweringX86Base.h
+++ b/src/IceTargetLoweringX86Base.h
@@ -20,6 +20,7 @@
 #include "IceInst.h"
 #include "IceSwitchLowering.h"
 #include "IceTargetLowering.h"
+#include "IceTargetLoweringX86RegClass.h"
 #include "IceUtils.h"

 #include <array>
@@ -73,8 +74,11 @@ public:
  IceString getRegName(SizeT RegNum, Type Ty) const override;
  llvm::SmallBitVector getRegisterSet(RegSetMask Include,
                                      RegSetMask Exclude) const override;
-  const llvm::SmallBitVector &getRegisterSetForType(Type Ty) const override {
-    return TypeToRegisterSet[Ty];
+  const llvm::SmallBitVector &
+  getRegistersForVariable(const Variable *Var) const override {
+    RegClass RC = Var->getRegClass(); // generic or x86-specific class
+    assert(static_cast<RegClassX86>(RC) < RCX86_NUM); // RC indexes the table
+    return TypeToRegisterSet[RC]; // register set recorded for this class
+  }

  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
@@ -263,6 +267,7 @@ protected:
  static Type firstTypeThatFitsSize(uint32_t Size,
                                    uint32_t MaxSize = NoSizeLimit);

+  Variable *copyToReg8(Operand *Src, int32_t RegNum = Variable::NoRegister);
  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);

  /// \name Returns a vector in a register with the given constant entries.
@@ -674,7 +679,7 @@ protected:
  bool NeedsStackAlignment = false;
  size_t SpillAreaSizeBytes = 0;
  size_t FixedAllocaSizeBytes = 0;
-  static std::array<llvm::SmallBitVector, IceType_NUM> TypeToRegisterSet;
+  static std::array<llvm::SmallBitVector, RCX86_NUM> TypeToRegisterSet;
  static std::array<llvm::SmallBitVector, Traits::RegisterSet::Reg_NUM>
      RegisterAliases;
  static llvm::SmallBitVector ScratchRegs;

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
--- a/src/IceTargetLoweringX86RegClass.h
+++ b/src/IceTargetLoweringX86RegClass.h
+//===- subzero/src/IceTargetLoweringX86RegClass.h - x86 reg class -*- C++ -*-=//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the X86 register class extensions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+#define SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
+
+#include "IceOperand.h" // RC_Target
+
+namespace Ice {
+namespace X86Internal {
+
+// x86-only register classes, continuing enum RegClass from RC_Target onward.
+enum RegClassX86 : uint8_t {
+  RCX86_Is64To8 = RC_Target, // 64-bit GPR trivially truncatable to 8 bits
+  RCX86_Is32To8,             // 32-bit GPR trivially truncatable to 8 bits
+  RCX86_Is16To8,             // 16-bit GPR trivially truncatable to 8 bits
+  RCX86_IsTrunc8Rcvr,        // 8-bit GPR that can receive a trunc result
+  RCX86_IsAhRcvr,            // 8-bit GPR that can be a mov dest from %ah
+  RCX86_NUM                  // one past last class; TypeToRegisterSet size
+};
+
+} // end of namespace X86Internal
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICETARGETLOWERINGX86REGCLASS_H
--- a/src/IceTypes.cpp
+++ b/src/IceTypes.cpp
@@ -95,7 +95,7 @@ struct TypeAttributeFields {

 const struct TypeAttributeFields TypeAttributes[] = {
 #define X(tag, sizeLog2, align, elts, elty, str)                               \
-  { sizeLog2, align, elts, elty, str }                                         \
+  { sizeLog2, align, elts, IceType_##elty, str }                               \
  ,
    ICETYPE_TABLE
 #undef X
@@ -120,7 +120,8 @@ const TypePropertyFields TypePropertiesTable[] = {
          CompareResult)                                                       \
  {                                                                            \
    IsVec, IsInt, IsInt & !IsVec, IsInt & IsVec, IsIntArith, IsFloat,          \
-        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam, CompareResult \
+        IsFloat & !IsVec, IsFloat & IsVec, IsLoadStore, IsParam,               \
+        IceType_##CompareResult                                                \
  }                                                                            \
  ,
    ICETYPE_PROPS_TABLE

--- a/src/IceTypes.def
+++ b/src/IceTypes.def
@@ -29,23 +29,23 @@
 //#define X(tag, str, is_elf64, e_machine, e_flags)

 #define ICETYPE_TABLE                                                          \
-  /* enum value, log_2(size), align, # elts, element type, printable */        \
-  /*     string (size and alignment in bytes) */                               \
-  X(IceType_void,  -1,  0,     1,      IceType_void, "void")                   \
-  X(IceType_i1,     0,  1,     1,      IceType_i1,   "i1")                     \
-  X(IceType_i8,     0,  1,     1,      IceType_i8,   "i8")                     \
-  X(IceType_i16,    1,  1,     1,      IceType_i16,  "i16")                    \
-  X(IceType_i32,    2,  1,     1,      IceType_i32,  "i32")                    \
-  X(IceType_i64,    3,  1,     1,      IceType_i64,  "i64")                    \
-  X(IceType_f32,    2,  4,     1,      IceType_f32,  "float")                  \
-  X(IceType_f64,    3,  8,     1,      IceType_f64,  "double")                 \
-  X(IceType_v4i1,   4,  1,     4,      IceType_i1,   "<4 x i1>")               \
-  X(IceType_v8i1,   4,  1,     8,      IceType_i1,   "<8 x i1>")               \
-  X(IceType_v16i1,  4,  1,    16,      IceType_i1,   "<16 x i1>")              \
-  X(IceType_v16i8,  4,  1,    16,      IceType_i8,   "<16 x i8>")              \
-  X(IceType_v8i16,  4,  2,     8,      IceType_i16,  "<8 x i16>")              \
-  X(IceType_v4i32,  4,  4,     4,      IceType_i32,  "<4 x i32>")              \
-  X(IceType_v4f32,  4,  4,     4,      IceType_f32,  "<4 x float>")            \
+  /* tag (IceType_ prefix omitted), log_2(size), align, # elts, */             \
+  /*     element type tag, printable string (size and alignment in bytes) */   \
+  X(void,  -1,  0,  1, void, "void")                                           \
+  X(i1,     0,  1,  1, i1,   "i1")                                             \
+  X(i8,     0,  1,  1, i8,   "i8")                                             \
+  X(i16,    1,  1,  1, i16,  "i16")                                            \
+  X(i32,    2,  1,  1, i32,  "i32")                                            \
+  X(i64,    3,  1,  1, i64,  "i64")                                            \
+  X(f32,    2,  4,  1, f32,  "float")                                          \
+  X(f64,    3,  8,  1, f64,  "double")                                         \
+  X(v4i1,   4,  1,  4, i1,   "<4 x i1>")                                       \
+  X(v8i1,   4,  1,  8, i1,   "<8 x i1>")                                       \
+  X(v16i1,  4,  1, 16, i1,   "<16 x i1>")                                      \
+  X(v16i8,  4,  1, 16, i8,   "<16 x i8>")                                      \
+  X(v8i16,  4,  2,  8, i16,  "<8 x i16>")                                      \
+  X(v4i32,  4,  4,  4, i32,  "<4 x i32>")                                      \
+  X(v4f32,  4,  4,  4, f32,  "<4 x float>")                                    \
 //#define X(tag, sizeLog2, align, elts, elty, str)

 // Dictionary:
@@ -58,22 +58,22 @@
 //   CR - Result type of compare instruction for argument type
 //        (IceType_void if disallowed)
 #define ICETYPE_PROPS_TABLE                                                    \
-  /* Enum Value    V  I  F IA  LS P CR */                                      \
-  X(IceType_void,  0, 0, 0, 0, 0, 0, IceType_void)                             \
-  X(IceType_i1,    0, 1, 0, 0, 0, 0, IceType_i1)                               \
-  X(IceType_i8,    0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i16,   0, 1, 0, 1, 1, 0, IceType_i1)                               \
-  X(IceType_i32,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_i64,   0, 1, 0, 1, 1, 1, IceType_i1)                               \
-  X(IceType_f32,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_f64,   0, 0, 1, 0, 1, 1, IceType_i1)                               \
-  X(IceType_v4i1,  1, 1, 0, 0, 0, 1, IceType_v4i1)                             \
-  X(IceType_v8i1,  1, 1, 0, 0, 0, 1, IceType_v8i1)                             \
-  X(IceType_v16i1, 1, 1, 0, 0, 0, 1, IceType_v16i1)                            \
-  X(IceType_v16i8, 1, 1, 0, 1, 1, 1, IceType_v16i1)                            \
-  X(IceType_v8i16, 1, 1, 0, 1, 1, 1, IceType_v8i1)                             \
-  X(IceType_v4i32, 1, 1, 0, 1, 1, 1, IceType_v4i1)                             \
-  X(IceType_v4f32, 1, 0, 1, 0, 1, 1, IceType_v4i1)                             \
+  /* Type tag      V  I  F IA LS  P  CR */                                     \
+  X(void,          0, 0, 0, 0, 0, 0, void)                                     \
+  X(i1,            0, 1, 0, 0, 0, 0, i1)                                       \
+  X(i8,            0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i16,           0, 1, 0, 1, 1, 0, i1)                                       \
+  X(i32,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(i64,           0, 1, 0, 1, 1, 1, i1)                                       \
+  X(f32,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(f64,           0, 0, 1, 0, 1, 1, i1)                                       \
+  X(v4i1,          1, 1, 0, 0, 0, 1, v4i1)                                     \
+  X(v8i1,          1, 1, 0, 0, 0, 1, v8i1)                                     \
+  X(v16i1,         1, 1, 0, 0, 0, 1, v16i1)                                    \
+  X(v16i8,         1, 1, 0, 1, 1, 1, v16i1)                                    \
+  X(v8i16,         1, 1, 0, 1, 1, 1, v8i1)                                     \
+  X(v4i32,         1, 1, 0, 1, 1, 1, v4i1)                                     \
+  X(v4f32,         1, 0, 1, 0, 1, 1, v4i1)                                     \
 //#define X(tag, IsVec, IsInt, IsFloat, IsIntArith, IsLoadStore, IsParam,      \
 //          CompareResult)


--- a/src/IceTypes.h
+++ b/src/IceTypes.h
@@ -22,7 +22,7 @@
 namespace Ice {

 enum Type {
-#define X(tag, sizeLog2, align, elts, elty, str) tag,
+#define X(tag, sizeLog2, align, elts, elty, str) IceType_##tag,
  ICETYPE_TABLE
 #undef X
      IceType_NUM

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -726,7 +726,7 @@ entry:
 ; CHECK-NEXT: movsx  eax,ax
 ;
 ; OPTM1-LABEL: trunc64To16Signed
-; OPTM1:      mov     eax,DWORD PTR [esp+
+; OPTM1:      mov     ax,WORD PTR [esp+
 ; OPTM1: movsx  eax,

 ; ARM32-LABEL: trunc64To16Signed
@@ -808,7 +808,7 @@ entry:
 ; CHECK-NEXT: movzx  eax,ax
 ;
 ; OPTM1-LABEL: trunc64To16Unsigned
-; OPTM1:      mov     eax,DWORD PTR [esp+
+; OPTM1:      mov     ax,WORD PTR [esp+
 ; OPTM1: movzx  eax,

 ; ARM32-LABEL: trunc64To16Unsigned
@@ -840,12 +840,12 @@ entry:
 }
 ; CHECK-LABEL: trunc64To1
 ; CHECK:      mov     eax,DWORD PTR [esp+0x4]
-; CHECK:      and     eax,0x1
+; CHECK:      and     al,0x1
 ; CHECK-NOT:  and     eax,0x1
 ;
 ; OPTM1-LABEL: trunc64To1
 ; OPTM1:      mov     eax,DWORD PTR [esp+
-; OPTM1:      and     eax,0x1
+; OPTM1:      and     al,0x1
 ; OPTM1-NOT:  and     eax,0x1

 ; ARM32-LABEL: trunc64To1

--- a/tests_lit/llvm2ice_tests/ebp_args.ll
+++ b/tests_lit/llvm2ice_tests/ebp_args.ll
@@ -31,7 +31,7 @@ eblock:
 ; CHECK:  sub   esp,0x80
 ; CHECK:  mov   DWORD PTR [ebp-0x4],esp
 ; CHECK:  mov   eax,DWORD PTR [ebp+0xc]
-; CHECK:  mov   DWORD PTR [ebp-0x8],eax
+; CHECK:  mov   BYTE PTR [ebp-0x8],al
 ; CHECK:  movzx eax,BYTE PTR [ebp-0x8]
 ; CHECK:  mov   DWORD PTR [ebp-0xc],eax
 ; CHECK:  sub   esp,0x10

--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -1085,7 +1085,7 @@ entry:
  ret i32 %old_ext
 }
 ; CHECK-LABEL: test_atomic_cmpxchg_16
-; CHECK: mov eax,{{.*}}
+; CHECK: mov {{ax|eax}},{{.*}}
 ; CHECK: lock cmpxchg WORD PTR [e{{[^a].}}],{{[^a]}}x
 ; ARM32-LABEL: test_atomic_cmpxchg_16
 ; ARM32: dmb

--- a/tests_lit/llvm2ice_tests/randomize-regalloc.ll
+++ b/tests_lit/llvm2ice_tests/randomize-regalloc.ll
@@ -26,11 +26,11 @@ entry:
 ; OPTM1_1-NEXT: movups  XMMWORD PTR [esp+0x20],xmm0
 ; OPTM1_1-NEXT: movups  XMMWORD PTR [esp+0x10],xmm1
 ; OPTM1_1-NEXT: movups  xmm0,XMMWORD PTR [esp+0x20]
-; OPTM1_1-NEXT: pshufd  xmm1,XMMWORD PTR [esp+0x20],0x31
+; OPTM1_1-NEXT: pshufd  xmm6,XMMWORD PTR [esp+0x20],0x31
 ; OPTM1_1-NEXT: pshufd  xmm2,XMMWORD PTR [esp+0x10],0x31
 ; OPTM1_1-NEXT: pmuludq xmm0,XMMWORD PTR [esp+0x10]
-; OPTM1_1-NEXT: pmuludq xmm1,xmm2
-; OPTM1_1-NEXT: shufps  xmm0,xmm1,0x88
+; OPTM1_1-NEXT: pmuludq xmm6,xmm2
+; OPTM1_1-NEXT: shufps  xmm0,xmm6,0x88
 ; OPTM1_1-NEXT: pshufd  xmm0,xmm0,0xd8
 ; OPTM1_1-NEXT: movups  XMMWORD PTR [esp],xmm0
 ; OPTM1_1-NEXT: movups  xmm0,XMMWORD PTR [esp]
@@ -40,9 +40,9 @@ entry:
 ; CHECK_1-LABEL: mul_v4i32
 ; CHECK_1: movups  xmm7,xmm0
 ; CHECK_1-NEXT: pshufd  xmm0,xmm0,0x31
-; CHECK_1-NEXT: pshufd  xmm4,xmm1,0x31
+; CHECK_1-NEXT: pshufd  xmm5,xmm1,0x31
 ; CHECK_1-NEXT: pmuludq xmm7,xmm1
-; CHECK_1-NEXT: pmuludq xmm0,xmm4
+; CHECK_1-NEXT: pmuludq xmm0,xmm5
 ; CHECK_1-NEXT: shufps  xmm7,xmm0,0x88
 ; CHECK_1-NEXT: pshufd  xmm7,xmm7,0xd8
 ; CHECK_1-NEXT: movups  xmm0,xmm7
@@ -53,11 +53,11 @@ entry:
 ; OPTM1_123-NEXT: movups  XMMWORD PTR [esp+0x20],xmm0
 ; OPTM1_123-NEXT: movups  XMMWORD PTR [esp+0x10],xmm1
 ; OPTM1_123-NEXT: movups  xmm0,XMMWORD PTR [esp+0x20]
-; OPTM1_123-NEXT: pshufd  xmm3,XMMWORD PTR [esp+0x20],0x31
-; OPTM1_123-NEXT: pshufd  xmm7,XMMWORD PTR [esp+0x10],0x31
+; OPTM1_123-NEXT: pshufd  xmm6,XMMWORD PTR [esp+0x20],0x31
+; OPTM1_123-NEXT: pshufd  xmm2,XMMWORD PTR [esp+0x10],0x31
 ; OPTM1_123-NEXT: pmuludq xmm0,XMMWORD PTR [esp+0x10]
-; OPTM1_123-NEXT: pmuludq xmm3,xmm7
-; OPTM1_123-NEXT: shufps  xmm0,xmm3,0x88
+; OPTM1_123-NEXT: pmuludq xmm6,xmm2
+; OPTM1_123-NEXT: shufps  xmm0,xmm6,0x88
 ; OPTM1_123-NEXT: pshufd  xmm0,xmm0,0xd8
 ; OPTM1_123-NEXT: movups  XMMWORD PTR [esp],xmm0
 ; OPTM1_123-NEXT: movups  xmm0,XMMWORD PTR [esp]
@@ -65,14 +65,14 @@ entry:
 ; OPTM1_123-NEXT: ret

 ; CHECK_123-LABEL: mul_v4i32
-; CHECK_123: movups  xmm4,xmm0
+; CHECK_123: movups  xmm5,xmm0
 ; CHECK_123-NEXT: pshufd  xmm0,xmm0,0x31
 ; CHECK_123-NEXT: pshufd  xmm7,xmm1,0x31
-; CHECK_123-NEXT: pmuludq xmm4,xmm1
+; CHECK_123-NEXT: pmuludq xmm5,xmm1
 ; CHECK_123-NEXT: pmuludq xmm0,xmm7
-; CHECK_123-NEXT: shufps  xmm4,xmm0,0x88
-; CHECK_123-NEXT: pshufd  xmm4,xmm4,0xd8
-; CHECK_123-NEXT: movups  xmm0,xmm4
+; CHECK_123-NEXT: shufps  xmm5,xmm0,0x88
+; CHECK_123-NEXT: pshufd  xmm5,xmm5,0xd8
+; CHECK_123-NEXT: movups  xmm0,xmm5
 ; CHECK_123-NEXT: ret
 }


--- a/tests_lit/llvm2ice_tests/rng.ll
+++ b/tests_lit/llvm2ice_tests/rng.ll
@@ -189,14 +189,14 @@ entry:
  ret <4 x i32> %res

 ; REGALLOC-LABEL: func4
-; REGALLOC: movups  xmm5,xmm0
+; REGALLOC: movups  xmm3,xmm0
 ; REGALLOC-NEXT: pshufd  xmm0,xmm0,0x31
 ; REGALLOC-NEXT: pshufd  xmm4,xmm1,0x31
-; REGALLOC-NEXT: pmuludq xmm5,xmm1
+; REGALLOC-NEXT: pmuludq xmm3,xmm1
 ; REGALLOC-NEXT: pmuludq xmm0,xmm4
-; REGALLOC-NEXT: shufps  xmm5,xmm0,0x88
-; REGALLOC-NEXT: pshufd  xmm5,xmm5,0xd8
-; REGALLOC-NEXT: movups  xmm0,xmm5
+; REGALLOC-NEXT: shufps  xmm3,xmm0,0x88
+; REGALLOC-NEXT: pshufd  xmm3,xmm3,0xd8
+; REGALLOC-NEXT: movups  xmm0,xmm3
 ; REGALLOC-NEXT: ret
 }