Subzero. ARM32. Show FP lowering some love.

After some time of being neglected, this CL improves FP lowering for ARM32. 1) It emits vpush {list}, and vpop {list} when possible. 2) It stops saving aliased VFP registers multiple times (yes, sz used to save both D and S registers even when they aliased). 3) It introduces Vmla (fp multiply and accumulate) and Vmls (fp multiply and subtract). (1 + 2) minimally (but positively) affected SPEC. (3) caused a 2% geomean improvement. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1481133002 .

Subzero. ARM32. Show FP lowering some love.
eb13acc6 · John Porto · e293b5f4 · eb13acc6 · eb13acc6 · eb13acc6
Commit eb13acc6 authored Dec 09, 2015 by John Porto
15 changed files
--- a/pydir/gen_arm32_reg_tables.py
+++ b/pydir/gen_arm32_reg_tables.py
--- a/src/IceCfgNode.cpp
+++ b/src/IceCfgNode.cpp
@@ -981,7 +981,7 @@ bool emitLiveRangesEnded(Ostream &Str, const Cfg *Func, const Inst *Instr,
      if (Printed)
        Str << ",";
      else
-        Str << " \t# END=";
+        Str << " \t@ END=";
      Var->emit(Func);
      Printed = true;
    }

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -185,6 +185,22 @@ void InstARM32::emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
  Inst->getSrc(1)->emit(Func);
 }

+void InstARM32::emitFourAddrFP(const char *Opcode, const InstARM32 *Inst,
+                               const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 3);
+  assert(Inst->getSrc(0) == Inst->getDest());
+  Str << "\t" << Opcode << getVecWidthString(Inst->getDest()->getType())
+      << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(2)->emit(Func);
+}
+
 void InstARM32Pred::emitFourAddr(const char *Opcode, const InstARM32Pred *Inst,
                                 const Cfg *Func) {
  if (!BuildDefs::dump())
@@ -571,18 +587,43 @@ IceString InstARM32Label::getName(const Cfg *Func) const {
  return ".L" + Func->getFunctionName() + "$local$__" + std::to_string(Number);
 }

+namespace {
+// Requirements for Push/Pop:
+//  1) All the Variables have the same type;
+//  2) All the variables have registers assigned to them.
+void validatePushOrPopRegisterListOrDie(const VarList &RegList) {
+  Type PreviousTy = IceType_void;
+  for (Variable *Reg : RegList) {
+    if (PreviousTy != IceType_void && Reg->getType() != PreviousTy) {
+      llvm::report_fatal_error("Type mismatch when popping/pushing "
+                               "registers.");
+    }
+
+    if (!Reg->hasReg()) {
+      llvm::report_fatal_error("Push/pop operand does not have a register "
+                               "assigned to it.");
+    }
+
+    PreviousTy = Reg->getType();
+  }
+}
+} // end of anonymous namespace
+
 InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
    : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
  // Track modifications to Dests separately via FakeDefs. Also, a pop
  // instruction affects the stack pointer and so it should not be allowed to
  // be automatically dead-code eliminated. This is automatic since we leave
  // the Dest as nullptr.
+  validatePushOrPopRegisterListOrDie(Dests);
 }

 InstARM32Push::InstARM32Push(Cfg *Func, const VarList &Srcs)
    : InstARM32(Func, InstARM32::Push, Srcs.size(), nullptr) {
-  for (Variable *Source : Srcs)
+  validatePushOrPopRegisterListOrDie(Srcs);
+  for (Variable *Source : Srcs) {
    addSource(Source);
+  }
 }

 InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
@@ -736,8 +777,10 @@ template <> const char *InstARM32Udiv::Opcode = "udiv";
 // FP
 template <> const char *InstARM32Vadd::Opcode = "vadd";
 template <> const char *InstARM32Vdiv::Opcode = "vdiv";
-template <> const char *InstARM32Vmul::Opcode = "vmul";
 template <> const char *InstARM32Veor::Opcode = "veor";
+template <> const char *InstARM32Vmla::Opcode = "vmla";
+template <> const char *InstARM32Vmls::Opcode = "vmls";
+template <> const char *InstARM32Vmul::Opcode = "vmul";
 template <> const char *InstARM32Vsub::Opcode = "vsub";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
@@ -1216,51 +1259,74 @@ template <> void InstARM32Uxt::emitIAS(const Cfg *Func) const {
    emitUsingTextFixup(Func);
 }

+namespace {
+
+bool isAssignedConsecutiveRegisters(Variable *Before, Variable *After) {
+  assert(Before->hasReg());
+  assert(After->hasReg());
+  return Before->getRegNum() + 1 == After->getRegNum();
+}
+
+} // end of anonymous namespace
+
 void InstARM32Pop::emit(const Cfg *Func) const {
-  // TODO(jpp): Improve FP register save/restore.
  if (!BuildDefs::dump())
    return;
-  SizeT IntegerCount = 0;
-  for (const Operand *Op : Dests) {
-    if (isScalarIntegerType(Op->getType())) {
-      ++IntegerCount;
-    }
+
+  const SizeT DestSize = Dests.size();
+  if (DestSize == 0) {
+    assert(false && "Empty pop list");
+    return;
  }
+
  Ostream &Str = Func->getContext()->getStrEmit();
-  bool NeedNewline = false;
-  if (IntegerCount != 0) {
+
+  Variable *Reg = Dests[0];
+  if (isScalarIntegerType(Reg->getType())) {
+    // GPR pop.
    Str << "\t"
-        << "pop"
-        << "\t{";
-    bool PrintComma = false;
-    for (const Operand *Op : Dests) {
-      if (isScalarIntegerType(Op->getType())) {
-        if (PrintComma)
-          Str << ", ";
-        Op->emit(Func);
-        PrintComma = true;
-      }
+           "pop"
+           "\t{";
+    Reg->emit(Func);
+    for (SizeT i = 1; i < DestSize; ++i) {
+      Str << ", ";
+      Reg = Dests[i];
+      Reg->emit(Func);
    }
    Str << "}";
-    NeedNewline = true;
+    return;
  }

-  for (const Operand *Op : Dests) {
-    if (isScalarIntegerType(Op->getType()))
-      continue;
-    if (NeedNewline) {
-      Str << "\n";
+  // VFP "s" reg pop.
+  SizeT End = DestSize - 1;
+  SizeT Start = DestSize - 1;
+  Reg = Dests[DestSize - 1];
+  Str << "\t"
+         "vpop"
+         "\t{";
+  for (SizeT i = 2; i <= DestSize; ++i) {
+    Variable *PreviousReg = Dests[DestSize - i];
+    if (!isAssignedConsecutiveRegisters(PreviousReg, Reg)) {
+      Dests[Start]->emit(Func);
+      for (SizeT j = Start + 1; j <= End; ++j) {
+        Str << ", ";
+        Dests[j]->emit(Func);
+      }
      startNextInst(Func);
-      NeedNewline = false;
+      Str << "}\n\t"
+             "vpop"
+             "\t{";
+      End = DestSize - i;
    }
-    Str << "\t"
-        << "vpop"
-        << "\t{";
-    Op->emit(Func);
-    Str << "}";
-    NeedNewline = true;
+    Reg = PreviousReg;
+    Start = DestSize - i;
+  }
+  Dests[Start]->emit(Func);
+  for (SizeT j = Start + 1; j <= End; ++j) {
+    Str << ", ";
+    Dests[j]->emit(Func);
  }
-  assert(NeedNewline); // caller will add the newline
+  Str << "}";
 }

 void InstARM32Pop::emitIAS(const Cfg *Func) const {
@@ -1310,56 +1376,55 @@ void InstARM32Pop::dump(const Cfg *Func) const {
 }

 void InstARM32Push::emit(const Cfg *Func) const {
-  // TODO(jpp): Improve FP register save/restore.
  if (!BuildDefs::dump())
    return;
-  SizeT IntegerCount = 0;
-  for (SizeT i = 0; i < getSrcSize(); ++i) {
-    if (isScalarIntegerType(getSrc(i)->getType())) {
-      ++IntegerCount;
-    }
+
+  // Push can't be emitted if there are no registers to save. This should never
+  // happen, but if it does, we don't need to bring Subzero down -- we just skip
+  // emitting the push instruction (and maybe emit a nop?) The assert() is here
+  // so that we can detect this error during development.
+  const SizeT SrcSize = getSrcSize();
+  if (SrcSize == 0) {
+    assert(false && "Empty push list");
+    return;
  }
+
  Ostream &Str = Func->getContext()->getStrEmit();
-  bool NeedNewline = false;
-  for (SizeT i = getSrcSize(); i > 0; --i) {
-    Operand *Op = getSrc(i - 1);
-    if (isScalarIntegerType(Op->getType()))
-      continue;
-    if (NeedNewline) {
-      Str << "\n";
-      startNextInst(Func);
-      NeedNewline = false;
-    }
+
+  Variable *Reg = llvm::cast<Variable>(getSrc(0));
+  if (isScalarIntegerType(Reg->getType())) {
+    // GPR push.
    Str << "\t"
-        << "vpush"
-        << "\t{";
-    Op->emit(Func);
+           "push"
+           "\t{";
+    Reg->emit(Func);
+    for (SizeT i = 1; i < SrcSize; ++i) {
+      Str << ", ";
+      getSrc(i)->emit(Func);
+    }
    Str << "}";
-    NeedNewline = true;
+    return;
  }
-  if (IntegerCount != 0) {
-    if (NeedNewline) {
-      Str << "\n";
+
+  // VFP "s" reg push.
+  Str << "\t"
+         "vpush"
+         "\t{";
+  Reg->emit(Func);
+  for (SizeT i = 1; i < SrcSize; ++i) {
+    Variable *NextReg = llvm::cast<Variable>(getSrc(i));
+    if (isAssignedConsecutiveRegisters(Reg, NextReg)) {
+      Str << ", ";
+    } else {
      startNextInst(Func);
-      NeedNewline = false;
-    }
-    Str << "\t"
-        << "push"
-        << "\t{";
-    bool PrintComma = false;
-    for (SizeT i = 0; i < getSrcSize(); ++i) {
-      Operand *Op = getSrc(i);
-      if (isScalarIntegerType(Op->getType())) {
-        if (PrintComma)
-          Str << ", ";
-        Op->emit(Func);
-        PrintComma = true;
-      }
+      Str << "}\n\t"
+             "vpush"
+             "\t{";
    }
-    Str << "}";
-    NeedNewline = true;
+    Reg = NextReg;
+    Reg->emit(Func);
  }
-  assert(NeedNewline); // caller will add the newline
+  Str << "}";
 }

 void InstARM32Push::emitIAS(const Cfg *Func) const {
@@ -1925,8 +1990,10 @@ template class InstARM32ThreeAddrGPR<InstARM32::Udiv>;

 template class InstARM32ThreeAddrFP<InstARM32::Vadd>;
 template class InstARM32ThreeAddrFP<InstARM32::Vdiv>;
-template class InstARM32ThreeAddrFP<InstARM32::Vmul>;
 template class InstARM32ThreeAddrFP<InstARM32::Veor>;
+template class InstARM32ThreeAddrFP<InstARM32::Vmul>;
+template class InstARM32FourAddrFP<InstARM32::Vmla>;
+template class InstARM32FourAddrFP<InstARM32::Vmls>;
 template class InstARM32ThreeAddrFP<InstARM32::Vsub>;

 template class InstARM32LoadBase<InstARM32::Ldr>;

--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -28,6 +28,9 @@
 // LR is not considered isInt to avoid being allocated as a register. It is
 // technically preserved, but save/restore is handled separately, based on
 // whether or not the function MaybeLeafFunc.
+//
+// The register tables can be generated using the gen_arm32_reg_tables.py
+// script.

 #define REGARM32_GPR_TABLE                                                     \
  /* val, encode, name, scratch,preserved,stackptr,frameptr,                   \
@@ -69,21 +72,6 @@
 //          isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init)

 // S registers 0-15 are scratch, but 16-31 are preserved.
-// Regenerate this with the following python script:
-//
-// def print_sregs():
-//   for i in xrange(0, 32):
-//     is_scratch = 1 if i < 16 else 0
-//     is_preserved = 1 if i >= 16 else 0
-//     print ('  X(Reg_s{regnum:<2}, {regnum:<2}, "s{regnum}", ' +
-//            '{scratch}, {preserved}, 0, 0, 0, 0, 1, 0, 0, ' +
-//            'REGLIST2(RegARM32, d{regnum:<2}, ' +
-//            'q{regnum_q:<2}))    \\').format(
-//            regnum=i, regnum_d=i>>1,
-//            regnum_q=i>>2, scratch=is_scratch, preserved=is_preserved)
-//
-// print_sregs()
-//
 #define REGARM32_FP32_TABLE                                                    \
  /* val, encode, name, scratch,preserved,stackptr,frameptr,                   \
     isInt,isI64Pair,isFP32,isFP64,isVec128, alias_init */                     \
@@ -128,29 +116,6 @@
 // registers. In processors supporting the D32 feature this will effectively
 // cause double allocation to bias towards allocating "high" D registers, which
 // do not alias any S registers.
-//
-// Regenerate this with the following python script:
-// def print_dregs():
-//   for i in xrange(31, 15, -1):
-//     is_scratch = 1 if (i < 8 or i >= 16) else 0
-//     is_preserved = 1 if (8 <= i and i < 16) else 0
-//     print ('  X(Reg_d{regnum:<2}, {regnum:<2}, "d{regnum}", ' +
-//            '{scratch}, {preserved}, 0, 0, 0, 0, 0, 1, 0, ' +
-//            'REGLIST1(RegARM32, q{regnum_q:<2})    \\').format(
-//            regnum=i, regnum_q=i>>1, scratch=is_scratch,
-//            preserved=is_preserved)
-//   for i in xrange(15, -1, -1):
-//     is_scratch = 1 if (i < 8 or i >= 16) else 0
-//     is_preserved = 1 if (8 <= i and i < 16) else 0
-//     print ('  X(Reg_d{regnum:<2}, {regnum:<2}, "d{regnum}", ' +
-//            '{scratch}, {preserved}, 0, 0, 0, 0, 0, 1, 0, ' +
-//            'REGLIST3(RegARM32, s{regnum_s0:<2}, s{regnum_s1:<2}, ' +
-//            'q{regnum_q:<2}))   \\').format(
-//            regnum_s0 = (i<<1), regnum_s1 = (i<<1) + 1, regnum=i,
-//            regnum_q=i>>1, scratch=is_scratch, preserved=is_preserved)
-//
-// print_dregs()
-//
 #define REGARM32_FP64_TABLE                                                    \
  /* val, encode, name, scratch,preserved,stackptr,frameptr,                   \
     isInt,isI64Pair,isFP32,isFP64,isVec128, alias_init */                     \
@@ -192,31 +157,6 @@
 // Q registers 0-3 are scratch, 4-7 are preserved, and 8-15 are also scratch
 // (if supported by the D32 feature). Q registers are defined in reverse order
 // for the same reason as D registers.
-//
-// Regenerate this with the following python script:
-// def print_qregs():
-//   for i in xrange(15, 7, -1):
-//     is_scratch = 1 if (i < 4 or i >= 8) else 0
-//     is_preserved = 1 if (4 <= i and i < 8) else 0
-//     print ('  X(Reg_q{regnum:<2}, {regnum:<2}, "q{regnum}", ' +
-//            '{scratch}, {preserved}, 0, 0, 0, 0, 0, 0, 1, REGLIST2(' +
-//            'RegARM32, d{regnum_d0:<2}, d{regnum_d1:<2}))    \\').format(
-//            regnum_d0=(i<<1), regnum_d1=(i<<1)+1, regnum=i,
-//            scratch=is_scratch, preserved=is_preserved)
-//   for i in xrange(7, -1, -1):
-//     is_scratch = 1 if (i < 4 or i >= 8) else 0
-//     is_preserved = 1 if (4 <= i and i < 8) else 0
-//     print ('  X(Reg_q{regnum:<2}, {regnum:<2}, "q{regnum}", ' +
-//            '{scratch}, {preserved}, 0, 0, 0, 0, 0, 0, 1, REGLIST6(' +
-//            'RegARM32, s{regnum_s0:<2}, s{regnum_s1:<2}, ' +
-//            's{regnum_s2:<2}, s{regnum_s3:<2}, ' +
-//            'd{regnum_d0:<2}, d{regnum_d1:<2}))    \\').format(
-//            regnum_s0=(i<<2), regnum_s1=(i<<2)+1, regnum_s2=(i<<2)+2,
-//            regnum_s3=(i<<2)+3, regnum_d0=(i<<1), regnum_d1=(i<<1)+1,
-//            regnum=i, scratch=is_scratch, preserved=is_preserved)
-//
-// print_qregs()
-//
 #define REGARM32_VEC128_TABLE                                                  \
  /* val, encode, name, scratch, preserved, stackptr, frameptr,                \
     isInt, isI64Pair, isFP32, isFP64, isVec128, alias_init */                 \

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -415,6 +415,8 @@ public:
    Vcvt,
    Vdiv,
    Veor,
+    Vmla,
+    Vmls,
    Vmrs,
    Vmul,
    Vsqrt,
@@ -436,6 +438,8 @@ public:
  /// Shared emit routines for common forms of instructions.
  static void emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
                              const Cfg *Func);
+  static void emitFourAddrFP(const char *Opcode, const InstARM32 *Inst,
+                             const Cfg *Func);

  void dump(const Cfg *Func) const override;

@@ -708,7 +712,7 @@ private:
 /// Instructions of the form x := y op z, for vector/FP. We leave these as
 /// unconditional: "ARM deprecates the conditional execution of any instruction
 /// encoding provided by the Advanced SIMD Extension that is not also provided
-/// by the Floating-point (VFP) extension". They do not set flags.
+/// by the floating-point (VFP) extension". They do not set flags.
 template <InstARM32::InstKindARM32 K>
 class InstARM32ThreeAddrFP : public InstARM32 {
  InstARM32ThreeAddrFP() = delete;
@@ -796,6 +800,54 @@ private:
  static const char *Opcode;
 };

+/// Instructions of the form x := x op1 (y op2 z). E.g., multiply accumulate.
+/// We leave these as unconditional: "ARM deprecates the conditional execution
+/// of any instruction encoding provided by the Advanced SIMD Extension that is
+/// not also provided by the floating-point (VFP) extension". They do not set
+/// flags.
+template <InstARM32::InstKindARM32 K>
+class InstARM32FourAddrFP : public InstARM32 {
+  InstARM32FourAddrFP() = delete;
+  InstARM32FourAddrFP(const InstARM32FourAddrFP &) = delete;
+  InstARM32FourAddrFP &operator=(const InstARM32FourAddrFP &) = delete;
+
+public:
+  // Every operand must be a register.
+  static InstARM32FourAddrFP *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                     Variable *Src1) {
+    return new (Func->allocate<InstARM32FourAddrFP>())
+        InstARM32FourAddrFP(Func, Dest, Src0, Src1);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    emitFourAddrFP(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override { emitUsingTextFixup(Func); }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    Str << Opcode << "." << getDest()->getType() << " ";
+    dumpDest(Func);
+    Str << ", ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32FourAddrFP(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1)
+      : InstARM32(Func, K, 3, Dest) {
+    addSource(Dest);
+    addSource(Src0);
+    addSource(Src1);
+  }
+
+  static const char *Opcode;
+};
+
 /// Instructions of the form x cmpop y (setting flags).
 template <InstARM32::InstKindARM32 K>
 class InstARM32CmpLike : public InstARM32Pred {
@@ -855,8 +907,10 @@ using InstARM32Sub = InstARM32ThreeAddrGPR<InstARM32::Sub>;
 using InstARM32Udiv = InstARM32ThreeAddrGPR<InstARM32::Udiv>;
 using InstARM32Vadd = InstARM32ThreeAddrFP<InstARM32::Vadd>;
 using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
-using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
 using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
+using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
+using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
+using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
 using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>;
 using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
@@ -1001,8 +1055,8 @@ private:
  InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
 };

-/// Pop into a list of GPRs. Technically this can be predicated, but we don't
-/// need that functionality.
+/// Pops a list of registers. It may be a list of GPRs, or a list of VFP "s"
+/// regs, but not both. In any case, the list must be sorted.
 class InstARM32Pop : public InstARM32 {
  InstARM32Pop() = delete;
  InstARM32Pop(const InstARM32Pop &) = delete;
@@ -1023,8 +1077,8 @@ private:
  VarList Dests;
 };

-/// Push a list of GPRs. Technically this can be predicated, but we don't need
-/// that functionality.
+/// Pushes a list of registers. Just like Pop (see above), the list may be of
+/// GPRs, or VFP "s" registers, but not both.
 class InstARM32Push : public InstARM32 {
  InstARM32Push() = delete;
  InstARM32Push(const InstARM32Push &) = delete;

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -60,9 +60,9 @@ public:
  static TargetARM32 *create(Cfg *Func) { return new TargetARM32(Func); }

  void initNodeForLowering(CfgNode *Node) override {
-    BoolComputations.forgetProducers();
-    BoolComputations.recordProducers(Node);
-    BoolComputations.dump(Func);
+    Computations.forgetProducers();
+    Computations.recordProducers(Node);
+    Computations.dump(Func);
  }

  void translateOm1() override;
@@ -798,6 +798,12 @@ protected:
  void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vmrs::create(Func, Pred));
  }
+  void _vmla(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vmla::create(Func, Dest, Src0, Src1));
+  }
+  void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vmls::create(Func, Dest, Src0, Src1));
+  }
  void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1));
  }
@@ -1019,6 +1025,8 @@ protected:
  static llvm::SmallBitVector ScratchRegs;
  llvm::SmallBitVector RegsUsed;
  VarList PhysicalRegisters[IceType_NUM];
+  VarList PreservedGPRs;
+  VarList PreservedSRegs;

  /// Helper class that understands the Calling Convention and register
  /// assignments. The first few integer type parameters can use r0-r3,
@@ -1081,10 +1089,10 @@ private:
  std::unordered_map<Operand *, void (TargetARM32::*)(const InstCall *Inst)>
      ARM32HelpersPostamble;

-  class BoolComputationTracker {
+  class ComputationTracker {
  public:
-    BoolComputationTracker() = default;
-    ~BoolComputationTracker() = default;
+    ComputationTracker() = default;
+    ~ComputationTracker() = default;

    void forgetProducers() { KnownComputations.clear(); }
    void recordProducers(CfgNode *Node);
@@ -1118,9 +1126,9 @@ private:
    }

  private:
-    class BoolComputationEntry {
+    class ComputationEntry {
    public:
-      explicit BoolComputationEntry(Inst *I) : Instr(I) {}
+      ComputationEntry(Inst *I, Type Ty) : Instr(I), ComputationType(Ty) {}
      Inst *const Instr;
      // Boolean folding is disabled for variables whose live range is multi
      // block. We conservatively initialize IsLiveOut to true, and set it to
@@ -1130,13 +1138,16 @@ private:
      // disabled.
      bool IsLiveOut = true;
      int32_t NumUses = 0;
+      Type ComputationType;
    };

-    using BoolComputationMap = std::unordered_map<SizeT, BoolComputationEntry>;
-    BoolComputationMap KnownComputations;
+    // ComputationMap maps a Variable number to a payload identifying which
+    // instruction defined it.
+    using ComputationMap = std::unordered_map<SizeT, ComputationEntry>;
+    ComputationMap KnownComputations;
  };

-  BoolComputationTracker BoolComputations;
+  ComputationTracker Computations;

  // AllowTemporaryWithNoReg indicates if TargetARM32::makeReg() can be invoked
  // without specifying a physical register. This is needed for creating unbound

--- a/tests_lit/assembler/arm32/mov-const.ll
+++ b/tests_lit/assembler/arm32/mov-const.ll
@@ -24,96 +24,92 @@ define internal i32 @foo(i32 %x) {
 entry:

 ; ASM-LABEL: foo:
-; ASM-NEXT: .Lfoo$entry:
-; ******* Movw case to check *******
-; ASM-NEXT:     movw    ip, #4092
-; ASM-NEXT:     sub     sp, sp, ip
-; ASM-NEXT:     str     r0, [sp, #4088]
-; ASM-NEXT:     # [sp, #4088] = def.pseudo
 ; DIS-LABEL: 00000000 <foo>:
-; DIS-NEXT:    0:       e300cffc
-; DIS-NEXT:    4:       e04dd00c
-; DIS-NEXT:    8:       e58d0ff8
-
 ; IASM-LABEL: foo:
+
+; ASM-NEXT: .Lfoo$entry:
 ; IASM-NEXT: .Lfoo$entry:

+; ASM-NEXT:     movw    ip, #4092
+; DIS-NEXT:    0:       e300cffc
 ; IASM-NEXT:    .byte 0xfc
 ; IASM-NEXT:    .byte 0xcf
 ; IASM-NEXT:    .byte 0x0
 ; IASM-NEXT:    .byte 0xe3

+; ASM-NEXT:     sub     sp, sp, ip
+; DIS-NEXT:    4:       e04dd00c
 ; IASM-NEXT:    .byte 0xc
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0x4d
 ; IASM-NEXT:    .byte 0xe0

+; ASM-NEXT:     str     r0, [sp, #4088]
+; DIS-NEXT:    8:       e58d0ff8
 ; IASM-NEXT:    .byte 0xf8
 ; IASM-NEXT:    .byte 0xf
 ; IASM-NEXT:    .byte 0x8d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:     # [sp, #4088] = def.pseudo
+
  %mul = mul i32 %x, %x

 ; ASM-NEXT:     ldr     r0, [sp, #4088]
-; ASM-NEXT:     ldr     r1, [sp, #4088]
-; ASM-NEXT:     mul     r0, r0, r1
-; ASM-NEXT:     str     r0, [sp, #4084]
-; ASM-NEXT:     # [sp, #4084] = def.pseudo
-
 ; DIS-NEXT:    c:       e59d0ff8
-; DIS-NEXT:   10:       e59d1ff8
-; DIS-NEXT:   14:       e0000190
-; DIS-NEXT:   18:       e58d0ff4
-
 ; IASM-NEXT:    .byte 0xf8
 ; IASM-NEXT:    .byte 0xf
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:     ldr     r1, [sp, #4088]
+; DIS-NEXT:   10:       e59d1ff8
 ; IASM-NEXT:    .byte 0xf8
 ; IASM-NEXT:    .byte 0x1f
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:     mul     r0, r0, r1
+; DIS-NEXT:   14:       e0000190
 ; IASM-NEXT:    .byte 0x90
 ; IASM-NEXT:    .byte 0x1
 ; IASM-NEXT:    .byte 0x0
 ; IASM-NEXT:    .byte 0xe0

+; ASM-NEXT:     str     r0, [sp, #4084]
+; DIS-NEXT:   18:       e58d0ff4
 ; IASM-NEXT:    .byte 0xf4
 ; IASM-NEXT:    .byte 0xf
 ; IASM-NEXT:    .byte 0x8d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:     # [sp, #4084] = def.pseudo
+
  ret i32 %mul

 ; ASM-NEXT:     ldr     r0, [sp, #4084]
-; ******* Movw case to check *******
-; ASM-NEXT:     movw    ip, #4092
-; ASM-NEXT:     add     sp, sp, ip
-; ASM-NEXT:     bx      lr
-
 ; DIS-NEXT:   1c:       e59d0ff4
-; DIS-NEXT:   20:       e300cffc
-; DIS-NEXT:   24:       e08dd00c
-; DIS-NEXT:   28:       e12fff1e
-
 ; IASM-NEXT:    .byte 0xf4
 ; IASM-NEXT:    .byte 0xf
 ; IASM-NEXT:    .byte 0x9d
 ; IASM-NEXT:    .byte 0xe5

+; ASM-NEXT:     movw    ip, #4092
+; DIS-NEXT:   20:       e300cffc
 ; IASM-NEXT:    .byte 0xfc
 ; IASM-NEXT:    .byte 0xcf
 ; IASM-NEXT:    .byte 0x0
 ; IASM-NEXT:    .byte 0xe3

+; ASM-NEXT:     add     sp, sp, ip
+; DIS-NEXT:   24:       e08dd00c
 ; IASM-NEXT:    .byte 0xc
 ; IASM-NEXT:    .byte 0xd0
 ; IASM-NEXT:    .byte 0x8d
 ; IASM-NEXT:    .byte 0xe0

+; ASM-NEXT:     bx      lr
+; DIS-NEXT:   28:       e12fff1e
 ; IASM-NEXT:    .byte 0x1e
 ; IASM-NEXT:    .byte 0xff
 ; IASM-NEXT:    .byte 0x2f
@@ -121,84 +117,88 @@ entry:

 }

-define internal void @saveMinus1(i32 %loc) {
-; ASM-LABEL:saveMinus1:
-; DIS-LABEL:00000030 <saveMinus1>:
-; IASM-LABEL:saveMinus1:
+define internal void @saveConstI32(i32 %loc) {
+; ASM-LABEL:saveConstI32:
+; DIS-LABEL:00000030 <saveConstI32>:
+; IASM-LABEL:saveConstI32:

 entry:
-; ASM-NEXT:.LsaveMinus1$entry:
+; ASM-NEXT:.LsaveConstI32$entry:
+; IASM-NEXT:.LsaveConstI32$entry:
+
 ; ASM-NEXT:     movw    ip, #4088
 ; DIS-NEXT:  30:        e300cff8
-; IASM-NEXT:.LsaveMinus1$entry:
+; IASM-NEXT:    .byte 0xf8
+; IASM-NEXT:    .byte 0xcf
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0xe3

 ; ASM-NEXT:     sub     sp, sp, ip
 ; DIS-NEXT:  34:        e04dd00c
-; IASM-NEXT:	.byte 0xf8
-; IASM-NEXT:	.byte 0xcf
-; IASM-NEXT:	.byte 0x0
-; IASM-NEXT:	.byte 0xe3
+; IASM-NEXT:    .byte 0xc
+; IASM-NEXT:    .byte 0xd0
+; IASM-NEXT:    .byte 0x4d
+; IASM-NEXT:    .byte 0xe0

 ; ASM-NEXT:     str     r0, [sp, #4084]
-; ASM-NEXT:     # [sp, #4084] = def.pseudo 
+; ASM-NEXT:     # [sp, #4084] = def.pseudo
 ; DIS-NEXT:  38:        e58d0ff4
-; IASM-NEXT:	.byte 0xc
-; IASM-NEXT:	.byte 0xd0
-; IASM-NEXT:	.byte 0x4d
-; IASM-NEXT:	.byte 0xe0
+; IASM-NEXT:    .byte 0xf4
+; IASM-NEXT:    .byte 0xf
+; IASM-NEXT:    .byte 0x8d
+; IASM-NEXT:    .byte 0xe5

  %loc.asptr = inttoptr i32 %loc to i32*
-  store i32 -1, i32* %loc.asptr, align 1
+  store i32 524289, i32* %loc.asptr, align 1

 ; ASM-NEXT:     ldr     r0, [sp, #4084]
 ; DIS-NEXT:  3c:        e59d0ff4
-; IASM-NEXT:	.byte 0xf4
-; IASM-NEXT:	.byte 0xf
-; IASM-NEXT:	.byte 0x8d
-; IASM-NEXT:	.byte 0xe5
-
-; ASM-NEXT:     movw    r1, #65535
-; DIS-NEXT:  40:        e30f1fff
-; IASM-NEXT:	.byte 0xf4
-; IASM-NEXT:	.byte 0xf
-; IASM-NEXT:	.byte 0x9d
-; IASM-NEXT:	.byte 0xe5
-
-; ASM-NEXT:     movt    r1, #65535
-; DIS-NEXT:  44:        e34f1fff
-; IASM-NEXT:	.byte 0xff
-; IASM-NEXT:	.byte 0x1f
-; IASM-NEXT:	.byte 0xf
-; IASM-NEXT:	.byte 0xe3
+; IASM-NEXT:    .byte 0xf4
+; IASM-NEXT:    .byte 0xf
+; IASM-NEXT:    .byte 0x9d
+; IASM-NEXT:    .byte 0xe5
+
+; ASM-NEXT:     movw     r1, #1
+; DIS-NEXT:  40:        e3001001
+; IASM-NEXT:    .byte 0x1
+; IASM-NEXT:    .byte 0x10
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0xe3
+
+; ASM-NEXT:     movt    r1, #8
+; DIS-NEXT:  44:        e3401008
+; IASM-NEXT:    .byte 0x8
+; IASM-NEXT:    .byte 0x10
+; IASM-NEXT:    .byte 0x40
+; IASM-NEXT:    .byte 0xe3

 ; ASM-NEXT:     str     r1, [r0]
 ; DIS-NEXT:  48:        e5801000
-; IASM-NEXT:	.byte 0xff
-; IASM-NEXT:	.byte 0x1f
-; IASM-NEXT:	.byte 0x4f
-; IASM-NEXT:	.byte 0xe3
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0x10
+; IASM-NEXT:    .byte 0x80
+; IASM-NEXT:    .byte 0xe5

  ret void

 ; ASM-NEXT:     movw    ip, #4088
 ; DIS-NEXT:  4c:        e300cff8
-; IASM-NEXT:	.byte 0x0
-; IASM-NEXT:	.byte 0x10
-; IASM-NEXT:	.byte 0x80
-; IASM-NEXT:	.byte 0xe5
+; IASM-NEXT:    .byte 0xf8
+; IASM-NEXT:    .byte 0xcf
+; IASM-NEXT:    .byte 0x0
+; IASM-NEXT:    .byte 0xe3

 ; ASM-NEXT:     add     sp, sp, ip
 ; DIS-NEXT:  50:        e08dd00c
-; IASM-NEXT:	.byte 0xf8
-; IASM-NEXT:	.byte 0xcf
-; IASM-NEXT:	.byte 0x0
-; IASM-NEXT:	.byte 0xe3
+; IASM-NEXT:    .byte 0xc
+; IASM-NEXT:    .byte 0xd0
+; IASM-NEXT:    .byte 0x8d
+; IASM-NEXT:    .byte 0xe0

 ; ASM-NEXT:     bx      lr
 ; DIS-NEXT:  54:        e12fff1e
-; IASM-NEXT:	.byte 0xc
-; IASM-NEXT:	.byte 0xd0
-; IASM-NEXT:	.byte 0x8d
-; IASM-NEXT:	.byte 0xe0
-
+; IASM-NEXT:    .byte 0x1e
+; IASM-NEXT:    .byte 0xff
+; IASM-NEXT:    .byte 0x2f
+; IASM-NEXT:    .byte 0xe1
 }
--- a/tests_lit/assembler/arm32/sandboxing.ll
+++ b/tests_lit/assembler/arm32/sandboxing.ll
@@ -9,7 +9,9 @@
 ; RUN:   -ffunction-sections  | FileCheck %s

 declare void @call_target()
-declare void @call_target1(i32 %arg)
+declare void @call_target1(i32 %arg0)
+declare void @call_target2(i32 %arg0, i32 %arg1)
+declare void @call_target3(i32 %arg0, i32 %arg1, i32 %arg2)
 @global_short = internal global [2 x i8] zeroinitializer

 ; A direct call sequence uses the right mask and register-call sequence.
@@ -60,7 +62,7 @@ entry:
 ; CHECK-LABEL: bundle_lock_without_padding
 ; CHECK: 0: {{.*}} movw
 ; CHECK-NEXT: movt
-; CHECK-NEXT: movw
+; CHECK-NEXT: mov
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: bic [[REG:r[0-9]+]], {{.*}} 0xc0000000
 ; CHECK-NEXT: strh {{.*}}, {{[[]}}[[REG]]
@@ -91,18 +93,16 @@ define internal void @bundle_lock_align_to_end_padding_0() {
 entry:
  call void @call_target()
  ; bundle boundary
-  store i16 0, i16* undef, align 1
-  call void @call_target()
+  call void @call_target3(i32 1, i32 2, i32 3)
  ; bundle boundary
  ret void
 }
 ; CHECK-LABEL: bundle_lock_align_to_end_padding_0
 ; CHECK: c: {{.*}} bl {{.*}} call_target
-; CHECK-NEXT: movw
-; CHECK-NEXT: movw
-; CHECK-NEXT: bic [[REG:r[0-9]+]]
-; CHECK-NEXT: strh {{.*}}, {{[[]}}[[REG]]
-; CHECK: {{[0-9]+}}c: {{.*}} bl {{.*}} call_target
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
+; CHECK-NEXT: {{[0-9]+}}c: {{.*}} bl {{.*}} call_target3
 ; CHECK-NEXT: add sp
 ; CHECK-NEXT: bic sp, {{.*}} 0xc0000000
 ; CHECK-NEXT: pop
@@ -114,41 +114,29 @@ define internal void @bundle_lock_align_to_end_padding_1() {
 entry:
  call void @call_target()
  ; bundle boundary
-  store i32 65536, i32* undef, align 1
-  ; bundle boundary
-  call void @call_target()
+  call void @call_target2(i32 1, i32 2)
  ; bundle boundary
  ret void
 }
 ; CHECK-LABEL: bundle_lock_align_to_end_padding_1
 ; CHECK: {{[0-9]*}}c: {{.*}} bl {{.*}} call_target
-; CHECK-NEXT: movw [[BASE:r[0-9]+]]
-; CHECK-NEXT: movw [[REG:r[0-9]+]], #0
-; CHECK-NEXT: movt [[REG]], #1
+; CHECK-NEXT: mov
+; CHECK-NEXT: mov
 ; CHECK-NEXT: nop
-; CHECK-NEXT: bic [[BASE]], [[BASE]], {{.*}} 0xc0000000
-; CHECK-NEXT: str [[REG]], {{[[]}}[[BASE]]
-; CHECK-NEXT: nop
-; CHECK-NEXT: bl {{.*}} call_target
+; CHECK-NEXT: bl {{.*}} call_target2
 ; CHECK: {{[0-9]+}}0: {{.*}} bic lr, lr, {{.*}} 0xc000000f
 ; CHECK-NEXT: {{.*}} bx lr

 ; Bundle lock align_to_end with two bunches of padding.
-define internal void @bundle_lock_align_to_end_padding_2(i32 %target) {
+define internal void @bundle_lock_align_to_end_padding_2() {
 entry:
-  call void @call_target1(i32 1)
+  call void @call_target2(i32 1, i32 2)
  ; bundle boundary
-  %__1 = inttoptr i32 %target to void (i32, i32, i32)*
-  call void %__1(i32 2, i32 3, i32 4)
  ret void
 }
 ; CHECK-LABEL: bundle_lock_align_to_end_padding_2
-; CHECK: {{[0-9]+}}0:
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: bl {{.*}} call_target
-; CHECK: {{[0-9]+}}c: {{.*}} movw r2, #4
+; CHECK: mov
+; CHECK-NEXT: mov
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: nop
-; CHECK-NEXT: bic [[REG:r[0-9]+]], [[REG]], {{.*}} 0xc000000f
-; CHECK-NEXT: {{.*}} blx [[REG]]
+; CHECK-NEXT: bl {{.*}} call_target2
--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -89,17 +89,17 @@ entry:

 ; ARM32-LABEL: pass64BitArg
 ; ARM32:      str     {{.*}}, [sp]
-; ARM32:      movw    r2, #123
+; ARM32:      mov     r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
 ; ARM32:      str     {{.*}}, [sp]
 ; ARM32:      {{mov|ldr}} r0
 ; ARM32:      {{mov|ldr}} r1
-; ARM32:      movw    r2, #123
+; ARM32:      mov     r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline
 ; ARM32:      str     {{.*}}, [sp]
 ; ARM32:      {{mov|ldr}} r0
 ; ARM32:      {{mov|ldr}} r1
-; ARM32:      movw    r2, #123
+; ARM32:      mov     r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline


@@ -142,7 +142,7 @@ entry:
 ; ARM32:      str     [[REG2]], [sp]
 ; ARM32:      {{mov|ldr}} r0
 ; ARM32:      {{mov|ldr}} r1
-; ARM32:      movw    r2, #123
+; ARM32:      mov     r2, #123
 ; ARM32:      bl      {{.*}} ignore64BitArgNoInline

 define internal i32 @pass64BitUndefArg() {
@@ -162,9 +162,9 @@ entry:
 ; OPTM1: call {{.*}} R_{{.*}} ignore64BitArgNoInline
 ; ARM32-LABEL: pass64BitUndefArg
 ; ARM32: sub sp
-; ARM32: movw {{.*}}, #0
+; ARM32: mov {{.*}}, #0
 ; ARM32: str
-; ARM32: movw {{.*}}, #123
+; ARM32: mov {{.*}}, #123
 ; ARM32: bl {{.*}} ignore64BitArgNoInline

 define internal i64 @return64BitArg(i64 %padding, i64 %a) {

--- a/tests_lit/llvm2ice_tests/arith.ll
+++ b/tests_lit/llvm2ice_tests/arith.ll
@@ -117,7 +117,7 @@ entry:
 ; CHECK-LABEL: MulImm
 ; CHECK: imul e{{.*}},e{{.*}},0x63
 ; ARM32-LABEL: MulImm
-; ARM32-OPTM1: movw {{.*}}, #99
+; ARM32-OPTM1: mov {{.*}}, #99
 ; ARM32-OPTM1: mul r{{.*}}, r{{.*}}, r{{.*}}
 ; ARM32-OPT2: rsb [[T:r[0-9]+]], [[S:r[0-9]+]], [[S]], lsl #2
 ; ARM32-OPT2-DAG: add [[T]], [[T]], [[S]], lsl #7
@@ -141,8 +141,8 @@ entry:
 ; CHECK-NOT: mul {{[0-9]+}}
 ;
 ; ARM32-LABEL: MulImm64
-; ARM32: movw {{.*}}, #99
-; ARM32: movw {{.*}}, #0
+; ARM32: mov {{.*}}, #99
+; ARM32: mov {{.*}}, #0
 ; ARM32: mul r
 ; ARM32: mla r
 ; ARM32: umull r

--- a/tests_lit/llvm2ice_tests/fp.convert.ll
+++ b/tests_lit/llvm2ice_tests/fp.convert.ll
@@ -358,7 +358,7 @@ entry:
 ; CHECK: cvtsi2sd {{.*[^1]}}
 ; CHECK: fld
 ; ARM32-LABEL: signed32ToDoubleConst
-; ARM32-DAG: movw [[CONST:r[0-9]+]], #123
+; ARM32-DAG: mov [[CONST:r[0-9]+]], #123
 ; ARM32-DAG: vmov [[SRC:s[0-9]+]], [[CONST]]
 ; ARM32-DAG: vcvt.f64.s32 {{d[0-9]+}}, [[SRC]]


--- a/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-atomic-intrinsics.ll
@@ -1329,7 +1329,7 @@ entry:
 ; CHECK-LABEL: test_atomic_is_lock_free
 ; CHECK: mov {{.*}},0x1
 ; ARM32-LABEL: test_atomic_is_lock_free
-; ARM32: movw {{.*}}, #1
+; ARM32: mov {{.*}}, #1

 define internal i32 @test_not_lock_free(i32 %iptr) {
 entry:

--- a/tests_lit/llvm2ice_tests/return_immediates.ll
+++ b/tests_lit/llvm2ice_tests/return_immediates.ll
@@ -303,8 +303,8 @@ define internal i64 @ret_64bits_shift_left0() {
 ; CHECK-NEXT: mov eax,0xff
 ; CHECK-NEXT: mov edx,0xff
 ; ARM32-LABEL: ret_64bits_shift_left0
-; ARM32-NEXT: movw r0, #255
-; ARM32-NEXT: movw r1, #255
+; ARM32-NEXT: mov r0, #255
+; ARM32-NEXT: mov r1, #255
 ; MIPS32-LABEL: ret_64bits_shift_left0
 ; MIPS32-NEXT: li	v0,255
 ; MIPS32-NEXT: li	v1,255

--- a/tests_lit/llvm2ice_tests/switch-opt.ll
+++ b/tests_lit/llvm2ice_tests/switch-opt.ll
@@ -130,5 +130,5 @@ sw.default:
  ret i32 20
 }
 ; ARM32-LABEL: testSwitchUndef64
-; ARM32: movw {{.*}}, #0
-; ARM32: movw {{.*}}, #0
+; ARM32: mov {{.*}}, #0
+; ARM32: mov {{.*}}, #0