Subzero. ARM32. Improve constant lowering.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1438773004 .

Subzero. ARM32. Improve constant lowering.
ccea793f · John Porto · a98091d4 · ccea793f · ccea793f · ccea793f
Commit ccea793f authored Nov 17, 2015 by John Porto
11 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -284,6 +284,87 @@ bool OperandARM32FlexImm::canHoldImm(uint32_t Immediate, uint32_t *RotateAmt,
  return false;
 }
+OperandARM32FlexFpImm::OperandARM32FlexFpImm(Cfg * /*Func*/, Type Ty, // The Cfg argument is accepted but unused.
+                                             uint32_t ModifiedImm)
+    : OperandARM32Flex(kFlexFpImm, Ty), ModifiedImm(ModifiedImm) {} // Tags the operand as kFlexFpImm and stores the encoded immediate.
+bool OperandARM32FlexFpImm::canHoldImm(Operand *C, uint32_t *ModifiedImm) { // Tests whether fp constant C is encodable as a modified immediate.
+  switch (C->getType()) { // On success the abcdefgh encoding is written to *ModifiedImm; on failure it is left untouched.
+  default:
+    llvm::report_fatal_error("Unhandled fp constant type."); // Only f32 and f64 constants are handled.
+  case IceType_f32: {
+    // We violate llvm naming conventions a bit here so that the constants are
+    // named after the bit fields they represent. See "A7.5.1 Operation of
+    // modified immediate constants, Floating-point" in the ARM ARM.
+    static constexpr uint32_t a = 0x80000000u; // bit 31
+    static constexpr uint32_t B = 0x40000000; // bit 30
+    static constexpr uint32_t bbbbb = 0x3E000000; // bits 29..25
+    static constexpr uint32_t cdefgh = 0x01F80000; // bits 24..19
+    static constexpr uint32_t AllowedBits = a | B | bbbbb | cdefgh; // bits 18..0 must be clear
+    static_assert(AllowedBits == 0xFFF80000u,
+                  "Invalid mask for f32 modified immediates.");
+    const float F32 = llvm::cast<ConstantFloat>(C)->getValue();
+    const uint32_t I32 = *reinterpret_cast<const uint32_t *>(&F32); // Raw bit pattern of F32. NOTE(review): pointer-cast type punning; a memcpy-style bit copy would avoid strict-aliasing concerns.
+    if (I32 & ~AllowedBits) {
+      // The constant has bits set outside the a/B/bbbbb/cdefgh fields.
+      return false;
+    }
+    if ((I32 & bbbbb) != bbbbb && (I32 & bbbbb)) {
+      // The bbbbb bits are neither all 0 nor all 1.
+      return false;
+    }
+    if (((I32 & B) != 0) == ((I32 & bbbbb) != 0)) {
+      // B must be the complement of b; here B ^ b == 0.
+      return false;
+    }
+    *ModifiedImm = ((I32 & a) ? 0x80 : 0x00) | ((I32 & bbbbb) ? 0x40 : 0x00) | // a -> bit 7, b -> bit 6
+                   ((I32 & cdefgh) >> 19); // cdefgh -> bits 5..0
+    return true;
+  }
+  case IceType_f64: {
+    static constexpr uint32_t a = 0x80000000u; // The masks below apply to the upper 32 bits of the double: bit 31
+    static constexpr uint32_t B = 0x40000000; // bit 30
+    static constexpr uint32_t bbbbbbbb = 0x3FC00000; // bits 29..22
+    static constexpr uint32_t cdefgh = 0x003F0000; // bits 21..16
+    static constexpr uint32_t AllowedBits = a | B | bbbbbbbb | cdefgh; // bits 15..0 of the upper word must be clear
+    static_assert(AllowedBits == 0xFFFF0000u,
+                  "Invalid mask for f64 modified immediates.");
+    const double F64 = llvm::cast<ConstantDouble>(C)->getValue();
+    const uint64_t I64 = *reinterpret_cast<const uint64_t *>(&F64); // Raw bit pattern of F64. NOTE(review): same pointer-cast type punning as the f32 case.
+    if (I64 & 0xFFFFFFFFu) {
+      // The lower 32 bits of the double must all be zero.
+      return false;
+    }
+    const uint32_t I32 = I64 >> 32; // Upper 32 bits of the bit pattern.
+    if (I32 & ~AllowedBits) {
+      // The constant has bits set outside the a/B/bbbbbbbb/cdefgh fields.
+      return false;
+    }
+    if ((I32 & bbbbbbbb) != bbbbbbbb && (I32 & bbbbbbbb)) {
+      // The bbbbbbbb bits are neither all 0 nor all 1.
+      return false;
+    }
+    if (((I32 & B) != 0) == ((I32 & bbbbbbbb) != 0)) {
+      // B must be the complement of b; here B ^ b == 0.
+      return false;
+    }
+    *ModifiedImm = ((I32 & a) ? 0x80 : 0x00) | // a -> bit 7
+                   ((I32 & bbbbbbbb) ? 0x40 : 0x00) | ((I32 & cdefgh) >> 16); // b -> bit 6, cdefgh -> bits 5..0
+    return true;
+  }
+  }
+}
+OperandARM32FlexFpZero::OperandARM32FlexFpZero(Cfg * /*Func*/, Type Ty) // The Cfg argument is accepted but unused.
+    : OperandARM32Flex(kFlexFpZero, Ty) {} // Tags the operand as kFlexFpZero; no other state is stored.
 OperandARM32FlexReg::OperandARM32FlexReg(Cfg *Func, Type Ty, Variable *Reg,
                                         ShiftKind ShiftOp, Operand *ShiftAmt)
    : OperandARM32Flex(kFlexReg, Ty), Reg(Reg), ShiftOp(ShiftOp),
@@ -557,15 +638,18 @@ template <> void InstARM32Tst::emitIAS(const Cfg *Func) const {
    emitUsingTextFixup(Func);
 }
-InstARM32Vcmp::InstARM32Vcmp(Cfg *Func, Variable *Src0, Variable *Src1,
+InstARM32Vcmp::InstARM32Vcmp(Cfg *Func, Variable *Src0, Operand *Src1,
                             CondARM32::Cond Predicate)
    : InstARM32Pred(Func, InstARM32::Vcmp, 2, nullptr, Predicate) {
+  HasSideEffects = true;
  addSource(Src0);
  addSource(Src1);
 }
 InstARM32Vmrs::InstARM32Vmrs(Cfg *Func, CondARM32::Cond Predicate)
-    : InstARM32Pred(Func, InstARM32::Vmrs, 0, nullptr, Predicate) {}
+    : InstARM32Pred(Func, InstARM32::Vmrs, 0, nullptr, Predicate) {
+  HasSideEffects = true;
+}
 InstARM32Vabs::InstARM32Vabs(Cfg *Func, Variable *Dest, Variable *Src,
                             CondARM32::Cond Predicate)
@@ -605,6 +689,7 @@ template <> const char *InstARM32Lsr::Opcode = "lsr";
 template <> const char *InstARM32Mul::Opcode = "mul";
 template <> const char *InstARM32Orr::Opcode = "orr";
 template <> const char *InstARM32Rsb::Opcode = "rsb";
+template <> const char *InstARM32Rsc::Opcode = "rsc";
 template <> const char *InstARM32Sbc::Opcode = "sbc";
 template <> const char *InstARM32Sdiv::Opcode = "sdiv";
 template <> const char *InstARM32Sub::Opcode = "sub";
@@ -613,11 +698,13 @@ template <> const char *InstARM32Udiv::Opcode = "udiv";
 template <> const char *InstARM32Vadd::Opcode = "vadd";
 template <> const char *InstARM32Vdiv::Opcode = "vdiv";
 template <> const char *InstARM32Vmul::Opcode = "vmul";
+template <> const char *InstARM32Veor::Opcode = "veor";
 template <> const char *InstARM32Vsub::Opcode = "vsub";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
 // Cmp-like ops
+template <> const char *InstARM32Cmn::Opcode = "cmn";
 template <> const char *InstARM32Cmp::Opcode = "cmp";
 template <> const char *InstARM32Tst::Opcode = "tst";
@@ -1701,6 +1788,67 @@ void OperandARM32FlexImm::dump(const Cfg * /* Func */, Ostream &Str) const {
  Str << "#(" << Imm << " ror 2*" << RotateAmt << ")";
 }
+namespace {
+static constexpr uint32_t a = 0x80; // bit 7 of the encoded immediate
+static constexpr uint32_t b = 0x40; // bit 6
+static constexpr uint32_t cdefgh = 0x3F; // bits 5..0
+static constexpr uint32_t AllowedBits = a | b | cdefgh;
+static_assert(AllowedBits == 0xFF,
+              "Invalid mask for f32/f64 constant rematerialization.");
+// There's no loss in always returning the modified immediate as float.
+// TODO(jpp): returning a double causes problems when outputting the constants
+// for filetype=asm. Why?
+float materializeFloatImmediate(uint32_t ModifiedImm) { // Expands an abcdefgh encoding into the f32 bit pattern it denotes and returns that float.
+  const uint32_t Ret = ((ModifiedImm & a) ? 0x80000000 : 0) | // a -> bit 31
+                       ((ModifiedImm & b) ? 0x3E000000 : 0x40000000) | // b set -> bits 29..25; b clear -> bit 30
+                       ((ModifiedImm & cdefgh) << 19); // cdefgh -> bits 24..19
+  return *reinterpret_cast<const float *>(&Ret); // NOTE(review): pointer-cast type punning; a memcpy-style bit copy would avoid strict-aliasing concerns.
+}
+} // end of anonymous namespace
+void OperandARM32FlexFpImm::emit(const Cfg *Func) const { // Writes "#<value> @ Modified: <encoding>" to the context's emit stream.
+  if (!BuildDefs::dump()) // Nothing is written when BuildDefs::dump() is false.
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Invalid flex fp imm type."); // Only f32 and f64 operands are handled.
+  case IceType_f64: // f64 shares the f32 path: the value is always materialized as a float.
+  case IceType_f32:
+    Str << "#" << materializeFloatImmediate(ModifiedImm)
+        << " @ Modified: " << ModifiedImm;
+    break;
+  }
+}
+void OperandARM32FlexFpImm::dump(const Cfg * /*Func*/, Ostream &Str) const { // Writes "#<value>" followed by the vector-width string for Ty to Str.
+  if (!BuildDefs::dump()) // Nothing is written when BuildDefs::dump() is false.
+    return;
+  Str << "#" << materializeFloatImmediate(ModifiedImm)
+      << InstARM32::getVecWidthString(Ty);
+}
+void OperandARM32FlexFpZero::emit(const Cfg *Func) const { // Writes the literal "#0.0" to the context's emit stream.
+  if (!BuildDefs::dump()) // Nothing is written when BuildDefs::dump() is false.
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  switch (Ty) {
+  default:
+    llvm::report_fatal_error("Invalid flex fp imm type."); // Only f32 and f64 operands are handled.
+  case IceType_f64:
+  case IceType_f32:
+    Str << "#0.0"; // Last case in the switch, so control simply falls out of it.
+  }
+}
+void OperandARM32FlexFpZero::dump(const Cfg * /*Func*/, Ostream &Str) const { // Writes "#0.0" followed by the vector-width string for Ty to Str.
+  if (!BuildDefs::dump()) // Nothing is written when BuildDefs::dump() is false.
+    return;
+  Str << "#0.0" << InstARM32::getVecWidthString(Ty);
+}
 void OperandARM32FlexReg::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
@@ -1741,6 +1889,7 @@ template class InstARM32ThreeAddrGPR<InstARM32::Lsr>;
 template class InstARM32ThreeAddrGPR<InstARM32::Mul>;
 template class InstARM32ThreeAddrGPR<InstARM32::Orr>;
 template class InstARM32ThreeAddrGPR<InstARM32::Rsb>;
+template class InstARM32ThreeAddrGPR<InstARM32::Rsc>;
 template class InstARM32ThreeAddrGPR<InstARM32::Sbc>;
 template class InstARM32ThreeAddrGPR<InstARM32::Sdiv>;
 template class InstARM32ThreeAddrGPR<InstARM32::Sub>;
@@ -1749,6 +1898,7 @@ template class InstARM32ThreeAddrGPR<InstARM32::Udiv>;
 template class InstARM32ThreeAddrFP<InstARM32::Vadd>;
 template class InstARM32ThreeAddrFP<InstARM32::Vdiv>;
 template class InstARM32ThreeAddrFP<InstARM32::Vmul>;
+template class InstARM32ThreeAddrFP<InstARM32::Veor>;
 template class InstARM32ThreeAddrFP<InstARM32::Vsub>;
 template class InstARM32LoadBase<InstARM32::Ldr>;
@@ -1768,6 +1918,7 @@ template class InstARM32UnaryopFP<InstARM32::Vsqrt>;
 template class InstARM32FourAddrGPR<InstARM32::Mla>;
 template class InstARM32FourAddrGPR<InstARM32::Mls>;
+template class InstARM32CmpLike<InstARM32::Cmn>;
 template class InstARM32CmpLike<InstARM32::Cmp>;
 template class InstARM32CmpLike<InstARM32::Tst>;

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -40,6 +40,8 @@ public:
    kMem,
    kFlexStart,
    kFlexImm = kFlexStart,
+    kFlexFpImm,
+    kFlexFpZero,
    kFlexReg,
    kFlexEnd = kFlexReg
  };
@@ -205,6 +207,59 @@ private:
  uint32_t RotateAmt;
 };
+/// Flexible operand holding a floating-point modified immediate, kept in its encoded form (ModifiedImm).
+class OperandARM32FlexFpImm : public OperandARM32Flex {
+  OperandARM32FlexFpImm() = delete; // Not default-constructible, copyable or assignable; instances come from create().
+  OperandARM32FlexFpImm(const OperandARM32FlexFpImm &) = delete;
+  OperandARM32FlexFpImm &operator=(const OperandARM32FlexFpImm &) = delete;
+public:
+  static OperandARM32FlexFpImm *create(Cfg *Func, Type Ty, // Placement-constructs the operand in storage obtained from Func->allocate().
+                                       uint32_t ModifiedImm) {
+    return new (Func->allocate<OperandARM32FlexFpImm>())
+        OperandARM32FlexFpImm(Func, Ty, ModifiedImm);
+  }
+  void emit(const Cfg *Func) const override;
+  using OperandARM32::dump; // Keeps the other OperandARM32::dump overloads visible alongside the override below.
+  void dump(const Cfg *Func, Ostream &Str) const override;
+  static bool classof(const Operand *Operand) { // True when Operand's kind is kFlexFpImm.
+    return Operand->getKind() == static_cast<OperandKind>(kFlexFpImm);
+  }
+  static bool canHoldImm(Operand *C, uint32_t *ModifiedImm); // Returns whether C is encodable; the encoding is returned through ModifiedImm.
+private:
+  OperandARM32FlexFpImm(Cfg *Func, Type Ty, uint32_t ModifiedImm);
+  uint32_t ModifiedImm; // The encoded immediate passed to the constructor.
+};
+/// An operand for representing the 0.0 immediate in vcmp; it carries no state beyond its type.
+class OperandARM32FlexFpZero : public OperandARM32Flex {
+  OperandARM32FlexFpZero() = delete; // Not default-constructible, copyable or assignable; instances come from create().
+  OperandARM32FlexFpZero(const OperandARM32FlexFpZero &) = delete;
+  OperandARM32FlexFpZero &operator=(const OperandARM32FlexFpZero &) = delete;
+public:
+  static OperandARM32FlexFpZero *create(Cfg *Func, Type Ty) { // Placement-constructs the operand in storage obtained from Func->allocate().
+    return new (Func->allocate<OperandARM32FlexFpZero>())
+        OperandARM32FlexFpZero(Func, Ty);
+  }
+  void emit(const Cfg *Func) const override;
+  using OperandARM32::dump; // Keeps the other OperandARM32::dump overloads visible alongside the override below.
+  void dump(const Cfg *Func, Ostream &Str) const override;
+  static bool classof(const Operand *Operand) { // True when Operand's kind is kFlexFpZero.
+    return Operand->getKind() == static_cast<OperandKind>(kFlexFpZero);
+  }
+private:
+  OperandARM32FlexFpZero(Cfg *Func, Type Ty);
+};
 /// Shifted register variant.
 class OperandARM32FlexReg : public OperandARM32Flex {
  OperandARM32FlexReg() = delete;
@@ -289,6 +344,7 @@ public:
    Bic,
    Br,
    Call,
+    Cmn,
    Cmp,
    Clz,
    Dmb,
@@ -312,6 +368,7 @@ public:
    Ret,
    Rev,
    Rsb,
+    Rsc,
    Sbc,
    Sdiv,
    Str,
@@ -328,6 +385,7 @@ public:
    Vcmp,
    Vcvt,
    Vdiv,
+    Veor,
    Vmrs,
    Vmul,
    Vsqrt,
@@ -609,6 +667,7 @@ private:
  InstARM32ThreeAddrGPR(Cfg *Func, Variable *Dest, Variable *Src0,
                        Operand *Src1, CondARM32::Cond Predicate, bool SetFlags)
      : InstARM32Pred(Func, K, 2, Dest, Predicate), SetFlags(SetFlags) {
+    HasSideEffects = SetFlags;
    addSource(Src0);
    addSource(Src1);
  }
@@ -741,6 +800,7 @@ private:
  InstARM32CmpLike(Cfg *Func, Variable *Src0, Operand *Src1,
                   CondARM32::Cond Predicate)
      : InstARM32Pred(Func, K, 2, nullptr, Predicate) {
+    HasSideEffects = true;
    addSource(Src0);
    addSource(Src1);
  }
@@ -759,6 +819,7 @@ using InstARM32Lsr = InstARM32ThreeAddrGPR<InstARM32::Lsr>;
 using InstARM32Mul = InstARM32ThreeAddrGPR<InstARM32::Mul>;
 using InstARM32Orr = InstARM32ThreeAddrGPR<InstARM32::Orr>;
 using InstARM32Rsb = InstARM32ThreeAddrGPR<InstARM32::Rsb>;
+using InstARM32Rsc = InstARM32ThreeAddrGPR<InstARM32::Rsc>;
 using InstARM32Sbc = InstARM32ThreeAddrGPR<InstARM32::Sbc>;
 using InstARM32Sdiv = InstARM32ThreeAddrGPR<InstARM32::Sdiv>;
 using InstARM32Sub = InstARM32ThreeAddrGPR<InstARM32::Sub>;
@@ -766,6 +827,7 @@ using InstARM32Udiv = InstARM32ThreeAddrGPR<InstARM32::Udiv>;
 using InstARM32Vadd = InstARM32ThreeAddrFP<InstARM32::Vadd>;
 using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
 using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
+using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
 using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>;
 using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
@@ -785,6 +847,7 @@ using InstARM32Uxt = InstARM32UnaryopGPR<InstARM32::Uxt, true>;
 using InstARM32Vsqrt = InstARM32UnaryopFP<InstARM32::Vsqrt>;
 using InstARM32Mla = InstARM32FourAddrGPR<InstARM32::Mla>;
 using InstARM32Mls = InstARM32FourAddrGPR<InstARM32::Mls>;
+using InstARM32Cmn = InstARM32CmpLike<InstARM32::Cmn>;
 using InstARM32Cmp = InstARM32CmpLike<InstARM32::Cmp>;
 using InstARM32Tst = InstARM32CmpLike<InstARM32::Tst>;
@@ -1178,12 +1241,18 @@ public:
    return new (Func->allocate<InstARM32Vcmp>())
        InstARM32Vcmp(Func, Src0, Src1, Predicate);
  }
+  static InstARM32Vcmp *create(Cfg *Func, Variable *Src0, // Overload taking an OperandARM32FlexFpZero as the second source.
+                               OperandARM32FlexFpZero *Src1,
+                               CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Vcmp>()) // Placement-constructs the instruction in storage obtained from Func->allocate().
+        InstARM32Vcmp(Func, Src0, Src1, Predicate);
+  }
  void emit(const Cfg *Func) const override;
  void dump(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, Vcmp); }
 private:
-  InstARM32Vcmp(Cfg *Func, Variable *Src0, Variable *Src1,
+  InstARM32Vcmp(Cfg *Func, Variable *Src0, Operand *Src1,
                CondARM32::Cond Predicate);
 };

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -140,7 +140,23 @@ public:
  bool hasCPUFeature(TargetARM32Features::ARM32InstructionSet I) const {
    return CPUFeatures.hasFeature(I);
  }
+  enum OperandLegalization {
+    Legal_None = 0,      ///< empty mask
+    Legal_Reg = 1 << 0,  ///< physical register, not stack location
+    Legal_Flex = 1 << 1, ///< A flexible operand2, which can hold rotated small
+                         ///< immediates, shifted registers, or modified fp imm.
+    Legal_Mem = 1 << 2,  ///< includes [r0, r1 lsl #2] as well as [sp, #12]
+    Legal_All = ~Legal_None ///< every bit set
+  };
+  using LegalMask = uint32_t;
  Operand *legalizeUndef(Operand *From, int32_t RegNum = Variable::NoRegister);
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+                    int32_t RegNum = Variable::NoRegister);
+  Variable *legalizeToReg(Operand *From, int32_t RegNum = Variable::NoRegister);
+  GlobalContext *getCtx() const { return Ctx; }
 protected:
  explicit TargetARM32(Cfg *Func);
@@ -154,6 +170,8 @@ protected:
  void lowerAlloca(const InstAlloca *Inst) override;
  SafeBoolChain lowerInt1Arithmetic(const InstArithmetic *Inst);
+  void lowerInt64Arithmetic(InstArithmetic::OpKind Op, Variable *Dest,
+                            Operand *Src0, Operand *Src1);
  void lowerArithmetic(const InstArithmetic *Inst) override;
  void lowerAssign(const InstAssign *Inst) override;
  void lowerBr(const InstBr *Inst) override;
@@ -192,6 +210,12 @@ protected:
  CondWhenTrue lowerFcmpCond(const InstFcmp *Instr);
  void lowerFcmp(const InstFcmp *Instr) override;
+  CondWhenTrue lowerInt8AndInt16IcmpCond(InstIcmp::ICond Condition,
+                                         Operand *Src0, Operand *Src1);
+  CondWhenTrue lowerInt32IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
+                                  Operand *Src1);
+  CondWhenTrue lowerInt64IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
+                                  Operand *Src1);
  CondWhenTrue lowerIcmpCond(const InstIcmp *Instr);
  void lowerIcmp(const InstIcmp *Instr) override;
  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
@@ -211,18 +235,6 @@ protected:
  void randomlyInsertNop(float Probability,
                         RandomNumberGenerator &RNG) override;
-  enum OperandLegalization {
-    Legal_None = 0,
-    Legal_Reg = 1 << 0,  /// physical register, not stack location
-    Legal_Flex = 1 << 1, /// A flexible operand2, which can hold rotated small
-                         /// immediates, or shifted registers.
-    Legal_Mem = 1 << 2,  /// includes [r0, r1 lsl #2] as well as [sp, #12]
-    Legal_All = ~Legal_None
-  };
-  using LegalMask = uint32_t;
-  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
-                    int32_t RegNum = Variable::NoRegister);
-  Variable *legalizeToReg(Operand *From, int32_t RegNum = Variable::NoRegister);
  OperandARM32Mem *formMemoryOperand(Operand *Ptr, Type Ty);
  Variable64On32 *makeI64RegPair();
@@ -299,6 +311,10 @@ protected:
  void _br(InstARM32Label *Label, CondARM32::Cond Condition) {
    Context.insert(InstARM32Br::create(Func, Label, Condition));
  }
+  void _cmn(Variable *Src0, Operand *Src1, // Creates an InstARM32Cmn on Src0/Src1 and inserts it into Context.
+            CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    Context.insert(InstARM32Cmn::create(Func, Src0, Src1, Pred));
+  }
  void _cmp(Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
@@ -332,6 +348,12 @@ protected:
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Lsl::create(Func, Dest, Src0, Src1, Pred));
  }
+  void _lsls(Variable *Dest, Variable *Src0, Operand *Src1, // Like _lsl, but creates the InstARM32Lsl with SetFlags = true.
+             CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    constexpr bool SetFlags = true;
+    Context.insert(
+        InstARM32Lsl::create(Func, Dest, Src0, Src1, Pred, SetFlags));
+  }
  void _lsr(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Lsr::create(Func, Dest, Src0, Src1, Pred));
@@ -654,6 +676,22 @@ protected:
  void _ret(Variable *LR, Variable *Src0 = nullptr) {
    Context.insert(InstARM32Ret::create(Func, LR, Src0));
  }
+  void _rscs(Variable *Dest, Variable *Src0, Operand *Src1, // Creates an InstARM32Rsc with SetFlags = true and inserts it into Context.
+             CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    constexpr bool SetFlags = true;
+    Context.insert(
+        InstARM32Rsc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
+  }
+  void _rsc(Variable *Dest, Variable *Src0, Operand *Src1, // Creates an InstARM32Rsc (no explicit SetFlags argument) and inserts it into Context.
+            CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    Context.insert(InstARM32Rsc::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _rsbs(Variable *Dest, Variable *Src0, Operand *Src1, // Like _rsb, but creates the InstARM32Rsb with SetFlags = true.
+             CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    constexpr bool SetFlags = true;
+    Context.insert(
+        InstARM32Rsb::create(Func, Dest, Src0, Src1, Pred, SetFlags));
+  }
  void _rsb(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Rsb::create(Func, Dest, Src0, Src1, Pred));
@@ -745,12 +783,19 @@ protected:
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vcmp::create(Func, Src0, Src1, Pred));
  }
+  void _vcmp(Variable *Src0, OperandARM32FlexFpZero *FpZero, // Overload that creates an InstARM32Vcmp of Src0 against an FpZero operand.
+             CondARM32::Cond Pred = CondARM32::AL) { // Pred defaults to CondARM32::AL.
+    Context.insert(InstARM32Vcmp::create(Func, Src0, FpZero, Pred));
+  }
  void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vmrs::create(Func, Pred));
  }
  void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1));
  }
+  void _veor(Variable *Dest, Variable *Src0, Variable *Src1) { // Creates an InstARM32Veor and inserts it into Context; takes no predicate argument.
+    Context.insert(InstARM32Veor::create(Func, Dest, Src0, Src1));
+  }
  void _vsqrt(Variable *Dest, Variable *Src,
              CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vsqrt::create(Func, Dest, Src, Pred));

--- a/src/IceTargetLoweringX86BaseImpl.h
+++ b/src/IceTargetLoweringX86BaseImpl.h
@@ -29,7 +29,6 @@
 #include "IceUtils.h"
 #include "llvm/Support/MathExtras.h"
-#include <cmath> // signbit()
 #include <stack>
 namespace Ice {
@@ -5506,16 +5505,6 @@ Variable *TargetX86Base<Machine>::copyToReg(Operand *Src, int32_t RegNum) {
  return Reg;
 }
-namespace {
-template <typename T> bool isPositiveZero(T Val) {
-  static_assert(std::is_floating_point<T>::value,
-                "Input type must be floating point");
-  return Val == 0 && !std::signbit(Val);
-}
-} // end of anonymous namespace
 template <class Machine>
 Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed,
                                          int32_t RegNum) {
@@ -5609,10 +5598,10 @@ Operand *TargetX86Base<Machine>::legalize(Operand *From, LegalMask Allowed,
    // operand.
    if (isScalarFloatingType(Ty)) {
      if (auto *ConstFloat = llvm::dyn_cast<ConstantFloat>(Const)) {
-        if (isPositiveZero(ConstFloat->getValue()))
+        if (Utils::isPositiveZero(ConstFloat->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      } else if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(Const)) {
-        if (isPositiveZero(ConstDouble->getValue()))
+        if (Utils::isPositiveZero(ConstDouble->getValue()))
          return makeZeroedRegister(Ty, RegNum);
      }
      Variable *Base = nullptr;

--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -16,6 +16,7 @@
 #define SUBZERO_SRC_ICEUTILS_H
 #include <climits>
+#include <cmath> // std::signbit()
 namespace Ice {
@@ -117,6 +118,13 @@ public:
      return value;
    return (value >> shift) | (value << (32 - shift));
  }
+  /// Returns true only if Val is +0.0 (false for -0.0 and for nonzero values). T must be a floating point type.
+  template <typename T> static bool isPositiveZero(T Val) {
+    static_assert(std::is_floating_point<T>::value, // Rejects non-floating-point T at compile time.
+                  "Input type must be floating point");
+    return Val == 0 && !std::signbit(Val); // -0.0 also compares equal to 0, so the sign bit is checked to exclude it.
+  }
 };
 } // end of namespace Ice

--- a/tests_lit/llvm2ice_tests/64bit.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/64bit.pnacl.ll
@@ -512,13 +512,13 @@ entry:
 ; OPTM1: sar {{.*}},0x1f
 ; ARM32-LABEL: shr64BitSigned
-; ARM32: lsr     [[T0:r[0-9]+]], r0, r2
+; ARM32: lsr     [[T0:r[0-9]+]], r{{[0-9]+}}, r{{[0-9]+}}
-; ARM32: rsb     [[T1:r[0-9]+]], r2, #32
+; ARM32: rsb     [[T1:r[0-9]+]], r{{[0-9]+}}, #32
-; ARM32: orr     r0, [[T0]], r1, lsl [[T1]]
+; ARM32: orr     r{{[0-9]+}}, [[T0]], r{{[0-9]+}}, lsl [[T1]]
-; ARM32: sub     [[T2:r[0-9]+]], r2, #32
+; ARM32: sub     [[T2:r[0-9]+]], r{{[0-9]+}}, #32
 ; ARM32: cmp     [[T2]], #0
-; ARM32: asrge   r0, r1, [[T2]]
+; ARM32: asrge   r{{[0-9]+}}, r{{[0-9]+}}, [[T2]]
-; ARM32: asr     r{{[0-9]+}}, r1, r2
+; ARM32: asr     r{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}}
 define internal i32 @shr64BitSignedTrunc(i64 %a, i64 %b) {
 entry:

--- a/tests_lit/llvm2ice_tests/arith.ll
+++ b/tests_lit/llvm2ice_tests/arith.ll
@@ -117,7 +117,7 @@ entry:
 ; CHECK-LABEL: MulImm
 ; CHECK: imul e{{.*}},e{{.*}},0x63
 ; ARM32-LABEL: MulImm
-; ARM32: mov {{.*}}, #99
+; ARM32: movw {{.*}}, #99
 ; ARM32: mul r{{.*}}, r{{.*}}, r{{.*}}
 ; MIPS32-LABEL: MulImm
 ; MIPS32: mul

--- a/tests_lit/llvm2ice_tests/bool-folding.ll
+++ b/tests_lit/llvm2ice_tests/bool-folding.ll
@@ -169,8 +169,7 @@ entry:
 ; CHECK: cmovl
 ; ARM32-LABEL: fold_cmp_select_64_undef
 ; ARM32: mov
-; ARM32: mov
+; ARM32: rsbs r{{[0-9]+}}, r{{[0-9]+}}, #0
-; ARM32: cmp {{r[0-9]+}}, r0
 ; ARM32: movlt
 ; ARM32: movlt
 ; ARM32: bx lr

--- a/tests_lit/llvm2ice_tests/fp.arm.call.ll
+++ b/tests_lit/llvm2ice_tests/fp.arm.call.ll
--- a/tests_lit/llvm2ice_tests/unreachable.ll
+++ b/tests_lit/llvm2ice_tests/unreachable.ll
@@ -39,7 +39,7 @@ return:                                           ; preds = %entry
 ; CHECK: ret
 ; ARM32-LABEL: divide
-; ARM32: cmp
+; ARM32: tst
 ; ARM32: .word 0xe7fedef0
 ; ARM32: bl {{.*}} __divsi3
 ; ARM32: bx lr