Subzero. ARM32 Fcmp lowering.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1356763004 .

Subzero. ARM32 Fcmp lowering.
2f5534f1 · John Porto · be87b2ec · 2f5534f1 · 2f5534f1 · 2f5534f1
Commit 2f5534f1 authored Sep 18, 2015 by John Porto
6 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -382,6 +382,16 @@ InstARM32Vcvt::InstARM32Vcvt(Cfg *Func, Variable *Dest, Variable *Src,
  addSource(Src);
 }
+InstARM32Vcmp::InstARM32Vcmp(Cfg *Func, Variable *Src0, Variable *Src1,
+                             CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Vcmp, 2, nullptr, Predicate) {
+  addSource(Src0);
+  addSource(Src1);
+}
+InstARM32Vmrs::InstARM32Vmrs(Cfg *Func, CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Vmrs, 0, nullptr, Predicate) {}
 // ======================== Dump routines ======================== //
 // Two-addr ops
@@ -507,8 +517,7 @@ void InstARM32Vmov::emitMultiDestSingleSource(const Cfg *Func) const {
  assert(!llvm::isa<OperandARM32Mem>(Src0));
  Str << "\t"
-      << "vmov"
+      << "vmov" << getPredicate() << "\t";
-      << "\t";
  Dest0->emit(Func);
  Str << ", ";
  Dest1->emit(Func);
@@ -529,8 +538,7 @@ void InstARM32Vmov::emitSingleDestMultiSource(const Cfg *Func) const {
  assert(!llvm::isa<OperandARM32Mem>(Src1));
  Str << "\t"
-      << "vmov"
+      << "vmov" << getPredicate() << "\t";
-      << "\t";
  Dest0->emit(Func);
  Str << ", ";
  Src0->emit(Func);
@@ -549,6 +557,14 @@ bool isVariableWithoutRegister(const Operand *Op) {
 bool isMemoryAccess(Operand *Op) {
  return isVariableWithoutRegister(Op) || llvm::isa<OperandARM32Mem>(Op);
 }
+bool isMoveBetweenCoreAndVFPRegisters(Variable *Dest, Operand *Src) {
+  const Type DestTy = Dest->getType();
+  const Type SrcTy = Src->getType();
+  assert(!(isScalarIntegerType(DestTy) && isScalarIntegerType(SrcTy)) &&
+         "At most one of vmov's operands can be a core register.");
+  return isScalarIntegerType(DestTy) || isScalarIntegerType(SrcTy);
+}
 } // end of anonymous namespace
 void InstARM32Vmov::emitSingleDestSingleSource(const Cfg *Func) const {
@@ -559,7 +575,14 @@ void InstARM32Vmov::emitSingleDestSingleSource(const Cfg *Func) const {
  if (Dest->hasReg()) {
    Operand *Src0 = getSrc(0);
    const char *ActualOpcode = isMemoryAccess(Src0) ? "vldr" : "vmov";
-    Str << "\t" << ActualOpcode << "\t";
+    // When vmov{c}'ing, we need to emit a width string. Otherwise, the
+    // assembler might be tempted to assume we want a vector vmov{c}, and ARM
+    // disallows that.
+    const char *WidthString =
+        (isMemoryAccess(Src0) || isMoveBetweenCoreAndVFPRegisters(Dest, Src0))
+            ? ""
+            : getVecWidthString(Src0->getType());
+    Str << "\t" << ActualOpcode << getPredicate() << WidthString << "\t";
    Dest->emit(Func);
    Str << ", ";
    Src0->emit(Func);
@@ -567,8 +590,7 @@ void InstARM32Vmov::emitSingleDestSingleSource(const Cfg *Func) const {
    Variable *Src0 = llvm::cast<Variable>(getSrc(0));
    assert(Src0->hasReg());
    Str << "\t"
-           "vstr"
+           "vstr" << getPredicate() << "\t";
-           "\t";
    Src0->emit(Func);
    Str << ", ";
    Dest->emit(Func);
@@ -578,7 +600,6 @@ void InstARM32Vmov::emitSingleDestSingleSource(const Cfg *Func) const {
 void InstARM32Vmov::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
-  assert(CondARM32::AL == getPredicate());
  assert(isMultiDest() + isMultiSource() <= 1 && "Invalid vmov type.");
  if (isMultiDest()) {
    emitMultiDestSingleSource(Func);
@@ -1045,6 +1066,59 @@ void InstARM32Vcvt::dump(const Cfg *Func) const {
  dumpSources(Func);
 }
+void InstARM32Vcmp::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Str << "\t"
+         "vcmp" << getPredicate() << getVecWidthString(getSrc(0)->getType())
+      << "\t";
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+}
+void InstARM32Vcmp::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+void InstARM32Vcmp::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "vcmp" << getPredicate() << getVecWidthString(getSrc(0)->getType());
+  dumpSources(Func);
+}
+void InstARM32Vmrs::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 0);
+  Str << "\t"
+         "vmrs" << getPredicate() << "\t"
+                                     "APSR_nzcv"
+                                     ", "
+                                     "FPSCR";
+}
+void InstARM32Vmrs::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 0);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+void InstARM32Vmrs::dump(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "APSR{n,z,v,c} = vmrs" << getPredicate() << "\t"
+                                                     "FPSCR{n,z,c,v}";
+}
 void OperandARM32Mem::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -321,10 +321,12 @@ public:
    Umull,
    Uxt,
    Vadd,
+    Vcmp,
    Vcvt,
    Vdiv,
    Vldr,
    Vmov,
+    Vmrs,
    Vmul,
    Vsqrt,
    Vsub
@@ -1204,6 +1206,46 @@ private:
  Variable *Dest1 = nullptr;
 };
+class InstARM32Vcmp final : public InstARM32Pred {
+  InstARM32Vcmp() = delete;
+  InstARM32Vcmp(const InstARM32Vcmp &) = delete;
+  InstARM32Vcmp &operator=(const InstARM32Vcmp &) = delete;
+public:
+  static InstARM32Vcmp *create(Cfg *Func, Variable *Src0, Variable *Src1,
+                               CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Vcmp>())
+        InstARM32Vcmp(Func, Src0, Src1, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Vcmp); }
+private:
+  InstARM32Vcmp(Cfg *Func, Variable *Src0, Variable *Src1,
+                CondARM32::Cond Predicate);
+};
+/// Copies the FP Status and Control Register (FPSCR) flags to the core flags
+/// (emitted as "vmrs APSR_nzcv, FPSCR").
+class InstARM32Vmrs final : public InstARM32Pred {
+  InstARM32Vmrs() = delete;
+  InstARM32Vmrs(const InstARM32Vmrs &) = delete;
+  InstARM32Vmrs &operator=(const InstARM32Vmrs &) = delete;
+public:
+  static InstARM32Vmrs *create(Cfg *Func, CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Vmrs>()) InstARM32Vmrs(Func, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Vmrs); }
+private:
+  InstARM32Vmrs(Cfg *Func, CondARM32::Cond Predicate);
+};
 // Declare partial template specializations of emit() methods that already have
 // default implementations. Without this, there is the possibility of ODR
 // violations and link errors.

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -87,40 +87,39 @@ CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
 // instructions/operands that use the same enum key value. The tables are kept
 // separate to maintain a proper separation between abstraction layers. There
 // is a risk that the tables could get out of sync if enum values are reordered
-// or if entries are added or deleted. The following dummy namespaces use
+// or if entries are added or deleted. The following anonymous namespaces use
 // static_asserts to ensure everything is kept in sync.
 // Validate the enum values in ICMPARM32_TABLE.
-namespace dummy1 {
+namespace {
 // Define a temporary set of enum values based on low-level table entries.
-enum _tmp_enum {
+enum _icmp_ll_enum {
-#define X(val, signed, swapped64, C_32, C1_64, C2_64) _tmp_##val,
+#define X(val, signed, swapped64, C_32, C1_64, C2_64) _icmp_ll_##val,
  ICMPARM32_TABLE
 #undef X
      _num
 };
 // Define a set of constants based on high-level table entries.
-#define X(tag, str) static const int _table1_##tag = InstIcmp::tag;
+#define X(tag, str) static constexpr int _icmp_hl_##tag = InstIcmp::tag;
 ICEINSTICMP_TABLE
 #undef X
 // Define a set of constants based on low-level table entries, and ensure the
 // table entry keys are consistent.
 #define X(val, signed, swapped64, C_32, C1_64, C2_64)                          \
-  static const int _table2_##val = _tmp_##val;                                 \
  static_assert(                                                               \
-      _table1_##val == _table2_##val,                                          \
+      _icmp_ll_##val == _icmp_hl_##val,                                        \
-      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE");
+      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val);
 ICMPARM32_TABLE
 #undef X
 // Repeat the static asserts with respect to the high-level table entries in
 // case the high-level table has extra entries.
 #define X(tag, str)                                                            \
  static_assert(                                                               \
-      _table1_##tag == _table2_##tag,                                          \
+      _icmp_hl_##tag == _icmp_ll_##tag,                                        \
-      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE");
+      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #tag);
 ICEINSTICMP_TABLE
 #undef X
-} // end of namespace dummy1
+} // end of anonymous namespace
 // Stack alignment
 const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
@@ -2229,9 +2228,76 @@ void TargetARM32::lowerExtractElement(const InstExtractElement *Inst) {
  UnimplementedError(Func->getContext()->getFlags());
 }
+namespace {
+// Validates FCMPARM32_TABLE's declaration w.r.t. InstFcmp::FCond ordering
+// (and naming).
+enum {
+#define X(val, CC0, CC1) _fcmp_ll_##val,
+  FCMPARM32_TABLE
+#undef X
+      _fcmp_ll_NUM
+};
+enum {
+#define X(tag, str) _fcmp_hl_##tag = InstFcmp::tag,
+  ICEINSTFCMP_TABLE
+#undef X
+      _fcmp_hl_NUM
+};
+static_assert(_fcmp_hl_NUM == _fcmp_ll_NUM,
+              "Inconsistency between high-level and low-level fcmp tags.");
+#define X(tag, str)                                                            \
+  static_assert(                                                               \
+      _fcmp_hl_##tag == _fcmp_ll_##tag,                                        \
+      "Inconsistency between high-level and low-level fcmp tag " #tag);
+ICEINSTFCMP_TABLE
+#undef X
+// Per-condition predicates used by lowerFcmp, indexed by the InstFcmp::FCond
+// value (the static_asserts above keep FCMPARM32_TABLE in the same order as
+// ICEINSTFCMP_TABLE). A kNone entry means no conditional mov is emitted for
+// that slot. The table is only ever read, so it is declared const.
+const struct {
+  CondARM32::Cond CC0;
+  CondARM32::Cond CC1;
+} TableFcmp[] = {
+#define X(val, CC0, CC1)                                                       \
+  { CondARM32::CC0, CondARM32::CC1 }                                           \
+  ,
+    FCMPARM32_TABLE
+#undef X
+};
+} // end of anonymous namespace
 void TargetARM32::lowerFcmp(const InstFcmp *Inst) {
-  (void)Inst;
+  Variable *Dest = Inst->getDest();
-  UnimplementedError(Func->getContext()->getFlags());
+  if (isVectorType(Dest->getType())) {
+    UnimplementedError(Func->getContext()->getFlags());
+    return;
+  }
+  Variable *Src0R = legalizeToReg(Inst->getSrc(0));
+  Variable *Src1R = legalizeToReg(Inst->getSrc(1));
+  Variable *T = makeReg(IceType_i32);
+  _vcmp(Src0R, Src1R);
+  _mov(T, Ctx->getConstantZero(IceType_i32));
+  _vmrs();
+  Operand *One = Ctx->getConstantInt32(1);
+  InstFcmp::FCond Condition = Inst->getCondition();
+  assert(Condition < llvm::array_lengthof(TableFcmp));
+  CondARM32::Cond CC0 = TableFcmp[Condition].CC0;
+  CondARM32::Cond CC1 = TableFcmp[Condition].CC1;
+  if (CC0 != CondARM32::kNone) {
+    _mov(T, One, CC0);
+    // If this mov is not a maybe mov, but an actual mov (i.e., CC0 == AL), we
+    // don't want to set_dest_nonkillable so that liveness + dead-code
+    // elimination will get rid of the previous assignment (i.e., T = 0) above.
+    if (CC0 != CondARM32::AL)
+      _set_dest_nonkillable();
+  }
+  if (CC1 != CondARM32::kNone) {
+    assert(CC0 != CondARM32::kNone);
+    assert(CC1 != CondARM32::AL);
+    _mov_nonkillable(T, One, CC1);
+  }
+  _mov(Dest, T);
 }
 void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
@@ -2695,16 +2761,12 @@ void TargetARM32::lowerSelect(const InstSelect *Inst) {
    UnimplementedError(Func->getContext()->getFlags());
    return;
  }
-  if (isFloatingType(DestTy)) {
-    UnimplementedError(Func->getContext()->getFlags());
-    return;
-  }
  // TODO(jvoung): handle folding opportunities.
  // cmp cond, #0; mov t, SrcF; mov_cond t, SrcT; mov dest, t
  Variable *CmpOpnd0 = legalizeToReg(Condition);
  Operand *CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
  _cmp(CmpOpnd0, CmpOpnd1);
-  CondARM32::Cond Cond = CondARM32::NE;
+  static constexpr CondARM32::Cond Cond = CondARM32::NE;
  if (DestTy == IceType_i64) {
    SrcT = legalizeUndef(SrcT);
    SrcF = legalizeUndef(SrcF);
@@ -2726,6 +2788,20 @@ void TargetARM32::lowerSelect(const InstSelect *Inst) {
    _mov(DestHi, THi);
    return;
  }
+  if (isFloatingType(DestTy)) {
+    Variable *T = makeReg(DestTy);
+    SrcF = legalizeToReg(SrcF);
+    assert(DestTy == SrcF->getType());
+    _vmov(T, SrcF);
+    SrcT = legalizeToReg(SrcT);
+    assert(DestTy == SrcT->getType());
+    _vmov(T, SrcT, Cond);
+    _set_dest_nonkillable();
+    _vmov(Dest, T);
+    return;
+  }
  Variable *T = nullptr;
  SrcF = legalize(SrcF, Legal_Reg | Legal_Flex);
  _mov(T, SrcF);

--- a/src/IceTargetLoweringARM32.def
+++ b/src/IceTargetLoweringARM32.def
@@ -15,19 +15,47 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
 #define SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
+// Patterns for lowering fcmp. These are expected to be used in the following
+// manner:
+//
+//   mov reg, #0
+//   movCC0 reg, #1 /* only if CC0 != kNone */
+//   movCC1 reg, #1 /* only if CC1 != kNone */
+//
+// TODO(jpp): vector lowerings.
+#define FCMPARM32_TABLE                                                        \
+  /*  val, CC0,   CC1 */                                                       \
+  X(False, kNone, kNone)                                                       \
+  X(Oeq,   EQ,    kNone)                                                       \
+  X(Ogt,   GT,    kNone)                                                       \
+  X(Oge,   GE,    kNone)                                                       \
+  X(Olt,   MI,    kNone)                                                       \
+  X(Ole,   LS,    kNone)                                                       \
+  X(One,   MI,    GT)                                                          \
+  X(Ord,   VC,    kNone)                                                       \
+  X(Ueq,   EQ,    VS)                                                          \
+  X(Ugt,   HI,    kNone)                                                       \
+  X(Uge,   PL,    kNone)                                                       \
+  X(Ult,   LT,    kNone)                                                       \
+  X(Ule,   LE,    kNone)                                                       \
+  X(Une,   NE,    kNone)                                                       \
+  X(Uno,   VS,    kNone)                                                       \
+  X(True,  AL,    kNone)                                                       \
+//#define X(val, CC0, CC1)
 // Patterns for lowering icmp.
-#define ICMPARM32_TABLE                                             \
+#define ICMPARM32_TABLE                                                        \
-  /* val, is_signed, swapped64, C_32, C1_64, C2_64 */               \
+  /* val, is_signed, swapped64, C_32, C1_64, C2_64 */                          \
-  X(Eq,   false,     false,     EQ,   EQ,    NE)                    \
+  X(Eq,   false,     false,     EQ,   EQ,    NE)                               \
-  X(Ne,   false,     false,     NE,   NE,    EQ)                    \
+  X(Ne,   false,     false,     NE,   NE,    EQ)                               \
-  X(Ugt,  false,     false,     HI,   HI,    LS)                    \
+  X(Ugt,  false,     false,     HI,   HI,    LS)                               \
-  X(Uge,  false,     false,     CS,   CS,    CC)                    \
+  X(Uge,  false,     false,     CS,   CS,    CC)                               \
-  X(Ult,  false,     false,     CC,   CC,    CS)                    \
+  X(Ult,  false,     false,     CC,   CC,    CS)                               \
-  X(Ule,  false,     false,     LS,   LS,    HI)                    \
+  X(Ule,  false,     false,     LS,   LS,    HI)                               \
-  X(Sgt,  true,      true,      GT,   LT,    GE)                    \
+  X(Sgt,  true,      true,      GT,   LT,    GE)                               \
-  X(Sge,  true,      false,     GE,   GE,    LT)                    \
+  X(Sge,  true,      false,     GE,   GE,    LT)                               \
-  X(Slt,  true,      false,     LT,   LT,    GE)                    \
+  X(Slt,  true,      false,     LT,   LT,    GE)                               \
-  X(Sle,  true,      true,      LE,   GE,    LT)                    \
+  X(Sle,  true,      true,      LE,   GE,    LT)                               \
 //#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)
 #endif // SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -401,6 +401,13 @@ protected:
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vldr::create(Func, Dest, Src, Pred));
  }
+  void _vcmp(Variable *Src0, Variable *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vcmp::create(Func, Src0, Src1, Pred));
+  }
+  void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vmrs::create(Func, Pred));
+  }
  // There are a whole bunch of vmov variants, to transfer within S/D/Q
  // registers, between core integer registers and S/D, and from small
  // immediates into S/D. For integer -> S/D/Q there is a variant which takes
@@ -411,8 +418,8 @@ protected:
  // register is modified. This cannot model that.
  //
  // This represents the simple single source, single dest variants only.
-  void _vmov(Variable *Dest, Operand *Src0) {
+  void _vmov(Variable *Dest, Operand *Src0,
-    constexpr CondARM32::Cond Pred = CondARM32::AL;
+             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Vmov::create(Func, Dest, Src0, Pred));
  }
  // This represents the single source, multi dest variant.

--- a/tests_lit/llvm2ice_tests/fp.cmp.ll
+++ b/tests_lit/llvm2ice_tests/fp.cmp.ll
@@ -6,6 +6,16 @@
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
 ; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+; RUN: %if --need=allow_dump --need=target_ARM32 --command %p2i --filetype=asm \
+; RUN:   --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=allow_dump --need=target_ARM32 --command FileCheck %s \
+; RUN:   --check-prefix=ARM32
+; RUN: %if --need=allow_dump --need=target_ARM32 --command %p2i --filetype=asm \
+; RUN:   --target arm32 -i %s --args -Om1 --skip-unimplemented \
+; RUN:   | %if --need=allow_dump --need=target_ARM32 --command FileCheck %s \
+; RUN:   --check-prefix=ARM32
 define internal void @fcmpEq(float %a, float %b, double %c, double %d) {
 entry:
  %cmp = fcmp oeq float %a, %b
@@ -35,6 +45,15 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK: jne
 ; CHECK-NEXT: jp
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpEq
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R1]], #1
 declare void @func()
@@ -67,6 +86,15 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK: jne
 ; CHECK-NEXT: jp
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpNe
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movne [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movne [[R1]], #1
 define internal void @fcmpGt(float %a, float %b, double %c, double %d) {
 entry:
@@ -95,6 +123,15 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK: ucomisd
 ; CHECK: seta
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpGt
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movgt [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movgt [[R1]], #1
 define internal void @fcmpGe(float %a, float %b, double %c, double %d) {
 entry:
@@ -123,6 +160,15 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; CHECK: ucomisd
 ; CHECK: setb
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpGe
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movlt [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movlt [[R1]], #1
 define internal void @fcmpLt(float %a, float %b, double %c, double %d) {
 entry:
@@ -151,6 +197,15 @@ if.end3:                                          ; preds = %if.then2, %if.end
 ; CHECK: ucomisd
 ; CHECK: seta
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpLt
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R1]], #1
 define internal void @fcmpLe(float %a, float %b, double %c, double %d) {
 entry:
@@ -179,6 +234,15 @@ if.end3:                                          ; preds = %if.end, %if.then2
 ; CHECK: ucomisd
 ; CHECK: setb
 ; CHECK: call {{.*}} R_{{.*}} func
+; ARM32-LABEL: fcmpLe
+; ARM32: vcmp.f32
+; ARM32: mov [[R0:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movhi [[R0]], #1
+; ARM32: vcmp.f64
+; ARM32: mov [[R1:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movhi [[R1]], #1
 define internal i32 @fcmpFalseFloat(float %a, float %b) {
 entry:
@@ -188,6 +252,9 @@ entry:
 }
 ; CHECK-LABEL: fcmpFalseFloat
 ; CHECK: mov {{.*}},0x0
+; ARM32-LABEL: fcmpFalseFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
 define internal i32 @fcmpFalseDouble(double %a, double %b) {
 entry:
@@ -197,6 +264,9 @@ entry:
 }
 ; CHECK-LABEL: fcmpFalseDouble
 ; CHECK: mov {{.*}},0x0
+; ARM32-LABEL: fcmpFalseDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
 define internal i32 @fcmpOeqFloat(float %a, float %b) {
 entry:
@@ -208,6 +278,11 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: jne
 ; CHECK: jp
+; ARM32-LABEL: fcmpOeqFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R]], #1
 define internal i32 @fcmpOeqDouble(double %a, double %b) {
 entry:
@@ -219,6 +294,11 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: jne
 ; CHECK: jp
+; ARM32-LABEL: fcmpOeqDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R]], #1
 define internal i32 @fcmpOgtFloat(float %a, float %b) {
 entry:
@@ -229,6 +309,11 @@ entry:
 ; CHECK-LABEL: fcmpOgtFloat
 ; CHECK: ucomiss
 ; CHECK: seta
+; ARM32-LABEL: fcmpOgtFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movgt [[R]], #1
 define internal i32 @fcmpOgtDouble(double %a, double %b) {
 entry:
@@ -239,6 +324,11 @@ entry:
 ; CHECK-LABEL: fcmpOgtDouble
 ; CHECK: ucomisd
 ; CHECK: seta
+; ARM32-LABEL: fcmpOgtDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movgt [[R]], #1
 define internal i32 @fcmpOgeFloat(float %a, float %b) {
 entry:
@@ -249,6 +339,11 @@ entry:
 ; CHECK-LABEL: fcmpOgeFloat
 ; CHECK: ucomiss
 ; CHECK: setae
+; ARM32-LABEL: fcmpOgeFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movge [[R]], #1
 define internal i32 @fcmpOgeDouble(double %a, double %b) {
 entry:
@@ -259,6 +354,11 @@ entry:
 ; CHECK-LABEL: fcmpOgeDouble
 ; CHECK: ucomisd
 ; CHECK: setae
+; ARM32-LABEL: fcmpOgeDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movge [[R]], #1
 define internal i32 @fcmpOltFloat(float %a, float %b) {
 entry:
@@ -269,6 +369,11 @@ entry:
 ; CHECK-LABEL: fcmpOltFloat
 ; CHECK: ucomiss
 ; CHECK: seta
+; ARM32-LABEL: fcmpOltFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R]], #1
 define internal i32 @fcmpOltDouble(double %a, double %b) {
 entry:
@@ -279,6 +384,11 @@ entry:
 ; CHECK-LABEL: fcmpOltDouble
 ; CHECK: ucomisd
 ; CHECK: seta
+; ARM32-LABEL: fcmpOltDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R]], #1
 define internal i32 @fcmpOleFloat(float %a, float %b) {
 entry:
@@ -289,6 +399,11 @@ entry:
 ; CHECK-LABEL: fcmpOleFloat
 ; CHECK: ucomiss
 ; CHECK: setae
+; ARM32-LABEL: fcmpOleFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movls [[R]], #1
 define internal i32 @fcmpOleDouble(double %a, double %b) {
 entry:
@@ -299,6 +414,11 @@ entry:
 ; CHECK-LABEL: fcmpOleDouble
 ; CHECK: ucomisd
 ; CHECK: setae
+; ARM32-LABEL: fcmpOleDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movls [[R]], #1
 define internal i32 @fcmpOneFloat(float %a, float %b) {
 entry:
@@ -309,6 +429,12 @@ entry:
 ; CHECK-LABEL: fcmpOneFloat
 ; CHECK: ucomiss
 ; CHECK: setne
+; ARM32-LABEL: fcmpOneFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R]], #1
+; ARM32: movgt [[R]], #1
 define internal i32 @fcmpOneDouble(double %a, double %b) {
 entry:
@@ -319,6 +445,12 @@ entry:
 ; CHECK-LABEL: fcmpOneDouble
 ; CHECK: ucomisd
 ; CHECK: setne
+; ARM32-LABEL: fcmpOneDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movmi [[R]], #1
+; ARM32: movgt [[R]], #1
 define internal i32 @fcmpOrdFloat(float %a, float %b) {
 entry:
@@ -329,6 +461,11 @@ entry:
 ; CHECK-LABEL: fcmpOrdFloat
 ; CHECK: ucomiss
 ; CHECK: setnp
+; ARM32-LABEL: fcmpOrdFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movvc [[R]], #1
 define internal i32 @fcmpOrdDouble(double %a, double %b) {
 entry:
@@ -339,6 +476,11 @@ entry:
 ; CHECK-LABEL: fcmpOrdDouble
 ; CHECK: ucomisd
 ; CHECK: setnp
+; ARM32-LABEL: fcmpOrdDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movvc [[R]], #1
 define internal i32 @fcmpUeqFloat(float %a, float %b) {
 entry:
@@ -349,6 +491,12 @@ entry:
 ; CHECK-LABEL: fcmpUeqFloat
 ; CHECK: ucomiss
 ; CHECK: sete
+; ARM32-LABEL: fcmpUeqFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R]], #1
+; ARM32: movvs [[R]], #1
 define internal i32 @fcmpUeqDouble(double %a, double %b) {
 entry:
@@ -359,6 +507,12 @@ entry:
 ; CHECK-LABEL: fcmpUeqDouble
 ; CHECK: ucomisd
 ; CHECK: sete
+; ARM32-LABEL: fcmpUeqDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: moveq [[R]], #1
+; ARM32: movvs [[R]], #1
 define internal i32 @fcmpUgtFloat(float %a, float %b) {
 entry:
@@ -369,6 +523,11 @@ entry:
 ; CHECK-LABEL: fcmpUgtFloat
 ; CHECK: ucomiss
 ; CHECK: setb
+; ARM32-LABEL: fcmpUgtFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movhi [[R]], #1
 define internal i32 @fcmpUgtDouble(double %a, double %b) {
 entry:
@@ -379,6 +538,11 @@ entry:
 ; CHECK-LABEL: fcmpUgtDouble
 ; CHECK: ucomisd
 ; CHECK: setb
+; ARM32-LABEL: fcmpUgtDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movhi [[R]], #1
 define internal i32 @fcmpUgeFloat(float %a, float %b) {
 entry:
@@ -389,6 +553,11 @@ entry:
 ; CHECK-LABEL: fcmpUgeFloat
 ; CHECK: ucomiss
 ; CHECK: setbe
+; ARM32-LABEL: fcmpUgeFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movpl [[R]], #1
 define internal i32 @fcmpUgeDouble(double %a, double %b) {
 entry:
@@ -399,6 +568,11 @@ entry:
 ; CHECK-LABEL: fcmpUgeDouble
 ; CHECK: ucomisd
 ; CHECK: setbe
+; ARM32-LABEL: fcmpUgeDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movpl [[R]], #1
 define internal i32 @fcmpUltFloat(float %a, float %b) {
 entry:
@@ -409,6 +583,11 @@ entry:
 ; CHECK-LABEL: fcmpUltFloat
 ; CHECK: ucomiss
 ; CHECK: setb
+; ARM32-LABEL: fcmpUltFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movlt [[R]], #1
 define internal i32 @fcmpUltDouble(double %a, double %b) {
 entry:
@@ -419,6 +598,11 @@ entry:
 ; CHECK-LABEL: fcmpUltDouble
 ; CHECK: ucomisd
 ; CHECK: setb
+; ARM32-LABEL: fcmpUltDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movlt [[R]], #1
 define internal i32 @fcmpUleFloat(float %a, float %b) {
 entry:
@@ -429,6 +613,11 @@ entry:
 ; CHECK-LABEL: fcmpUleFloat
 ; CHECK: ucomiss
 ; CHECK: setbe
+; ARM32-LABEL: fcmpUleFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movle [[R]], #1
 define internal i32 @fcmpUleDouble(double %a, double %b) {
 entry:
@@ -439,6 +628,11 @@ entry:
 ; CHECK-LABEL: fcmpUleDouble
 ; CHECK: ucomisd
 ; CHECK: setbe
+; ARM32-LABEL: fcmpUleDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movle [[R]], #1
 define internal i32 @fcmpUneFloat(float %a, float %b) {
 entry:
@@ -450,6 +644,11 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: jne
 ; CHECK: jp
+; ARM32-LABEL: fcmpUneFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movne [[R]], #1
 define internal i32 @fcmpUneDouble(double %a, double %b) {
 entry:
@@ -461,6 +660,11 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: jne
 ; CHECK: jp
+; ARM32-LABEL: fcmpUneDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movne [[R]], #1
 define internal i32 @fcmpUnoFloat(float %a, float %b) {
 entry:
@@ -471,6 +675,11 @@ entry:
 ; CHECK-LABEL: fcmpUnoFloat
 ; CHECK: ucomiss
 ; CHECK: setp
+; ARM32-LABEL: fcmpUnoFloat
+; ARM32: vcmp.f32
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movvs [[R]], #1
 define internal i32 @fcmpUnoDouble(double %a, double %b) {
 entry:
@@ -481,6 +690,11 @@ entry:
 ; CHECK-LABEL: fcmpUnoDouble
 ; CHECK: ucomisd
 ; CHECK: setp
+; ARM32-LABEL: fcmpUnoDouble
+; ARM32: vcmp.f64
+; ARM32: mov [[R:r[0-9]+]], #0
+; ARM32: vmrs
+; ARM32: movvs [[R]], #1
 define internal i32 @fcmpTrueFloat(float %a, float %b) {
 entry:
@@ -490,6 +704,9 @@ entry:
 }
 ; CHECK-LABEL: fcmpTrueFloat
 ; CHECK: mov {{.*}},0x1
+; ARM32-LABEL: fcmpTrueFloat
+; ARM32: vcmp.f32
+; No ARM32 check in this function binds R, so match any core register here.
+; ARM32: mov {{r[0-9]+}}, #1
 define internal i32 @fcmpTrueDouble(double %a, double %b) {
 entry:
@@ -499,6 +716,9 @@ entry:
 }
 ; CHECK-LABEL: fcmpTrueDouble
 ; CHECK: mov {{.*}},0x1
+; ARM32-LABEL: fcmpTrueDouble
+; ARM32: vcmp.f64
+; No ARM32 check in this function binds R, so match any core register here.
+; ARM32: mov {{r[0-9]+}}, #1
 define internal float @selectFloatVarVar(float %a, float %b) {
 entry:
@@ -510,6 +730,10 @@ entry:
 ; CHECK: ucomiss
 ; CHECK: seta
 ; CHECK: fld
+; ARM32-LABEL: selectFloatVarVar
+; ARM32: vcmp.f32
+; ARM32: vmovne.f32 s{{[0-9]+}}
+; ARM32: bx
 define internal double @selectDoubleVarVar(double %a, double %b) {
 entry:
@@ -521,3 +745,7 @@ entry:
 ; CHECK: ucomisd
 ; CHECK: seta
 ; CHECK: fld
+; ARM32-LABEL: selectDoubleVarVar
+; ARM32: vcmp.f64
+; ARM32: vmovne.f64 d{{[0-9]+}}
+; ARM32: bx