Add the ARM32 FP register table entries, simple arith, and args.

Lower some instructions, without much guarantee of correctness. *Running* generated code will be risky because the register allocator isn't aware of register aliasing. Fill in v{add,div,mul,sub}.f{32,64}, vmov, vldr and vsqrt.f{32,64}. I tried to make the nacl-other-intrinsics test not explode, so I added vsqrt too. That was pretty easy for sqrt, but then the fabs tests also exploded. Those are not truly fixed, but are currently "fixed" by adding a FakeDef to satisfy liveness. Propagate float/double arguments to the right register in lowerArguments and lowerCall, and propagate to s0/d0/q0 for lowerReturn. May need to double-check the calling convention. Currently can't test call-ret because vpush/vpop for prologues and epilogues isn't done. Legalize FP immediates to make the nacl-other-intrinsics sqrt test happy. Use the correct type of load (vldr, where the .32 and .64 suffixes are optional, instead of ldr{b,h,,d}). Whether or not the float/vector instructions can be predicated is a bit interesting. The float/double ones can, but the SIMD versions cannot. E.g., "vadd<cond>.f32 s0, s0, s1" is okay, but "vadd<cond>.f32 q0, q0, q1" is not okay. For now, just omit conditions from instructions that may end up being reused for SIMD. Split up the fp.pnacl.ll test into multiple ones so that parts of lowering can be tested incrementally. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1266263003 .

Add the ARM32 FP register table entries, simple arith, and args.
86ebec12 · Jan Voung · f4fbf7fd · 86ebec12 · 86ebec12 · 86ebec12
Commit 86ebec12 authored Aug 09, 2015 by Jan Voung
13 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -28,12 +28,13 @@ namespace Ice {
 namespace {
 const struct TypeARM32Attributes_ {
-  const char *WidthString; // b, h, <blank>, or d
+  const char *WidthString;    // b, h, <blank>, or d
+  const char *VecWidthString; // i8, i16, i32, f32, f64
  int8_t SExtAddrOffsetBits;
  int8_t ZExtAddrOffsetBits;
 } TypeARM32Attributes[] = {
-#define X(tag, elementty, width, sbits, ubits)                                 \
+#define X(tag, elementty, int_width, vec_width, sbits, ubits)                  \
-  { width, sbits, ubits }                                                      \
+  { int_width, vec_width, sbits, ubits }                                       \
  ,
    ICETYPEARM32_TABLE
 #undef X
@@ -66,6 +67,10 @@ const char *InstARM32::getWidthString(Type Ty) {
  return TypeARM32Attributes[Ty].WidthString;
 }
+const char *InstARM32::getVecWidthString(Type Ty) {
+  return TypeARM32Attributes[Ty].VecWidthString;
+}
 const char *InstARM32Pred::predString(CondARM32::Cond Pred) {
  return InstARM32CondAttributes[Pred].EmitString;
 }
@@ -94,6 +99,18 @@ void InstARM32Pred::emitUnaryopGPR(const char *Opcode,
  Inst->getSrc(0)->emit(Func);
 }
+void InstARM32Pred::emitUnaryopFP(const char *Opcode, const InstARM32Pred *Inst,
+                                  const Cfg *Func) {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 1);
+  Type SrcTy = Inst->getSrc(0)->getType();
+  Str << "\t" << Opcode << Inst->getPredicate() << getVecWidthString(SrcTy)
+      << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+}
 void InstARM32Pred::emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                                const Cfg *Func) {
  if (!BuildDefs::dump())
@@ -123,6 +140,21 @@ void InstARM32Pred::emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
  Inst->getSrc(1)->emit(Func);
 }
+void InstARM32::emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
+                                const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 2);
+  Str << "\t" << Opcode << getVecWidthString(Inst->getDest()->getType())
+      << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+}
 void InstARM32Pred::emitFourAddr(const char *Opcode, const InstARM32Pred *Inst,
                                 const Cfg *Func) {
  if (!BuildDefs::dump())
@@ -304,12 +336,6 @@ IceString InstARM32Label::getName(const Cfg *Func) const {
  return ".L" + Func->getFunctionName() + "$local$__" + std::to_string(Number);
 }
-InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-                           CondARM32::Cond Predicate)
-    : InstARM32Pred(Func, InstARM32::Ldr, 1, Dest, Predicate) {
-  addSource(Mem);
-}
 InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
    : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
  // Track modifications to Dests separately via FakeDefs.
@@ -363,8 +389,14 @@ template <> const char *InstARM32Rbit::Opcode = "rbit";
 template <> const char *InstARM32Rev::Opcode = "rev";
 template <> const char *InstARM32Sxt::Opcode = "sxt"; // still requires b/h
 template <> const char *InstARM32Uxt::Opcode = "uxt"; // still requires b/h
+// FP
+template <> const char *InstARM32Vsqrt::Opcode = "vsqrt";
 // Mov-like ops
+template <> const char *InstARM32Ldr::Opcode = "ldr";
 template <> const char *InstARM32Mov::Opcode = "mov";
+// FP
+template <> const char *InstARM32Vldr::Opcode = "vldr";
+template <> const char *InstARM32Vmov::Opcode = "vmov";
 // Three-addr ops
 template <> const char *InstARM32Adc::Opcode = "adc";
 template <> const char *InstARM32Add::Opcode = "add";
@@ -381,6 +413,11 @@ template <> const char *InstARM32Sbc::Opcode = "sbc";
 template <> const char *InstARM32Sdiv::Opcode = "sdiv";
 template <> const char *InstARM32Sub::Opcode = "sub";
 template <> const char *InstARM32Udiv::Opcode = "udiv";
+// FP
+template <> const char *InstARM32Vadd::Opcode = "vadd";
+template <> const char *InstARM32Vdiv::Opcode = "vdiv";
+template <> const char *InstARM32Vmul::Opcode = "vmul";
+template <> const char *InstARM32Vsub::Opcode = "vsub";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
@@ -403,19 +440,19 @@ template <> void InstARM32Mov::emit(const Cfg *Func) const {
  assert(getSrcSize() == 1);
  Variable *Dest = getDest();
  if (Dest->hasReg()) {
-    IceString Opcode = "mov";
+    IceString ActualOpcode = Opcode;
    Operand *Src0 = getSrc(0);
    if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
      if (!Src0V->hasReg()) {
        // Always use the whole stack slot. A 32-bit load has a larger range
        // of offsets than 16-bit, etc.
-        Opcode = IceString("ldr");
+        ActualOpcode = IceString("ldr");
      }
    } else {
      if (llvm::isa<OperandARM32Mem>(Src0))
-        Opcode = IceString("ldr") + getWidthString(Dest->getType());
+        ActualOpcode = IceString("ldr") + getWidthString(Dest->getType());
    }
-    Str << "\t" << Opcode << getPredicate() << "\t";
+    Str << "\t" << ActualOpcode << getPredicate() << "\t";
    getDest()->emit(Func);
    Str << ", ";
    getSrc(0)->emit(Func);
@@ -436,6 +473,64 @@ template <> void InstARM32Mov::emitIAS(const Cfg *Func) const {
  llvm_unreachable("Not yet implemented");
 }
+template <> void InstARM32Vldr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Str << "\t"<< Opcode << getPredicate() << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+}
+template <> void InstARM32Vldr::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+template <> void InstARM32Vmov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  assert(CondARM32::AL == getPredicate());
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Variable *Dest = getDest();
+  if (Dest->hasReg()) {
+    IceString ActualOpcode = Opcode;
+    Operand *Src0 = getSrc(0);
+    if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
+      if (!Src0V->hasReg()) {
+        ActualOpcode = IceString("vldr");
+      }
+    } else {
+      if (llvm::isa<OperandARM32Mem>(Src0))
+        ActualOpcode = IceString("vldr");
+    }
+    Str << "\t" << ActualOpcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+  } else {
+    Variable *Src0 = llvm::cast<Variable>(getSrc(0));
+    assert(Src0->hasReg());
+    Str << "\t"
+           "vstr"
+           "\t";
+    Src0->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+  }
+}
+template <> void InstARM32Vmov::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
 void InstARM32Br::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
@@ -547,37 +642,25 @@ void InstARM32Label::dump(const Cfg *Func) const {
  Str << getName(Func) << ":";
 }
-void InstARM32Ldr::emit(const Cfg *Func) const {
+template <> void InstARM32Ldr::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
  assert(getDest()->hasReg());
  Type Ty = getSrc(0)->getType();
-  Str << "\t"
+  Str << "\t"<< Opcode << getWidthString(Ty) << getPredicate() << "\t";
-      << "ldr" << getWidthString(Ty) << getPredicate() << "\t";
  getDest()->emit(Func);
  Str << ", ";
  getSrc(0)->emit(Func);
 }
-void InstARM32Ldr::emitIAS(const Cfg *Func) const {
+template <> void InstARM32Ldr::emitIAS(const Cfg *Func) const {
  assert(getSrcSize() == 1);
  (void)Func;
  llvm_unreachable("Not yet implemented");
 }
-void InstARM32Ldr::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = ";
-  dumpOpcodePred(Str, "ldr", getDest()->getType());
-  Str << " ";
-  dumpSources(Func);
-}
 template <> void InstARM32Movw::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;

--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -27,99 +27,249 @@
 // It is technically preserved, but save/restore is handled separately,
 // based on whether or not the function MaybeLeafFunc.
 #define REGARM32_GPR_TABLE                                                     \
-  /* val, encode, name, scratch, preserved, stackptr, frameptr, isInt, isFP */ \
+  /* val, encode, name, scratch, preserved, stackptr, frameptr,                \
-  X(Reg_r0,  = 0,            "r0",  1, 0, 0, 0, 1, 0)                   \
+     isInt, isFP32, isFP64, isVec128 */                                        \
-  X(Reg_r1,  = Reg_r0 + 1,   "r1",  1, 0, 0, 0, 1, 0)                   \
+  X(Reg_r0,  0, "r0",   1, 0, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r2,  = Reg_r0 + 2,   "r2",  1, 0, 0, 0, 1, 0)                   \
+  X(Reg_r1,  1, "r1",   1, 0, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r3,  = Reg_r0 + 3,   "r3",  1, 0, 0, 0, 1, 0)                   \
+  X(Reg_r2,  2, "r2",   1, 0, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r4,  = Reg_r0 + 4,   "r4",  0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r3,  3, "r3",   1, 0, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r5,  = Reg_r0 + 5,   "r5",  0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r4,  4, "r4",   0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r6,  = Reg_r0 + 6,   "r6",  0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r5,  5, "r5",   0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r7,  = Reg_r0 + 7,   "r7",  0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r6,  6, "r6",   0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r8,  = Reg_r0 + 8,   "r8",  0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r7,  7, "r7",   0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r9,  = Reg_r0 + 9,   "r9",  0, 1, 0, 0, 0, 0)                   \
+  X(Reg_r8,  8, "r8",   0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_r10, = Reg_r0 + 10,  "r10", 0, 1, 0, 0, 1, 0)                   \
+  X(Reg_r9,  9, "r9",   0, 1, 0, 0, 0, 0, 0, 0)                                \
-  X(Reg_fp,  = Reg_r0 + 11,  "fp",  0, 1, 0, 1, 1, 0)                   \
+  X(Reg_r10, 10, "r10", 0, 1, 0, 0, 1, 0, 0, 0)                                \
-  X(Reg_ip,  = Reg_r0 + 12,  "ip",  1, 0, 0, 0, 0, 0)                   \
+  X(Reg_fp,  11, "fp",  0, 1, 0, 1, 1, 0, 0, 0)                                \
-  X(Reg_sp,  = Reg_r0 + 13,  "sp",  0, 0, 1, 0, 0, 0)                   \
+  X(Reg_ip,  12, "ip",  1, 0, 0, 0, 0, 0, 0, 0)                                \
-  X(Reg_lr,  = Reg_r0 + 14,  "lr",  0, 0, 0, 0, 0, 0)                   \
+  X(Reg_sp,  13, "sp",  0, 0, 1, 0, 0, 0, 0, 0)                                \
-  X(Reg_pc,  = Reg_r0 + 15,  "pc",  0, 0, 0, 0, 0, 0)                   \
+  X(Reg_lr,  14, "lr",  0, 0, 0, 0, 0, 0, 0, 0)                                \
+  X(Reg_pc,  15, "pc",  0, 0, 0, 0, 0, 0, 0, 0)
 //#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
-//          isInt, isFP)
+//          isInt, isFP32, isFP64, isVec128)
-// TODO(jvoung): List FP registers and know S0 == D0 == Q0, etc.
+// TODO(jvoung): Be able to grab even registers, and the corresponding odd
-// Be able to grab even registers, and the corresponding odd register
+// register for each even register. Want "register units" to encapsulate
-// for each even register.
+// the aliasing/overlap.
+//
+// S registers 0-15 are scratch, but 16-31 are preserved.
+// Regenerate this with the following python script:
+//
+// def print_sregs():
+//   for i in xrange(0, 32):
+//     is_scratch = 1 if i < 16 else 0
+//     is_preserved = 1 if i >= 16 else 0
+//     print ('X(Reg_s{regnum:<2}, {regnum:<2}, "s{regnum}", ' +
+//            '{scratch}, {preserved}, 0, 0, 0, 1, 0, 0)    \\').format(
+//            regnum=i, scratch=is_scratch, preserved=is_preserved)
+//
+// print_sregs()
+//
+#define REGARM32_FP32_TABLE                                                    \
+  /* val, encode, name, scratch, preserved, stackptr, frameptr,                \
+     isInt, isFP32, isFP64, isVec128 */                                        \
+  X(Reg_s0,  0,  "s0",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s1,  1,  "s1",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s2,  2,  "s2",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s3,  3,  "s3",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s4,  4,  "s4",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s5,  5,  "s5",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s6,  6,  "s6",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s7,  7,  "s7",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s8,  8,  "s8",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s9,  9,  "s9",  1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s10, 10, "s10", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s11, 11, "s11", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s12, 12, "s12", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s13, 13, "s13", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s14, 14, "s14", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s15, 15, "s15", 1, 0, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s16, 16, "s16", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s17, 17, "s17", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s18, 18, "s18", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s19, 19, "s19", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s20, 20, "s20", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s21, 21, "s21", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s22, 22, "s22", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s23, 23, "s23", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s24, 24, "s24", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s25, 25, "s25", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s26, 26, "s26", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s27, 27, "s27", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s28, 28, "s28", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s29, 29, "s29", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s30, 30, "s30", 0, 1, 0, 0, 0, 1, 0, 0)                                \
+  X(Reg_s31, 31, "s31", 0, 1, 0, 0, 0, 1, 0, 0)
+//#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
+//          isInt, isFP32, isFP64, isVec128)
+// D registers 0-7 are scratch, 8-15 are preserved, and 16-31
+// are also scratch (if supported by the D32 feature vs D16).
+//
+// Regenerate this with the following python script:
+// def print_dregs():
+//   for i in xrange(0, 32):
+//     is_scratch = 1 if (i < 8 or i >= 16) else 0
+//     is_preserved = 1 if (8 <= i and i < 16) else 0
+//     print ('X(Reg_d{regnum:<2}, {regnum:<2}, "d{regnum}", ' +
+//            '{scratch}, {preserved}, 0, 0, 0, 0, 1, 0)    \\').format(
+//            regnum=i, scratch=is_scratch, preserved=is_preserved)
+//
+// print_dregs()
+//
+#define REGARM32_FP64_TABLE                                                    \
+  /* val, encode, name, scratch, preserved, stackptr, frameptr,                \
+     isInt, isFP32, isFP64, isVec128 */                                        \
+  X(Reg_d0,  0,  "d0",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d1,  1,  "d1",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d2,  2,  "d2",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d3,  3,  "d3",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d4,  4,  "d4",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d5,  5,  "d5",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d6,  6,  "d6",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d7,  7,  "d7",  1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d8,  8,  "d8",  0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d9,  9,  "d9",  0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d10, 10, "d10", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d11, 11, "d11", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d12, 12, "d12", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d13, 13, "d13", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d14, 14, "d14", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d15, 15, "d15", 0, 1, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d16, 16, "d16", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d17, 17, "d17", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d18, 18, "d18", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d19, 19, "d19", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d20, 20, "d20", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d21, 21, "d21", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d22, 22, "d22", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d23, 23, "d23", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d24, 24, "d24", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d25, 25, "d25", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d26, 26, "d26", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d27, 27, "d27", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d28, 28, "d28", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d29, 29, "d29", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d30, 30, "d30", 1, 0, 0, 0, 0, 0, 1, 0)                                \
+  X(Reg_d31, 31, "d31", 1, 0, 0, 0, 0, 0, 1, 0)
+//#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
+//          isInt, isFP32, isFP64, isVec128)
+// Q registers 0-3 are scratch, 4-7 are preserved, and 8-15
+// are also scratch (if supported by the D32 feature).
+//
+// Regenerate this with the following python script:
+// def print_qregs():
+//   for i in xrange(0, 16):
+//     is_scratch = 1 if (i < 4 or i >= 8) else 0
+//     is_preserved = 1 if (4 <= i and i < 8) else 0
+//     print ('X(Reg_q{regnum:<2}, {regnum:<2}, "q{regnum}", ' +
+//            '{scratch}, {preserved}, 0, 0, 0, 0, 0, 1)    \\').format(
+//            regnum=i, scratch=is_scratch, preserved=is_preserved)
+//
+// print_qregs()
+//
+#define REGARM32_VEC128_TABLE                                                  \
+  /* val, encode, name, scratch, preserved, stackptr, frameptr,                \
+     isInt, isFP32, isFP64, isVec128 */                                        \
+  X(Reg_q0,  0,  "q0",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q1,  1,  "q1",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q2,  2,  "q2",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q3,  3,  "q3",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q4,  4,  "q4",  0, 1, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q5,  5,  "q5",  0, 1, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q6,  6,  "q6",  0, 1, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q7,  7,  "q7",  0, 1, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q8,  8,  "q8",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q9,  9,  "q9",  1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q10, 10, "q10", 1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q11, 11, "q11", 1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q12, 12, "q12", 1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q13, 13, "q13", 1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q14, 14, "q14", 1, 0, 0, 0, 0, 0, 0, 1)                                \
+  X(Reg_q15, 15, "q15", 1, 0, 0, 0, 0, 0, 0, 1)
+//#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
+//          isInt, isFP32, isFP64, isVec128)
 // We also provide a combined table, so that there is a namespace where
 // all of the registers are considered and have distinct numberings.
 // This is in contrast to the above, where the "encode" is based on how
 // the register numbers will be encoded in binaries and values can overlap.
-#define REGARM32_TABLE                                                  \
+#define REGARM32_TABLE                                                         \
-  /* val, encode, name, scratch, preserved, stackptr, frameptr, isInt, isFP */ \
+  /* val, encode, name, scratch, preserved, stackptr, frameptr, isInt,         \
-  REGARM32_GPR_TABLE
+     isFP32, isFP64, isVec128 */                                               \
+  REGARM32_GPR_TABLE                                                           \
+  REGARM32_FP32_TABLE                                                          \
+  REGARM32_FP64_TABLE                                                          \
+  REGARM32_VEC128_TABLE
 //#define X(val, encode, name, scratch, preserved, stackptr, frameptr,
-//          isInt, isFP)
+//          isInt, isFP32, isFP64, isVec128)
-#define REGARM32_TABLE_BOUNDS                                           \
-  /* val, init */                                                       \
-  X(Reg_GPR_First, = Reg_r0)                                            \
-  X(Reg_GPR_Last,  = Reg_pc)
-//define X(val, init)
-// TODO(jvoung): add condition code tables, etc.
+#define REGARM32_TABLE_BOUNDS                                                  \
+  /* val, init */                                                              \
+  X(Reg_GPR_First, = Reg_r0)                                                   \
+  X(Reg_GPR_Last, = Reg_pc)                                                    \
+  X(Reg_SREG_First, = Reg_s0)                                                  \
+  X(Reg_SREG_Last, = Reg_s31)                                                  \
+  X(Reg_DREG_First, = Reg_d0)                                                  \
+  X(Reg_DREG_Last, = Reg_d31)                                                  \
+  X(Reg_QREG_First, = Reg_q0)                                                  \
+  X(Reg_QREG_Last, = Reg_q15)
+// define X(val, init)
-// Load/Store instruction width suffixes.
+// Load/Store instruction width suffixes and FP/Vector element size suffixes
-#define ICETYPEARM32_TABLE                                              \
+// the # of offset bits allowed as part of an addressing mode (for sign or
-  /* tag,          element type, width, addr off bits sext, zext */     \
+// zero extending load/stores).
-  X(IceType_void,  IceType_void, "",  0, 0)                             \
+#define ICETYPEARM32_TABLE                                                     \
-  X(IceType_i1,    IceType_void, "b", 8, 12)                            \
+  /* tag,          element type, int_width, vec_width, addr bits sext, zext */ \
-  X(IceType_i8,    IceType_void, "b", 8, 12)                            \
+  X(IceType_void, IceType_void, "", "", 0, 0)                                  \
-  X(IceType_i16,   IceType_void, "h", 8, 8)                             \
+  X(IceType_i1, IceType_void, "b", "", 8, 12)                                  \
-  X(IceType_i32,   IceType_void, "", 12, 12)                            \
+  X(IceType_i8, IceType_void, "b", "", 8, 12)                                  \
-  X(IceType_i64,   IceType_void, "d", 8, 8)                             \
+  X(IceType_i16, IceType_void, "h", "", 8, 8)                                  \
-  X(IceType_f32,   IceType_void, "", 10, 10)                            \
+  X(IceType_i32, IceType_void, "", "", 12, 12)                                 \
-  X(IceType_f64,   IceType_void, "", 10, 10)                            \
+  X(IceType_i64, IceType_void, "d", "", 8, 8)                                  \
-  X(IceType_v4i1,  IceType_i32 , "",  0,  0)                            \
+  X(IceType_f32, IceType_void, "", ".f32", 10, 10)                             \
-  X(IceType_v8i1,  IceType_i16 , "",  0,  0)                            \
+  X(IceType_f64, IceType_void, "", ".f64", 10, 10)                             \
-  X(IceType_v16i1, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v4i1, IceType_i32, "", ".i32", 0, 0)                               \
-  X(IceType_v16i8, IceType_i8  , "",  0,  0)                            \
+  X(IceType_v8i1, IceType_i16, "", ".i16", 0, 0)                               \
-  X(IceType_v8i16, IceType_i16 , "",  0,  0)                            \
+  X(IceType_v16i1, IceType_i8, "", ".i8", 0, 0)                                \
-  X(IceType_v4i32, IceType_i32 , "",  0,  0)                            \
+  X(IceType_v16i8, IceType_i8, "", ".i8", 0, 0)                                \
-  X(IceType_v4f32, IceType_f32 , "",  0,  0)                            \
+  X(IceType_v8i16, IceType_i16, "", ".i16", 0, 0)                              \
-//#define X(tag, elementty, width, sbits, ubits)
+  X(IceType_v4i32, IceType_i32, "", ".i32", 0, 0)                              \
+  X(IceType_v4f32, IceType_f32, "", ".f32", 0, 0)
+//#define X(tag, elementty, int_width, vec_width, sbits, ubits)
 // Shifter types for Data-processing operands as defined in section A5.1.2.
-#define ICEINSTARM32SHIFT_TABLE                                         \
+#define ICEINSTARM32SHIFT_TABLE                                                \
-  /* enum value, emit */                                                \
+  /* enum value, emit */                                                       \
-  X(LSL, "lsl")                                                         \
+  X(LSL, "lsl")                                                                \
-  X(LSR, "lsr")                                                         \
+  X(LSR, "lsr")                                                                \
-  X(ASR, "asr")                                                         \
+  X(ASR, "asr")                                                                \
-  X(ROR, "ror")                                                         \
+  X(ROR, "ror")                                                                \
-  X(RRX, "rrx")                                                         \
+  X(RRX, "rrx")
 //#define X(tag, emit)
 // Attributes for the condition code 4-bit encoding (that is independent
 // of the APSR's NZCV fields). For example, EQ is 0, but corresponds to
 // Z = 1, and NE is 1, but corresponds to Z = 0.
-#define ICEINSTARM32COND_TABLE                                          \
+#define ICEINSTARM32COND_TABLE                                                 \
-  /* enum value, encoding, opposite, emit */                            \
+  /* enum value, encoding, opposite, emit */                                   \
-  X(EQ, 0, NE, "eq") /* equal */                                        \
+  X(EQ, 0, NE, "eq")   /* equal */                                             \
-  X(NE, 1, EQ, "ne") /* not equal */                                    \
+  X(NE, 1, EQ, "ne")   /* not equal */                                         \
-  X(CS, 2, CC, "cs") /* carry set/unsigned (AKA hs: higher or same) */  \
+  X(CS, 2, CC, "cs")   /* carry set/unsigned (AKA hs: higher or same) */       \
-  X(CC, 3, CS, "cc") /* carry clear/unsigned (AKA lo: lower) */         \
+  X(CC, 3, CS, "cc")   /* carry clear/unsigned (AKA lo: lower) */              \
-  X(MI, 4, PL, "mi") /* minus/negative */                               \
+  X(MI, 4, PL, "mi")   /* minus/negative */                                    \
-  X(PL, 5, MI, "pl") /* plus/positive or zero */                        \
+  X(PL, 5, MI, "pl")   /* plus/positive or zero */                             \
-  X(VS, 6, VC, "vs") /* overflow (float unordered) */                   \
+  X(VS, 6, VC, "vs")   /* overflow (float unordered) */                        \
-  X(VC, 7, VS, "vc") /* no overflow (float not unordered) */            \
+  X(VC, 7, VS, "vc")   /* no overflow (float not unordered) */                 \
-  X(HI, 8, LS, "hi") /* unsigned higher */                              \
+  X(HI, 8, LS, "hi")   /* unsigned higher */                                   \
-  X(LS, 9, HI, "ls") /* unsigned lower or same */                       \
+  X(LS, 9, HI, "ls")   /* unsigned lower or same */                            \
-  X(GE, 10, LT, "ge") /* signed greater than or equal */                \
+  X(GE, 10, LT, "ge")  /* signed greater than or equal */                      \
-  X(LT, 11, GE, "lt") /* signed less than */                            \
+  X(LT, 11, GE, "lt")  /* signed less than */                                  \
-  X(GT, 12, LE, "gt") /* signed greater than */                         \
+  X(GT, 12, LE, "gt")  /* signed greater than */                               \
-  X(LE, 13, GT, "le") /* signed less than or equal */                   \
+  X(LE, 13, GT, "le")  /* signed less than or equal */                         \
-  X(AL, 14, kNone, "") /* always (unconditional) */                     \
+  X(AL, 14, kNone, "") /* always (unconditional) */                            \
-  X(kNone, 15, kNone, "??") /* special condition / none */              \
+  X(kNone, 15, kNone, "??") /* special condition / none */
 //#define(tag, encode, opp, emit)
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -320,12 +320,24 @@ public:
    Tst,
    Udiv,
    Umull,
-    Uxt
+    Uxt,
+    Vadd,
+    Vdiv,
+    Vldr,
+    Vmov,
+    Vmul,
+    Vsqrt,
+    Vsub
  };
  static const char *getWidthString(Type Ty);
+  static const char *getVecWidthString(Type Ty);
  static CondARM32::Cond getOppositeCondition(CondARM32::Cond Cond);
+  /// Shared emit routines for common forms of instructions.
+  static void emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
+                              const Cfg *Func);
  void dump(const Cfg *Func) const override;
 protected:
@@ -357,6 +369,8 @@ public:
  /// Shared emit routines for common forms of instructions.
  static void emitUnaryopGPR(const char *Opcode, const InstARM32Pred *Inst,
                             const Cfg *Func, bool NeedsWidthSuffix);
+  static void emitUnaryopFP(const char *Opcode, const InstARM32Pred *Inst,
+                            const Cfg *Func);
  static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                          const Cfg *Func);
  static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
@@ -420,6 +434,50 @@ private:
  static const char *Opcode;
 };
+/// Predicated vector/FP instructions of the form x := op(y), keyed by kind K.
+template <InstARM32::InstKindARM32 K>
+class InstARM32UnaryopFP : public InstARM32Pred {
+  InstARM32UnaryopFP() = delete; // instances are built only through create()
+  InstARM32UnaryopFP(const InstARM32UnaryopFP &) = delete;
+  InstARM32UnaryopFP &operator=(const InstARM32UnaryopFP &) = delete;
+public:
+  static InstARM32UnaryopFP *create(Cfg *Func, Variable *Dest, Variable *Src,
+                                    CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32UnaryopFP>()) // storage comes from Func
+        InstARM32UnaryopFP(Func, Dest, Src, Predicate);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return; // emit nothing when BuildDefs::dump() is false
+    emitUnaryopFP(Opcode, this, Func); // shared FP unary-op emit routine
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func; // unused while this is a stub
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return; // dump nothing when BuildDefs::dump() is false
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType()); // opcode, given Dest's type
+    Str << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+private:
+  InstARM32UnaryopFP(Cfg *Func, Variable *Dest, Operand *Src, // NOTE(review): Operand* here, Variable* in create()
+                     CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 1, Dest, Predicate) { // 1: presumably the source count — TODO confirm
+    addSource(Src);
+  }
+  static const char *Opcode; // mnemonic string; only declared here
+};
 /// Instructions of the form x := x op y.
 template <InstARM32::InstKindARM32 K>
 class InstARM32TwoAddrGPR : public InstARM32Pred {
@@ -559,7 +617,56 @@ private:
  bool SetFlags;
 };
-// Instructions of the form x := a op1 (y op2 z). E.g., multiply accumulate.
+/// Instructions of the form x := y op z, for vector/FP.  We leave these as
+/// unconditional: "ARM deprecates the conditional execution of any instruction
+/// encoding provided by the Advanced SIMD Extension that is not also provided
+/// by the Floating-point (VFP) extension".  They do not set flags.
+template <InstARM32::InstKindARM32 K>
+class InstARM32ThreeAddrFP : public InstARM32 {
+  InstARM32ThreeAddrFP() = delete; // instances are built only through create()
+  InstARM32ThreeAddrFP(const InstARM32ThreeAddrFP &) = delete;
+  InstARM32ThreeAddrFP &operator=(const InstARM32ThreeAddrFP &) = delete;
+public:
+  /// Create a vector/FP binary-op instruction such as vadd or vsub.
+  /// Dest and both sources must be registers.
+  static InstARM32ThreeAddrFP *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                      Variable *Src1) {
+    return new (Func->allocate<InstARM32ThreeAddrFP>()) // storage comes from Func
+        InstARM32ThreeAddrFP(Func, Dest, Src0, Src1);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return; // emit nothing when BuildDefs::dump() is false
+    emitThreeAddrFP(Opcode, this, Func); // shared FP three-address emit routine
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func; // unused while this is a stub
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return; // dump nothing when BuildDefs::dump() is false
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    Str << Opcode << "." << getDest()->getType() << " "; // "<opcode>.<dest type> "
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+private:
+  InstARM32ThreeAddrFP(Cfg *Func, Variable *Dest, Variable *Src0,
+                       Variable *Src1)
+      : InstARM32(Func, K, 2, Dest) { // 2: presumably the source count — TODO confirm
+    addSource(Src0);
+    addSource(Src1);
+  }
+  static const char *Opcode; // mnemonic string; only declared here
+};
+/// Instructions of the form x := a op1 (y op2 z). E.g., multiply accumulate.
 template <InstARM32::InstKindARM32 K>
 class InstARM32FourAddrGPR : public InstARM32Pred {
  InstARM32FourAddrGPR() = delete;
@@ -608,7 +715,7 @@ private:
  static const char *Opcode;
 };
-// Instructions of the form x cmpop y (setting flags).
+/// Instructions of the form x cmpop y (setting flags).
 template <InstARM32::InstKindARM32 K>
 class InstARM32CmpLike : public InstARM32Pred {
  InstARM32CmpLike() = delete;
@@ -666,10 +773,19 @@ typedef InstARM32ThreeAddrGPR<InstARM32::Sbc> InstARM32Sbc;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sdiv> InstARM32Sdiv;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sub> InstARM32Sub;
 typedef InstARM32ThreeAddrGPR<InstARM32::Udiv> InstARM32Udiv;
+typedef InstARM32ThreeAddrFP<InstARM32::Vadd> InstARM32Vadd;
+typedef InstARM32ThreeAddrFP<InstARM32::Vdiv> InstARM32Vdiv;
+typedef InstARM32ThreeAddrFP<InstARM32::Vmul> InstARM32Vmul;
+typedef InstARM32ThreeAddrFP<InstARM32::Vsub> InstARM32Vsub;
+typedef InstARM32Movlike<InstARM32::Ldr> InstARM32Ldr;
 /// Move instruction (variable <- flex). This is more of a pseudo-inst.
 /// If var is a register, then we use "mov". If var is stack, then we use
 /// "str" to store to the stack.
 typedef InstARM32Movlike<InstARM32::Mov> InstARM32Mov;
+/// Represents various vector mov instruction forms (simple single source,
+/// single dest forms only, not the 2 GPR <-> 1 D reg forms, etc.).
+typedef InstARM32Movlike<InstARM32::Vmov> InstARM32Vmov;
+typedef InstARM32Movlike<InstARM32::Vldr> InstARM32Vldr;
 /// MovT leaves the bottom bits alone so dest is also a source.
 /// This helps indicate that a previous MovW setting dest is not dead code.
 typedef InstARM32TwoAddrGPR<InstARM32::Movt> InstARM32Movt;
@@ -683,6 +799,7 @@ typedef InstARM32UnaryopGPR<InstARM32::Rev, false> InstARM32Rev;
 // but we aren't using that for now, so just model as a Unaryop.
 typedef InstARM32UnaryopGPR<InstARM32::Sxt, true> InstARM32Sxt;
 typedef InstARM32UnaryopGPR<InstARM32::Uxt, true> InstARM32Uxt;
+typedef InstARM32UnaryopFP<InstARM32::Vsqrt> InstARM32Vsqrt;
 typedef InstARM32FourAddrGPR<InstARM32::Mla> InstARM32Mla;
 typedef InstARM32FourAddrGPR<InstARM32::Mls> InstARM32Mls;
 typedef InstARM32CmpLike<InstARM32::Cmp> InstARM32Cmp;
@@ -838,29 +955,6 @@ private:
  InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
 };
-/// Load instruction.
-class InstARM32Ldr : public InstARM32Pred {
-  InstARM32Ldr() = delete;
-  InstARM32Ldr(const InstARM32Ldr &) = delete;
-  InstARM32Ldr &operator=(const InstARM32Ldr &) = delete;
-public:
-  /// Dest must be a register.
-  static InstARM32Ldr *create(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-                              CondARM32::Cond Predicate) {
-    return new (Func->allocate<InstARM32Ldr>())
-        InstARM32Ldr(Func, Dest, Mem, Predicate);
-  }
-  void emit(const Cfg *Func) const override;
-  void emitIAS(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Ldr); }
-private:
-  InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-               CondARM32::Cond Predicate);
-};
 /// Pop into a list of GPRs. Technically this can be predicated, but we don't
 /// need that functionality.
 class InstARM32Pop : public InstARM32 {
@@ -1003,8 +1097,12 @@ private:
 // already have default implementations.  Without this, there is the
 // possibility of ODR violations and link errors.
+template <> void InstARM32Ldr::emit(const Cfg *Func) const;
+template <> void InstARM32Mov::emit(const Cfg *Func) const;
 template <> void InstARM32Movw::emit(const Cfg *Func) const;
 template <> void InstARM32Movt::emit(const Cfg *Func) const;
+template <> void InstARM32Vldr::emit(const Cfg *Func) const;
+template <> void InstARM32Vmov::emit(const Cfg *Func) const;
 } // end of namespace Ice

--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -21,42 +21,90 @@
 namespace Ice {
-namespace RegARM32 {
+class RegARM32 {
+public:
-/// An enum of every register. The enum value may not match the encoding
+  /// An enum of every register. The enum value may not match the encoding
-/// used to binary encode register operands in instructions.
+  /// used to binary encode register operands in instructions.
-enum AllRegisters {
+  enum AllRegisters {
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
  val,
-  REGARM32_TABLE
+    REGARM32_TABLE
 #undef X
-      Reg_NUM,
+        Reg_NUM,
 #define X(val, init) val init,
-  REGARM32_TABLE_BOUNDS
+    REGARM32_TABLE_BOUNDS
 #undef X
-};
+  };
-/// An enum of GPR Registers. The enum value does match the encoding used
+  /// An enum of GPR Registers. The enum value does match the encoding used
-/// to binary encode register operands in instructions.
+  /// to binary encode register operands in instructions.
-enum GPRRegister {
+  enum GPRRegister {
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
-  Encoded_##val encode,
+  Encoded_##val = encode,
-  REGARM32_GPR_TABLE
+    REGARM32_GPR_TABLE
 #undef X
-      Encoded_Not_GPR = -1
+        Encoded_Not_GPR = -1
-};
+  };
+  /// An enum of FP32 S-Registers. The enum value does match the encoding used
+  /// to binary encode register operands in instructions.
+  enum SRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_FP32_TABLE
+#undef X
+        Encoded_Not_SReg = -1
+  };
+  /// An enum of FP64 D-Registers. The enum value does match the encoding used
+  /// to binary encode register operands in instructions.
+  enum DRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_FP64_TABLE
+#undef X
+        Encoded_Not_DReg = -1
+  };
-// TODO(jvoung): Floating point and vector registers...
+  /// An enum of 128-bit Q-Registers. The enum value does match the encoding
-// Need to model overlap and difference in encoding too.
+  /// used to binary encode register operands in instructions.
+  enum QRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_VEC128_TABLE
+#undef X
+        Encoded_Not_QReg = -1
+  };
-static inline GPRRegister getEncodedGPR(int32_t RegNum) {
+  static inline GPRRegister getEncodedGPR(int32_t RegNum) {
-  assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
+    assert(Reg_GPR_First <= RegNum);
-  return GPRRegister(RegNum - Reg_GPR_First);
+    assert(RegNum <= Reg_GPR_Last);
-}
+    return GPRRegister(RegNum - Reg_GPR_First);
+  }
-} // end of namespace RegARM32
+  static inline SRegister getEncodedSReg(int32_t RegNum) {
+    assert(Reg_SREG_First <= RegNum); // RegNum must lie in the S-register range
+    assert(RegNum <= Reg_SREG_Last);
+    return SRegister(RegNum - Reg_SREG_First); // offset from the first S register
+  }
+  static inline DRegister getEncodedDReg(int32_t RegNum) {
+    assert(Reg_DREG_First <= RegNum); // RegNum must lie in the D-register range
+    assert(RegNum <= Reg_DREG_Last);
+    return DRegister(RegNum - Reg_DREG_First); // offset from the first D register
+  }
+  static inline QRegister getEncodedQReg(int32_t RegNum) {
+    assert(Reg_QREG_First <= RegNum); // RegNum must lie in the Q-register range
+    assert(RegNum <= Reg_QREG_Last);
+    return QRegister(RegNum - Reg_QREG_First); // offset from the first Q register
+  }
+};
 } // end of namespace Ice

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -174,16 +174,19 @@ TargetARM32::TargetARM32(Cfg *Func)
  // TODO: Don't initialize IntegerRegisters and friends every time.
  // Instead, initialize in some sort of static initializer for the
  // class.
+  // TODO: Can these sizes be limited, or must all bitsets have the same width?
  llvm::SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
-  llvm::SmallBitVector FloatRegisters(RegARM32::Reg_NUM);
+  llvm::SmallBitVector Float32Registers(RegARM32::Reg_NUM);
+  llvm::SmallBitVector Float64Registers(RegARM32::Reg_NUM);
  llvm::SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
  llvm::SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
  ScratchRegs.resize(RegARM32::Reg_NUM);
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
  IntegerRegisters[RegARM32::val] = isInt;                                     \
-  FloatRegisters[RegARM32::val] = isFP;                                        \
+  Float32Registers[RegARM32::val] = isFP32;                                    \
-  VectorRegisters[RegARM32::val] = isFP;                                       \
+  Float64Registers[RegARM32::val] = isFP64;                                    \
+  VectorRegisters[RegARM32::val] = isVec128;                                   \
  ScratchRegs[RegARM32::val] = scratch;
  REGARM32_TABLE;
 #undef X
@@ -193,8 +196,8 @@ TargetARM32::TargetARM32(Cfg *Func)
  TypeToRegisterSet[IceType_i16] = IntegerRegisters;
  TypeToRegisterSet[IceType_i32] = IntegerRegisters;
  TypeToRegisterSet[IceType_i64] = IntegerRegisters;
-  TypeToRegisterSet[IceType_f32] = FloatRegisters;
+  TypeToRegisterSet[IceType_f32] = Float32Registers;
-  TypeToRegisterSet[IceType_f64] = FloatRegisters;
+  TypeToRegisterSet[IceType_f64] = Float64Registers;
  TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
  TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
  TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
@@ -363,7 +366,7 @@ IceString TargetARM32::getRegName(SizeT RegNum, Type Ty) const {
  (void)Ty;
  static const char *RegNames[] = {
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
  name,
      REGARM32_TABLE
 #undef X
@@ -435,9 +438,7 @@ bool TargetARM32::CallingConv::I64InRegs(std::pair<int32_t, int32_t> *Regs) {
  int32_t RegLo, RegHi;
  // Always start i64 registers at an even register, so this may end
  // up padding away a register.
-  if (NumGPRRegsUsed % 2 != 0) {
+  NumGPRRegsUsed = Utils::applyAlignment(NumGPRRegsUsed, 2);
-    ++NumGPRRegsUsed;
-  }
  RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
  ++NumGPRRegsUsed;
  RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
@@ -459,6 +460,33 @@ bool TargetARM32::CallingConv::I32InReg(int32_t *Reg) {
  return true;
 }
+bool TargetARM32::CallingConv::FPInReg(Type Ty, int32_t *Reg) { // false => Ty is not passed in an FP/vector register
+  if (NumFPRegUnits >= ARM32_MAX_FP_REG_UNITS)
+    return false; // every register unit is already consumed
+  if (isVectorType(Ty)) {
+    NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 4); // a vector occupies 4 units
+    *Reg = RegARM32::Reg_q0 + (NumFPRegUnits / 4); // written even if false is returned below
+    NumFPRegUnits += 4;
+    // If this bumps us past the boundary, don't allocate to a register
+    // and leave any previously speculatively consumed registers as consumed.
+    if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
+      return false;
+  } else if (Ty == IceType_f64) {
+    NumFPRegUnits = Utils::applyAlignment(NumFPRegUnits, 2); // an f64 occupies 2 units
+    *Reg = RegARM32::Reg_d0 + (NumFPRegUnits / 2); // written even if false is returned below
+    NumFPRegUnits += 2;
+    // If this bumps us past the boundary, don't allocate to a register
+    // and leave any previously speculatively consumed registers as consumed.
+    if (NumFPRegUnits > ARM32_MAX_FP_REG_UNITS)
+      return false;
+  } else {
+    assert(Ty == IceType_f32);
+    *Reg = RegARM32::Reg_s0 + NumFPRegUnits; // one unit per f32; units skipped by alignment are not back-filled
+    ++NumFPRegUnits;
+  }
+  return true;
+}
 void TargetARM32::lowerArguments() {
  VarList &Args = Func->getArgs();
  TargetARM32::CallingConv CC;
@@ -472,14 +500,7 @@ void TargetARM32::lowerArguments() {
  for (SizeT I = 0, E = Args.size(); I < E; ++I) {
    Variable *Arg = Args[I];
    Type Ty = Arg->getType();
-    // TODO(jvoung): handle float/vector types.
+    if (Ty == IceType_i64) {
-    if (isVectorType(Ty)) {
-      UnimplementedError(Func->getContext()->getFlags());
-      continue;
-    } else if (isFloatingType(Ty)) {
-      UnimplementedError(Func->getContext()->getFlags());
-      continue;
-    } else if (Ty == IceType_i64) {
      std::pair<int32_t, int32_t> RegPair;
      if (!CC.I64InRegs(&RegPair))
        continue;
@@ -503,10 +524,15 @@ void TargetARM32::lowerArguments() {
      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
      continue;
    } else {
-      assert(Ty == IceType_i32);
      int32_t RegNum;
-      if (!CC.I32InReg(&RegNum))
+      if (isVectorType(Ty) || isFloatingType(Ty)) {
-        continue;
+        if (!CC.FPInReg(Ty, &RegNum))
+          continue;
+      } else {
+        assert(Ty == IceType_i32);
+        if (!CC.I32InReg(&RegNum))
+          continue;
+      }
      Variable *RegisterArg = Func->makeVariable(Ty);
      if (BuildDefs::dump()) {
        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
@@ -517,6 +543,7 @@ void TargetARM32::lowerArguments() {
      Args[I] = RegisterArg;
      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+      continue;
    }
  }
 }
@@ -554,7 +581,10 @@ void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
        Func, Ty, FramePtr, llvm::cast<ConstantInteger32>(
                                Ctx->getConstantInt32(Arg->getStackOffset())));
    if (isVectorType(Arg->getType())) {
+      // TODO: Perhaps load vector arguments with vld1.<elem size>?
      UnimplementedError(Func->getContext()->getFlags());
+    } else if (isFloatingType(Arg->getType())) {
+      _vldr(Arg, Mem);
    } else {
      _ldr(Arg, Mem);
    }
@@ -725,12 +755,9 @@ void TargetARM32::addProlog(CfgNode *Node) {
    Type Ty = Arg->getType();
    bool InRegs = false;
    // Skip arguments passed in registers.
-    if (isVectorType(Ty)) {
+    if (isVectorType(Ty) || isFloatingType(Ty)) {
-      UnimplementedError(Func->getContext()->getFlags());
+      int32_t DummyReg;
-      continue;
+      InRegs = CC.FPInReg(Ty, &DummyReg);
-    } else if (isFloatingType(Ty)) {
-      UnimplementedError(Func->getContext()->getFlags());
-      continue;
    } else if (Ty == IceType_i64) {
      std::pair<int32_t, int32_t> DummyRegs;
      InRegs = CC.I64InRegs(&DummyRegs);
@@ -858,6 +885,8 @@ void TargetARM32::addEpilog(CfgNode *Node) {
 bool TargetARM32::isLegalVariableStackOffset(int32_t Offset) const {
  constexpr bool SignExt = false;
+  // TODO(jvoung): vldr of FP stack slots has a different limit from the
+  // plain stackSlotType().
  return OperandARM32Mem::canHoldOffset(stackSlotType(), SignExt, Offset);
 }
@@ -1121,7 +1150,7 @@ llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
  llvm::SmallBitVector Registers(RegARM32::Reg_NUM);
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
  if (scratch && (Include & RegSet_CallerSave))                                \
    Registers[RegARM32::val] = true;                                           \
  if (preserved && (Include & RegSet_CalleeSave))                              \
@@ -1518,6 +1547,8 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
    return;
  } else if (isVectorType(Dest->getType())) {
    UnimplementedError(Func->getContext()->getFlags());
+    // Add a fake def to keep liveness consistent in the meantime.
+    Context.insert(InstFakeDef::create(Func, Dest));
    return;
  }
  // Dest->getType() is a non-i64 scalar.
@@ -1553,6 +1584,47 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
                 H_srem_i32, IsRemainder);
    return;
  }
+  case InstArithmetic::Frem: {
+    const SizeT MaxSrcs = 2;
+    Type Ty = Dest->getType();
+    InstCall *Call = makeHelperCall(
+        isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
+    Call->addArg(Src0R);
+    Call->addArg(Src1);
+    lowerCall(Call);
+    return;
+  }
+  }
+  // Handle floating point arithmetic separately: these operations require
+  // Src1 to be legalized to a register.
+  switch (Inst->getOp()) {
+  default:
+    break;
+  case InstArithmetic::Fadd: {
+    Variable *Src1R = legalizeToReg(Src1);
+    _vadd(T, Src0R, Src1R);
+    _vmov(Dest, T);
+    return;
+  }
+  case InstArithmetic::Fsub: {
+    Variable *Src1R = legalizeToReg(Src1);
+    _vsub(T, Src0R, Src1R);
+    _vmov(Dest, T);
+    return;
+  }
+  case InstArithmetic::Fmul: {
+    Variable *Src1R = legalizeToReg(Src1);
+    _vmul(T, Src0R, Src1R);
+    _vmov(Dest, T);
+    return;
+  }
+  case InstArithmetic::Fdiv: {
+    Variable *Src1R = legalizeToReg(Src1);
+    _vdiv(T, Src0R, Src1R);
+    _vmov(Dest, T);
+    return;
+  }
  }
  Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
@@ -1605,19 +1677,11 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
    llvm_unreachable("Integer div/rem should have been handled earlier.");
    return;
  case InstArithmetic::Fadd:
-    UnimplementedError(Func->getContext()->getFlags());
-    return;
  case InstArithmetic::Fsub:
-    UnimplementedError(Func->getContext()->getFlags());
-    return;
  case InstArithmetic::Fmul:
-    UnimplementedError(Func->getContext()->getFlags());
-    return;
  case InstArithmetic::Fdiv:
-    UnimplementedError(Func->getContext()->getFlags());
-    return;
  case InstArithmetic::Frem:
-    UnimplementedError(Func->getContext()->getFlags());
+    llvm_unreachable("Floating point arith should have been handled earlier.");
    return;
  }
 }
@@ -1652,6 +1716,9 @@ void TargetARM32::lowerAssign(const InstAssign *Inst) {
    }
    if (isVectorType(Dest->getType())) {
      UnimplementedError(Func->getContext()->getFlags());
+    } else if (isFloatingType(Dest->getType())) {
+      Variable *SrcR = legalizeToReg(NewSrc);
+      _vmov(Dest, SrcR);
    } else {
      _mov(Dest, NewSrc);
    }
@@ -1681,6 +1748,8 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
  // Pair of Arg Operand -> GPR number assignments.
  llvm::SmallVector<std::pair<Operand *, int32_t>,
                    TargetARM32::CallingConv::ARM32_MAX_GPR_ARG> GPRArgs;
+  llvm::SmallVector<std::pair<Operand *, int32_t>,
+                    TargetARM32::CallingConv::ARM32_MAX_FP_REG_UNITS> FPArgs;
  // Pair of Arg Operand -> stack offset.
  llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;
  int32_t ParameterAreaSizeBytes = 0;
@@ -1691,11 +1760,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
    Operand *Arg = legalizeUndef(Instr->getArg(i));
    Type Ty = Arg->getType();
    bool InRegs = false;
-    if (isVectorType(Ty)) {
+    if (Ty == IceType_i64) {
-      UnimplementedError(Func->getContext()->getFlags());
-    } else if (isFloatingType(Ty)) {
-      UnimplementedError(Func->getContext()->getFlags());
-    } else if (Ty == IceType_i64) {
      std::pair<int32_t, int32_t> Regs;
      if (CC.I64InRegs(&Regs)) {
        InRegs = true;
@@ -1704,6 +1769,12 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
        GPRArgs.push_back(std::make_pair(Lo, Regs.first));
        GPRArgs.push_back(std::make_pair(Hi, Regs.second));
      }
+    } else if (isVectorType(Ty) || isFloatingType(Ty)) {
+      int32_t Reg;
+      if (CC.FPInReg(Ty, &Reg)) {
+        InRegs = true;
+        FPArgs.push_back(std::make_pair(Arg, Reg));
+      }
    } else {
      assert(Ty == IceType_i32);
      int32_t Reg;
@@ -1766,6 +1837,10 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
    // registers after the call.
    Context.insert(InstFakeUse::create(Func, Reg));
  }
+  for (auto &FPArg : FPArgs) {
+    Variable *Reg = legalizeToReg(FPArg.first, FPArg.second);
+    Context.insert(InstFakeUse::create(Func, Reg));
+  }
  // Generate the call instruction.  Assign its result to a temporary
  // with high register allocation weight.
@@ -1791,9 +1866,10 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
      ReturnRegHi = makeReg(IceType_i32, RegARM32::Reg_r1);
      break;
    case IceType_f32:
+      ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_s0);
+      break;
    case IceType_f64:
-      // Use S and D regs.
+      ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_d0);
-      UnimplementedError(Func->getContext()->getFlags());
      break;
    case IceType_v4i1:
    case IceType_v8i1:
@@ -1802,8 +1878,7 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
    case IceType_v8i16:
    case IceType_v4i32:
    case IceType_v4f32:
-      // Use Q regs.
+      ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_q0);
-      UnimplementedError(Func->getContext()->getFlags());
      break;
    }
  }
@@ -1853,12 +1928,11 @@ void TargetARM32::lowerCall(const InstCall *Instr) {
      _mov(DestLo, ReturnReg);
      _mov(DestHi, ReturnRegHi);
    } else {
-      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
-             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
-             isVectorType(Dest->getType()));
      if (isFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
-        UnimplementedError(Func->getContext()->getFlags());
+        _vmov(Dest, ReturnReg);
      } else {
+        assert(isIntegerType(Dest->getType()) &&
+               typeWidthInBytes(Dest->getType()) <= 4);
        _mov(Dest, ReturnReg);
      }
    }
@@ -2291,6 +2365,8 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
    return;
  }
  case Intrinsics::Fabs: {
+    // Add a fake def to keep liveness consistent in the meantime.
+    Context.insert(InstFakeDef::create(Func, Instr->getDest()));
    UnimplementedError(Func->getContext()->getFlags());
    return;
  }
@@ -2352,7 +2428,11 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
    return;
  }
  case Intrinsics::Sqrt: {
-    UnimplementedError(Func->getContext()->getFlags());
+    Variable *Src = legalizeToReg(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(Dest->getType());
+    _vsqrt(T, Src);
+    _vmov(Dest, T);
    return;
  }
  case Intrinsics::Stacksave: {
@@ -2440,16 +2520,22 @@ void TargetARM32::lowerRet(const InstRet *Inst) {
  Variable *Reg = nullptr;
  if (Inst->hasRetValue()) {
    Operand *Src0 = Inst->getRetValue();
-    if (Src0->getType() == IceType_i64) {
+    Type Ty = Src0->getType();
+    if (Ty == IceType_i64) {
      Src0 = legalizeUndef(Src0);
      Variable *R0 = legalizeToReg(loOperand(Src0), RegARM32::Reg_r0);
      Variable *R1 = legalizeToReg(hiOperand(Src0), RegARM32::Reg_r1);
      Reg = R0;
      Context.insert(InstFakeUse::create(Func, R1));
-    } else if (isScalarFloatingType(Src0->getType())) {
+    } else if (Ty == IceType_f32) {
-      UnimplementedError(Func->getContext()->getFlags());
+      Variable *S0 = legalizeToReg(Src0, RegARM32::Reg_s0);
+      Reg = S0;
+    } else if (Ty == IceType_f64) {
+      Variable *D0 = legalizeToReg(Src0, RegARM32::Reg_d0);
+      Reg = D0;
    } else if (isVectorType(Src0->getType())) {
-      UnimplementedError(Func->getContext()->getFlags());
+      Variable *Q0 = legalizeToReg(Src0, RegARM32::Reg_q0);
+      Reg = Q0;
    } else {
      Operand *Src0F = legalize(Src0, Legal_Reg | Legal_Flex);
      _mov(Reg, Src0F, CondARM32::AL, RegARM32::Reg_r0);
@@ -2596,8 +2682,8 @@ Variable *TargetARM32::makeVectorOfZeros(Type Ty, int32_t RegNum) {
 Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
  Type Ty = Src->getType();
  Variable *Reg = makeReg(Ty, RegNum);
-  if (isVectorType(Ty)) {
+  if (isVectorType(Ty) || isFloatingType(Ty)) {
-    UnimplementedError(Func->getContext()->getFlags());
+    _vmov(Reg, Src);
  } else {
    // Mov's Src operand can really only be the flexible second operand type
    // or a register. Users should guarantee that.
@@ -2646,7 +2732,13 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
    }
    if (!(Allowed & Legal_Mem)) {
      Variable *Reg = makeReg(Ty, RegNum);
-      _ldr(Reg, Mem);
+      if (isVectorType(Ty)) {
+        UnimplementedError(Func->getContext()->getFlags());
+      } else if (isFloatingType(Ty)) {
+        _vldr(Reg, Mem);
+      } else {
+        _ldr(Reg, Mem);
+      }
      From = Reg;
    } else {
      From = Mem;
@@ -2716,11 +2808,25 @@ Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
      _movt(Reg, C);
      return Reg;
    } else {
+      assert(isScalarFloatingType(Ty));
      // Load floats/doubles from literal pool.
-      UnimplementedError(Func->getContext()->getFlags());
+      // TODO(jvoung): Allow certain immediates to be encoded directly in
-      From = copyToReg(From, RegNum);
+      // an operand. See Table A7-18 of the ARM manual:
+      // "Floating-point modified immediate constants".
+      // Or, for 32-bit floating point numbers, just encode the raw bits
+      // into a movw/movt pair to GPR, and vmov to an SREG, instead of using
+      // a movw/movt pair to get the const-pool address then loading to SREG.
+      std::string Buffer;
+      llvm::raw_string_ostream StrBuf(Buffer);
+      llvm::cast<Constant>(From)->emitPoolLabel(StrBuf);
+      llvm::cast<Constant>(From)->setShouldBePooled(true);
+      Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
+      Variable *BaseReg = makeReg(getPointerType());
+      _movw(BaseReg, Offset);
+      _movt(BaseReg, Offset);
+      From = formMemoryOperand(BaseReg, Ty);
+      return copyToReg(From, RegNum);
    }
-    return From;
  }
  if (auto Var = llvm::dyn_cast<Variable>(From)) {

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -190,7 +190,7 @@ protected:
  }
  void _adds(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Add::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -300,7 +300,7 @@ protected:
  }
  void _orrs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Orr::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -334,7 +334,7 @@ protected:
  }
  void _sbcs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -352,7 +352,7 @@ protected:
  }
  void _subs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Sub::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -381,6 +381,41 @@ protected:
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred));
  }
+  void _vadd(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vadd::create(Func, Dest, Src0, Src1));
+  }
+  void _vdiv(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vdiv::create(Func, Dest, Src0, Src1));
+  }
+  void _vldr(Variable *Dest, OperandARM32Mem *Src,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vldr::create(Func, Dest, Src, Pred));
+  }
+  // There are a whole bunch of vmov variants, to transfer within
+  // S/D/Q registers, between core integer registers and S/D,
+  // and from small immediates into S/D.
+  // For integer -> S/D/Q there is a variant which takes two integer
+  // registers to fill a D, or to fill two consecutive S registers.
+  // Vmov can also be used to insert-element. E.g.,
+  //    "vmov.8 d0[1], r0"
+  // but insert-element is a "two-address" operation where only part of the
+  // register is modified. This cannot model that.
+  //
+  // This represents the simple single source, single dest variants only.
+  void _vmov(Variable *Dest, Operand *Src0) {
+    constexpr CondARM32::Cond Pred = CondARM32::AL;
+    Context.insert(InstARM32Vmov::create(Func, Dest, Src0, Pred));
+  }
+  void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1));
+  }
+  void _vsqrt(Variable *Dest, Variable *Src,
+              CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vsqrt::create(Func, Dest, Src, Pred));
+  }
+  void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vsub::create(Func, Dest, Src0, Src1));
+  }
  /// Run a pass through stack variables and ensure that the offsets are legal.
  /// If the offset is not legal, use a new base register that accounts for
@@ -417,16 +452,20 @@ protected:
    CallingConv &operator=(const CallingConv &) = delete;
  public:
-    CallingConv() : NumGPRRegsUsed(0) {}
+    CallingConv() {}
    ~CallingConv() = default;
    bool I64InRegs(std::pair<int32_t, int32_t> *Regs);
    bool I32InReg(int32_t *Reg);
+    bool FPInReg(Type Ty, int32_t *Reg);
    static constexpr uint32_t ARM32_MAX_GPR_ARG = 4;
+    // Maximum number of S-register units available to S/D/Q arguments.
+    static constexpr uint32_t ARM32_MAX_FP_REG_UNITS = 16;
  private:
-    uint32_t NumGPRRegsUsed;
+    uint32_t NumGPRRegsUsed = 0;
+    uint32_t NumFPRegUnits = 0;
  };
 private:

--- a/tests_lit/llvm2ice_tests/fp.arith.ll
+++ b/tests_lit/llvm2ice_tests/fp.arith.ll
+; This tries to be a comprehensive test of f32 and f64 arith operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -Om1 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+define internal float @addFloat(float %a, float %b) {
+entry:
+  %add = fadd float %a, %b
+  ret float %add
+}
+; CHECK-LABEL: addFloat
+; CHECK: addss
+; CHECK: fld
+; ARM32-LABEL: addFloat
+; ARM32: vadd.f32 s{{[0-9]+}}, s
+define internal double @addDouble(double %a, double %b) {
+entry:
+  %add = fadd double %a, %b
+  ret double %add
+}
+; CHECK-LABEL: addDouble
+; CHECK: addsd
+; CHECK: fld
+; ARM32-LABEL: addDouble
+; ARM32: vadd.f64 d{{[0-9]+}}, d
+define internal float @subFloat(float %a, float %b) {
+entry:
+  %sub = fsub float %a, %b
+  ret float %sub
+}
+; CHECK-LABEL: subFloat
+; CHECK: subss
+; CHECK: fld
+; ARM32-LABEL: subFloat
+; ARM32: vsub.f32 s{{[0-9]+}}, s
+define internal double @subDouble(double %a, double %b) {
+entry:
+  %sub = fsub double %a, %b
+  ret double %sub
+}
+; CHECK-LABEL: subDouble
+; CHECK: subsd
+; CHECK: fld
+; ARM32-LABEL: subDouble
+; ARM32: vsub.f64 d{{[0-9]+}}, d
+define internal float @mulFloat(float %a, float %b) {
+entry:
+  %mul = fmul float %a, %b
+  ret float %mul
+}
+; CHECK-LABEL: mulFloat
+; CHECK: mulss
+; CHECK: fld
+; ARM32-LABEL: mulFloat
+; ARM32: vmul.f32 s{{[0-9]+}}, s
+define internal double @mulDouble(double %a, double %b) {
+entry:
+  %mul = fmul double %a, %b
+  ret double %mul
+}
+; CHECK-LABEL: mulDouble
+; CHECK: mulsd
+; CHECK: fld
+; ARM32-LABEL: mulDouble
+; ARM32: vmul.f64 d{{[0-9]+}}, d
+define internal float @divFloat(float %a, float %b) {
+entry:
+  %div = fdiv float %a, %b
+  ret float %div
+}
+; CHECK-LABEL: divFloat
+; CHECK: divss
+; CHECK: fld
+; ARM32-LABEL: divFloat
+; ARM32: vdiv.f32 s{{[0-9]+}}, s
+define internal double @divDouble(double %a, double %b) {
+entry:
+  %div = fdiv double %a, %b
+  ret double %div
+}
+; CHECK-LABEL: divDouble
+; CHECK: divsd
+; CHECK: fld
+; ARM32-LABEL: divDouble
+; ARM32: vdiv.f64 d{{[0-9]+}}, d
+define internal float @remFloat(float %a, float %b) {
+entry:
+  %div = frem float %a, %b
+  ret float %div
+}
+; CHECK-LABEL: remFloat
+; CHECK: call {{.*}} R_{{.*}} fmodf
+; ARM32-LABEL: remFloat
+; ARM32: bl {{.*}} fmodf
+define internal double @remDouble(double %a, double %b) {
+entry:
+  %div = frem double %a, %b
+  ret double %div
+}
+; CHECK-LABEL: remDouble
+; CHECK: call {{.*}} R_{{.*}} fmod
+; ARM32-LABEL: remDouble
+; ARM32: bl {{.*}} fmod
--- a/tests_lit/llvm2ice_tests/fp.call_ret.ll
+++ b/tests_lit/llvm2ice_tests/fp.call_ret.ll
+; This tries to be a comprehensive test of f32 and f64 call/return ops.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; Can't test on ARM yet. Need to use several vpush {contiguous FP regs},
+; instead of push {any GPR list}.
+define internal i32 @doubleArgs(double %a, i32 %b, double %c) {
+entry:
+  ret i32 %b
+}
+; CHECK-LABEL: doubleArgs
+; CHECK:      mov eax,DWORD PTR [esp+0xc]
+; CHECK-NEXT: ret
+; ARM32-LABEL: doubleArgs
+define internal i32 @floatArgs(float %a, i32 %b, float %c) {
+entry:
+  ret i32 %b
+}
+; CHECK-LABEL: floatArgs
+; CHECK:      mov eax,DWORD PTR [esp+0x8]
+; CHECK-NEXT: ret
+define internal i32 @passFpArgs(float %a, double %b, float %c, double %d, float %e, double %f) {
+entry:
+  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double %b)
+  %call1 = call i32 @ignoreFpArgsNoInline(float %c, i32 123, double %d)
+  %call2 = call i32 @ignoreFpArgsNoInline(float %e, i32 123, double %f)
+  %add = add i32 %call1, %call
+  %add3 = add i32 %add, %call2
+  ret i32 %add3
+}
+; CHECK-LABEL: passFpArgs
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+declare i32 @ignoreFpArgsNoInline(float %x, i32 %y, double %z)
+define internal i32 @passFpConstArg(float %a, double %b) {
+entry:
+  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double 2.340000e+00)
+  ret i32 %call
+}
+; CHECK-LABEL: passFpConstArg
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+define internal i32 @passFp32ConstArg(float %a) {
+entry:
+  %call = call i32 @ignoreFp32ArgsNoInline(float %a, i32 123, float 2.0)
+  ret i32 %call
+}
+; CHECK-LABEL: passFp32ConstArg
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: movss DWORD PTR [esp+0x8]
+; CHECK: call {{.*}} R_{{.*}} ignoreFp32ArgsNoInline
+declare i32 @ignoreFp32ArgsNoInline(float %x, i32 %y, float %z)
+define internal float @returnFloatArg(float %a) {
+entry:
+  ret float %a
+}
+; CHECK-LABEL: returnFloatArg
+; CHECK: fld DWORD PTR [esp
+define internal double @returnDoubleArg(double %a) {
+entry:
+  ret double %a
+}
+; CHECK-LABEL: returnDoubleArg
+; CHECK: fld QWORD PTR [esp
+define internal float @returnFloatConst() {
+entry:
+  ret float 0x3FF3AE1480000000
+}
+; CHECK-LABEL: returnFloatConst
+; CHECK: fld
+define internal double @returnDoubleConst() {
+entry:
+  ret double 1.230000e+00
+}
+; CHECK-LABEL: returnDoubleConst
+; CHECK: fld
--- a/tests_lit/llvm2ice_tests/fp.cmp.ll
+++ b/tests_lit/llvm2ice_tests/fp.cmp.ll
+; This tries to be a comprehensive test of f32 and f64 compare operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+define internal void @fcmpEq(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  br i1 %cmp, label %if.then, label %if.end
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %if.then, %entry
+  %cmp1 = fcmp oeq double %c, %d
+  br i1 %cmp1, label %if.then2, label %if.end3
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+; CHECK-LABEL: fcmpEq
+; CHECK: ucomiss
+; CHECK: jne
+; CHECK-NEXT: jp
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: jne
+; CHECK-NEXT: jp
+; CHECK: call {{.*}} R_{{.*}} func
+declare void @func()
+define internal void @fcmpNe(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp une float %a, %b
+  br i1 %cmp, label %if.then, label %if.end
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %if.then, %entry
+  %cmp1 = fcmp une double %c, %d
+  br i1 %cmp1, label %if.then2, label %if.end3
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+; CHECK-LABEL: fcmpNe
+; CHECK: ucomiss
+; CHECK: jne
+; CHECK-NEXT: jp
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: jne
+; CHECK-NEXT: jp
+; CHECK: call {{.*}} R_{{.*}} func
+define internal void @fcmpGt(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp ogt float %a, %b
+  br i1 %cmp, label %if.then, label %if.end
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %if.then, %entry
+  %cmp1 = fcmp ogt double %c, %d
+  br i1 %cmp1, label %if.then2, label %if.end3
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+; CHECK-LABEL: fcmpGt
+; CHECK: ucomiss
+; CHECK: seta
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: seta
+; CHECK: call {{.*}} R_{{.*}} func
+define internal void @fcmpGe(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp ult float %a, %b
+  br i1 %cmp, label %if.end, label %if.then
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %entry, %if.then
+  %cmp1 = fcmp ult double %c, %d
+  br i1 %cmp1, label %if.end3, label %if.then2
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.end, %if.then2
+  ret void
+}
+; CHECK-LABEL: fcmpGe
+; CHECK: ucomiss
+; CHECK: setb
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: setb
+; CHECK: call {{.*}} R_{{.*}} func
+define internal void @fcmpLt(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp olt float %a, %b
+  br i1 %cmp, label %if.then, label %if.end
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %if.then, %entry
+  %cmp1 = fcmp olt double %c, %d
+  br i1 %cmp1, label %if.then2, label %if.end3
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.then2, %if.end
+  ret void
+}
+; CHECK-LABEL: fcmpLt
+; CHECK: ucomiss
+; CHECK: seta
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: seta
+; CHECK: call {{.*}} R_{{.*}} func
+define internal void @fcmpLe(float %a, float %b, double %c, double %d) {
+entry:
+  %cmp = fcmp ugt float %a, %b
+  br i1 %cmp, label %if.end, label %if.then
+if.then:                                          ; preds = %entry
+  call void @func()
+  br label %if.end
+if.end:                                           ; preds = %entry, %if.then
+  %cmp1 = fcmp ugt double %c, %d
+  br i1 %cmp1, label %if.end3, label %if.then2
+if.then2:                                         ; preds = %if.end
+  call void @func()
+  br label %if.end3
+if.end3:                                          ; preds = %if.end, %if.then2
+  ret void
+}
+; CHECK-LABEL: fcmpLe
+; CHECK: ucomiss
+; CHECK: setb
+; CHECK: call {{.*}} R_{{.*}} func
+; CHECK: ucomisd
+; CHECK: setb
+; CHECK: call {{.*}} R_{{.*}} func
+define internal i32 @fcmpFalseFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp false float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpFalseFloat
+; CHECK: mov {{.*}},0x0
+define internal i32 @fcmpFalseDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp false double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpFalseDouble
+; CHECK: mov {{.*}},0x0
+define internal i32 @fcmpOeqFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp oeq float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOeqFloat
+; CHECK: ucomiss
+; CHECK: jne
+; CHECK: jp
+define internal i32 @fcmpOeqDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp oeq double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOeqDouble
+; CHECK: ucomisd
+; CHECK: jne
+; CHECK: jp
+define internal i32 @fcmpOgtFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ogt float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOgtFloat
+; CHECK: ucomiss
+; CHECK: seta
+define internal i32 @fcmpOgtDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ogt double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOgtDouble
+; CHECK: ucomisd
+; CHECK: seta
+define internal i32 @fcmpOgeFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp oge float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOgeFloat
+; CHECK: ucomiss
+; CHECK: setae
+define internal i32 @fcmpOgeDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp oge double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOgeDouble
+; CHECK: ucomisd
+; CHECK: setae
+define internal i32 @fcmpOltFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp olt float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOltFloat
+; CHECK: ucomiss
+; CHECK: seta
+define internal i32 @fcmpOltDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp olt double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOltDouble
+; CHECK: ucomisd
+; CHECK: seta
+define internal i32 @fcmpOleFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ole float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOleFloat
+; CHECK: ucomiss
+; CHECK: setae
+define internal i32 @fcmpOleDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ole double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOleDouble
+; CHECK: ucomisd
+; CHECK: setae
+define internal i32 @fcmpOneFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp one float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOneFloat
+; CHECK: ucomiss
+; CHECK: setne
+define internal i32 @fcmpOneDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp one double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOneDouble
+; CHECK: ucomisd
+; CHECK: setne
+define internal i32 @fcmpOrdFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ord float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOrdFloat
+; CHECK: ucomiss
+; CHECK: setnp
+define internal i32 @fcmpOrdDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ord double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpOrdDouble
+; CHECK: ucomisd
+; CHECK: setnp
+define internal i32 @fcmpUeqFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ueq float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUeqFloat
+; CHECK: ucomiss
+; CHECK: sete
+define internal i32 @fcmpUeqDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ueq double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUeqDouble
+; CHECK: ucomisd
+; CHECK: sete
+define internal i32 @fcmpUgtFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ugt float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUgtFloat
+; CHECK: ucomiss
+; CHECK: setb
+define internal i32 @fcmpUgtDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ugt double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUgtDouble
+; CHECK: ucomisd
+; CHECK: setb
+define internal i32 @fcmpUgeFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp uge float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUgeFloat
+; CHECK: ucomiss
+; CHECK: setbe
+define internal i32 @fcmpUgeDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp uge double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUgeDouble
+; CHECK: ucomisd
+; CHECK: setbe
+define internal i32 @fcmpUltFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ult float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUltFloat
+; CHECK: ucomiss
+; CHECK: setb
+define internal i32 @fcmpUltDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ult double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUltDouble
+; CHECK: ucomisd
+; CHECK: setb
+define internal i32 @fcmpUleFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp ule float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUleFloat
+; CHECK: ucomiss
+; CHECK: setbe
+define internal i32 @fcmpUleDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp ule double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUleDouble
+; CHECK: ucomisd
+; CHECK: setbe
+define internal i32 @fcmpUneFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp une float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUneFloat
+; CHECK: ucomiss
+; CHECK: jne
+; CHECK: jp
+define internal i32 @fcmpUneDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp une double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUneDouble
+; CHECK: ucomisd
+; CHECK: jne
+; CHECK: jp
+define internal i32 @fcmpUnoFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp uno float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUnoFloat
+; CHECK: ucomiss
+; CHECK: setp
+define internal i32 @fcmpUnoDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp uno double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpUnoDouble
+; CHECK: ucomisd
+; CHECK: setp
+define internal i32 @fcmpTrueFloat(float %a, float %b) {
+entry:
+  %cmp = fcmp true float %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpTrueFloat
+; CHECK: mov {{.*}},0x1
+define internal i32 @fcmpTrueDouble(double %a, double %b) {
+entry:
+  %cmp = fcmp true double %a, %b
+  %cmp.ret_ext = zext i1 %cmp to i32
+  ret i32 %cmp.ret_ext
+}
+; CHECK-LABEL: fcmpTrueDouble
+; CHECK: mov {{.*}},0x1
+define internal float @selectFloatVarVar(float %a, float %b) {
+entry:
+  %cmp = fcmp olt float %a, %b
+  %cond = select i1 %cmp, float %a, float %b
+  ret float %cond
+}
+; CHECK-LABEL: selectFloatVarVar
+; CHECK: ucomiss
+; CHECK: seta
+; CHECK: fld
+define internal double @selectDoubleVarVar(double %a, double %b) {
+entry:
+  %cmp = fcmp olt double %a, %b
+  %cond = select i1 %cmp, double %a, double %b
+  ret double %cond
+}
+; CHECK-LABEL: selectDoubleVarVar
+; CHECK: ucomisd
+; CHECK: seta
+; CHECK: fld
--- a/tests_lit/llvm2ice_tests/fp.convert.ll
+++ b/tests_lit/llvm2ice_tests/fp.convert.ll
+; This tries to be a comprehensive test of f32 and f64 convert operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+define internal float @fptrunc(double %a) {
+entry:
+  %conv = fptrunc double %a to float
+  ret float %conv
+}
+; CHECK-LABEL: fptrunc
+; CHECK: cvtsd2ss
+; CHECK: fld
+define internal double @fpext(float %a) {
+entry:
+  %conv = fpext float %a to double
+  ret double %conv
+}
+; CHECK-LABEL: fpext
+; CHECK: cvtss2sd
+; CHECK: fld
+define internal i64 @doubleToSigned64(double %a) {
+entry:
+  %conv = fptosi double %a to i64
+  ret i64 %conv
+}
+; CHECK-LABEL: doubleToSigned64
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptosi_f64_i64
+define internal i64 @floatToSigned64(float %a) {
+entry:
+  %conv = fptosi float %a to i64
+  ret i64 %conv
+}
+; CHECK-LABEL: floatToSigned64
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptosi_f32_i64
+define internal i64 @doubleToUnsigned64(double %a) {
+entry:
+  %conv = fptoui double %a to i64
+  ret i64 %conv
+}
+; CHECK-LABEL: doubleToUnsigned64
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f64_i64
+define internal i64 @floatToUnsigned64(float %a) {
+entry:
+  %conv = fptoui float %a to i64
+  ret i64 %conv
+}
+; CHECK-LABEL: floatToUnsigned64
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f32_i64
+define internal i32 @doubleToSigned32(double %a) {
+entry:
+  %conv = fptosi double %a to i32
+  ret i32 %conv
+}
+; CHECK-LABEL: doubleToSigned32
+; CHECK: cvttsd2si
+define internal i32 @doubleToSigned32Const() {
+entry:
+  %conv = fptosi double 867.5309 to i32
+  ret i32 %conv
+}
+; CHECK-LABEL: doubleToSigned32Const
+; CHECK: cvttsd2si
+define internal i32 @floatToSigned32(float %a) {
+entry:
+  %conv = fptosi float %a to i32
+  ret i32 %conv
+}
+; CHECK-LABEL: floatToSigned32
+; CHECK: cvttss2si
+define internal i32 @doubleToUnsigned32(double %a) {
+entry:
+  %conv = fptoui double %a to i32
+  ret i32 %conv
+}
+; CHECK-LABEL: doubleToUnsigned32
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f64_i32
+define internal i32 @floatToUnsigned32(float %a) {
+entry:
+  %conv = fptoui float %a to i32
+  ret i32 %conv
+}
+; CHECK-LABEL: floatToUnsigned32
+; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f32_i32
+define internal i32 @doubleToSigned16(double %a) {
+entry:
+  %conv = fptosi double %a to i16
+  %conv.ret_ext = sext i16 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: doubleToSigned16
+; CHECK: cvttsd2si
+; CHECK: movsx
+define internal i32 @floatToSigned16(float %a) {
+entry:
+  %conv = fptosi float %a to i16
+  %conv.ret_ext = sext i16 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: floatToSigned16
+; CHECK: cvttss2si
+; CHECK: movsx
+define internal i32 @doubleToUnsigned16(double %a) {
+entry:
+  %conv = fptoui double %a to i16
+  %conv.ret_ext = zext i16 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: doubleToUnsigned16
+; CHECK: cvttsd2si
+; CHECK: movzx
+define internal i32 @floatToUnsigned16(float %a) {
+entry:
+  %conv = fptoui float %a to i16
+  %conv.ret_ext = zext i16 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: floatToUnsigned16
+; CHECK: cvttss2si
+; CHECK: movzx
+define internal i32 @doubleToSigned8(double %a) {
+entry:
+  %conv = fptosi double %a to i8
+  %conv.ret_ext = sext i8 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: doubleToSigned8
+; CHECK: cvttsd2si
+; CHECK: movsx
+define internal i32 @floatToSigned8(float %a) {
+entry:
+  %conv = fptosi float %a to i8
+  %conv.ret_ext = sext i8 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: floatToSigned8
+; CHECK: cvttss2si
+; CHECK: movsx
+define internal i32 @doubleToUnsigned8(double %a) {
+entry:
+  %conv = fptoui double %a to i8
+  %conv.ret_ext = zext i8 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: doubleToUnsigned8
+; CHECK: cvttsd2si
+; CHECK: movzx
+define internal i32 @floatToUnsigned8(float %a) {
+entry:
+  %conv = fptoui float %a to i8
+  %conv.ret_ext = zext i8 %conv to i32
+  ret i32 %conv.ret_ext
+}
+; CHECK-LABEL: floatToUnsigned8
+; CHECK: cvttss2si
+; CHECK: movzx
+define internal i32 @doubleToUnsigned1(double %a) {
+entry:
+  %tobool = fptoui double %a to i1
+  %tobool.ret_ext = zext i1 %tobool to i32
+  ret i32 %tobool.ret_ext
+}
+; CHECK-LABEL: doubleToUnsigned1
+; CHECK: cvttsd2si
+; CHECK: and eax,0x1
+define internal i32 @floatToUnsigned1(float %a) {
+entry:
+  %tobool = fptoui float %a to i1
+  %tobool.ret_ext = zext i1 %tobool to i32
+  ret i32 %tobool.ret_ext
+}
+; CHECK-LABEL: floatToUnsigned1
+; CHECK: cvttss2si
+; CHECK: and eax,0x1
+define internal double @signed64ToDouble(i64 %a) {
+entry:
+  %conv = sitofp i64 %a to double
+  ret double %conv
+}
+; CHECK-LABEL: signed64ToDouble
+; CHECK: call {{.*}} R_{{.*}} __Sz_sitofp_i64_f64
+; CHECK: fstp QWORD
+define internal float @signed64ToFloat(i64 %a) {
+entry:
+  %conv = sitofp i64 %a to float
+  ret float %conv
+}
+; CHECK-LABEL: signed64ToFloat
+; CHECK: call {{.*}} R_{{.*}} __Sz_sitofp_i64_f32
+; CHECK: fstp DWORD
+define internal double @unsigned64ToDouble(i64 %a) {
+entry:
+  %conv = uitofp i64 %a to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned64ToDouble
+; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f64
+; CHECK: fstp
+define internal float @unsigned64ToFloat(i64 %a) {
+entry:
+  %conv = uitofp i64 %a to float
+  ret float %conv
+}
+; CHECK-LABEL: unsigned64ToFloat
+; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f32
+; CHECK: fstp
+define internal double @unsigned64ToDoubleConst() {
+entry:
+  %conv = uitofp i64 12345678901234 to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned64ToDoubleConst
+; CHECK: mov DWORD PTR [esp+0x4],0xb3a
+; CHECK: mov DWORD PTR [esp],0x73ce2ff2
+; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f64
+; CHECK: fstp
+define internal double @signed32ToDouble(i32 %a) {
+entry:
+  %conv = sitofp i32 %a to double
+  ret double %conv
+}
+; CHECK-LABEL: signed32ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld
+define internal double @signed32ToDoubleConst() {
+entry:
+  %conv = sitofp i32 123 to double
+  ret double %conv
+}
+; CHECK-LABEL: signed32ToDoubleConst
+; CHECK: cvtsi2sd {{.*[^1]}}
+; CHECK: fld
+define internal float @signed32ToFloat(i32 %a) {
+entry:
+  %conv = sitofp i32 %a to float
+  ret float %conv
+}
+; CHECK-LABEL: signed32ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld
+define internal double @unsigned32ToDouble(i32 %a) {
+entry:
+  %conv = uitofp i32 %a to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned32ToDouble
+; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i32_f64
+; CHECK: fstp QWORD
+define internal float @unsigned32ToFloat(i32 %a) {
+entry:
+  %conv = uitofp i32 %a to float
+  ret float %conv
+}
+; CHECK-LABEL: unsigned32ToFloat
+; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i32_f32
+; CHECK: fstp DWORD
+define internal double @signed16ToDouble(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i16
+  %conv = sitofp i16 %a.arg_trunc to double
+  ret double %conv
+}
+; CHECK-LABEL: signed16ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld QWORD
+define internal float @signed16ToFloat(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i16
+  %conv = sitofp i16 %a.arg_trunc to float
+  ret float %conv
+}
+; CHECK-LABEL: signed16ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld DWORD
+define internal double @unsigned16ToDouble(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i16
+  %conv = uitofp i16 %a.arg_trunc to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned16ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld
+define internal double @unsigned16ToDoubleConst() {
+entry:
+  %conv = uitofp i16 12345 to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned16ToDoubleConst
+; CHECK: cvtsi2sd
+; CHECK: fld
+define internal float @unsigned16ToFloat(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i16
+  %conv = uitofp i16 %a.arg_trunc to float
+  ret float %conv
+}
+; CHECK-LABEL: unsigned16ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld
+define internal double @signed8ToDouble(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i8
+  %conv = sitofp i8 %a.arg_trunc to double
+  ret double %conv
+}
+; CHECK-LABEL: signed8ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld
+define internal float @signed8ToFloat(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i8
+  %conv = sitofp i8 %a.arg_trunc to float
+  ret float %conv
+}
+; CHECK-LABEL: signed8ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld
+define internal double @unsigned8ToDouble(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i8
+  %conv = uitofp i8 %a.arg_trunc to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned8ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld
+define internal float @unsigned8ToFloat(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i8
+  %conv = uitofp i8 %a.arg_trunc to float
+  ret float %conv
+}
+; CHECK-LABEL: unsigned8ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld
+define internal double @unsigned1ToDouble(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i1
+  %conv = uitofp i1 %a.arg_trunc to double
+  ret double %conv
+}
+; CHECK-LABEL: unsigned1ToDouble
+; CHECK: cvtsi2sd
+; CHECK: fld
+; Boolean (i1) to float: the i32 argument is truncated to i1 and converted
+; with uitofp. The expected output contains cvtsi2ss followed by fld.
+define internal float @unsigned1ToFloat(i32 %a) {
+entry:
+  %a.arg_trunc = trunc i32 %a to i1
+  %conv = uitofp i1 %a.arg_trunc to float
+  ret float %conv
+}
+; CHECK-LABEL: unsigned1ToFloat
+; CHECK: cvtsi2ss
+; CHECK: fld
+; Reinterprets the bits of an i32 argument as a float (bitcast, no numeric
+; conversion). The pattern below is a substring match, so any instruction
+; whose text contains "mov" satisfies it.
+define internal float @int32BitcastToFloat(i32 %a) {
+entry:
+  %conv = bitcast i32 %a to float
+  ret float %conv
+}
+; CHECK-LABEL: int32BitcastToFloat
+; CHECK: mov
+; Bitcast of the i32 constant 8675309 to float. As with the non-constant
+; variant, only some instruction containing "mov" is required.
+define internal float @int32BitcastToFloatConst() {
+entry:
+  %conv = bitcast i32 8675309 to float
+  ret float %conv
+}
+; CHECK-LABEL: int32BitcastToFloatConst
+; CHECK: mov
+; Reinterprets the bits of an i64 argument as a double (bitcast, no numeric
+; conversion). Only some instruction containing "mov" is required.
+define internal double @int64BitcastToDouble(i64 %a) {
+entry:
+  %conv = bitcast i64 %a to double
+  ret double %conv
+}
+; CHECK-LABEL: int64BitcastToDouble
+; CHECK: mov
+; Bitcast of the i64 constant 9035768 to double. Only some instruction
+; containing "mov" is required.
+define internal double @int64BitcastToDoubleConst() {
+entry:
+  %conv = bitcast i64 9035768 to double
+  ret double %conv
+}
+; CHECK-LABEL: int64BitcastToDoubleConst
+; CHECK: mov
--- a/tests_lit/llvm2ice_tests/fp.load_store.ll
+++ b/tests_lit/llvm2ice_tests/fp.load_store.ll
+; This tries to be a comprehensive test of f32 and f64 load and store operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+; Loads a float through a pointer formed from the i32 argument (inttoptr) and
+; returns it. The expected output contains movss followed by fld (presumably
+; the load itself and then the move of the result for the return; confirm
+; against the x86-32 lowering).
+define internal float @loadFloat(i32 %a) {
+entry:
+  %__1 = inttoptr i32 %a to float*
+  %v0 = load float, float* %__1, align 4
+  ret float %v0
+}
+; CHECK-LABEL: loadFloat
+; CHECK: movss
+; CHECK: fld
+; Loads a double (8-byte aligned) through a pointer formed from the i32
+; argument and returns it. The expected output contains movsd followed by fld.
+define internal double @loadDouble(i32 %a) {
+entry:
+  %__1 = inttoptr i32 %a to double*
+  %v0 = load double, double* %__1, align 8
+  ret double %v0
+}
+; CHECK-LABEL: loadDouble
+; CHECK: movsd
+; CHECK: fld
+; Stores the float argument through a pointer formed from the i32 argument.
+; Two movss matches are expected (presumably one fetches the argument and one
+; writes it to memory; confirm against the x86-32 lowering).
+define internal void @storeFloat(i32 %a, float %value) {
+entry:
+  %__2 = inttoptr i32 %a to float*
+  store float %value, float* %__2, align 4
+  ret void
+}
+; CHECK-LABEL: storeFloat
+; CHECK: movss
+; CHECK: movss
+; Stores the double argument (8-byte aligned) through a pointer formed from
+; the i32 argument. Two movsd matches are expected.
+define internal void @storeDouble(i32 %a, double %value) {
+entry:
+  %__2 = inttoptr i32 %a to double*
+  store double %value, double* %__2, align 8
+  ret void
+}
+; CHECK-LABEL: storeDouble
+; CHECK: movsd
+; CHECK: movsd
+; Stores a float constant through a pointer formed from the i32 argument.
+; 0x3FF3AE1480000000 is the hexadecimal spelling of the constant
+; (approximately 1.23). Two movss matches are expected (presumably a load of
+; the constant and then the store; confirm against the x86-32 lowering).
+define internal void @storeFloatConst(i32 %a) {
+entry:
+  %a.asptr = inttoptr i32 %a to float*
+  store float 0x3FF3AE1480000000, float* %a.asptr, align 4
+  ret void
+}
+; CHECK-LABEL: storeFloatConst
+; CHECK: movss
+; CHECK: movss
+; Stores the double constant 1.23 (8-byte aligned) through a pointer formed
+; from the i32 argument. Two movsd matches are expected.
+define internal void @storeDoubleConst(i32 %a) {
+entry:
+  %a.asptr = inttoptr i32 %a to double*
+  store double 1.230000e+00, double* %a.asptr, align 8
+  ret void
+}
+; CHECK-LABEL: storeDoubleConst
+; CHECK: movsd
+; CHECK: movsd
--- a/tests_lit/llvm2ice_tests/fp.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fp.pnacl.ll
-; This tries to be a comprehensive test of f32 and f64 operations.
-; The CHECK lines are only checking for basic instruction patterns
-; that should be present regardless of the optimization level, so
-; there are no special OPTM1 match lines.
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
-; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
-@__init_array_start = internal constant [0 x i8] zeroinitializer, align 4
-@__fini_array_start = internal constant [0 x i8] zeroinitializer, align 4
-@__tls_template_start = internal constant [0 x i8] zeroinitializer, align 8
-@__tls_template_alignment = internal constant [4 x i8] c"\01\00\00\00", align 4
-define internal i32 @doubleArgs(double %a, i32 %b, double %c) {
-entry:
-  ret i32 %b
-}
-; CHECK-LABEL: doubleArgs
-; CHECK:      mov eax,DWORD PTR [esp+0xc]
-; CHECK-NEXT: ret
-define internal i32 @floatArgs(float %a, i32 %b, float %c) {
-entry:
-  ret i32 %b
-}
-; CHECK-LABEL: floatArgs
-; CHECK:      mov eax,DWORD PTR [esp+0x8]
-; CHECK-NEXT: ret
-define internal i32 @passFpArgs(float %a, double %b, float %c, double %d, float %e, double %f) {
-entry:
-  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double %b)
-  %call1 = call i32 @ignoreFpArgsNoInline(float %c, i32 123, double %d)
-  %call2 = call i32 @ignoreFpArgsNoInline(float %e, i32 123, double %f)
-  %add = add i32 %call1, %call
-  %add3 = add i32 %add, %call2
-  ret i32 %add3
-}
-; CHECK-LABEL: passFpArgs
-; CHECK: mov DWORD PTR [esp+0x4],0x7b
-; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
-; CHECK: mov DWORD PTR [esp+0x4],0x7b
-; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
-; CHECK: mov DWORD PTR [esp+0x4],0x7b
-; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
-declare i32 @ignoreFpArgsNoInline(float %x, i32 %y, double %z)
-define internal i32 @passFpConstArg(float %a, double %b) {
-entry:
-  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double 2.340000e+00)
-  ret i32 %call
-}
-; CHECK-LABEL: passFpConstArg
-; CHECK: mov DWORD PTR [esp+0x4],0x7b
-; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
-define internal i32 @passFp32ConstArg(float %a) {
-entry:
-  %call = call i32 @ignoreFp32ArgsNoInline(float %a, i32 123, float 2.0)
-  ret i32 %call
-}
-; CHECK-LABEL: passFp32ConstArg
-; CHECK: mov DWORD PTR [esp+0x4],0x7b
-; CHECK: movss DWORD PTR [esp+0x8]
-; CHECK: call {{.*}} R_{{.*}} ignoreFp32ArgsNoInline
-declare i32 @ignoreFp32ArgsNoInline(float %x, i32 %y, float %z)
-define internal float @returnFloatArg(float %a) {
-entry:
-  ret float %a
-}
-; CHECK-LABEL: returnFloatArg
-; CHECK: fld DWORD PTR [esp
-define internal double @returnDoubleArg(double %a) {
-entry:
-  ret double %a
-}
-; CHECK-LABEL: returnDoubleArg
-; CHECK: fld QWORD PTR [esp
-define internal float @returnFloatConst() {
-entry:
-  ret float 0x3FF3AE1480000000
-}
-; CHECK-LABEL: returnFloatConst
-; CHECK: fld
-define internal double @returnDoubleConst() {
-entry:
-  ret double 1.230000e+00
-}
-; CHECK-LABEL: returnDoubleConst
-; CHECK: fld
-define internal float @addFloat(float %a, float %b) {
-entry:
-  %add = fadd float %a, %b
-  ret float %add
-}
-; CHECK-LABEL: addFloat
-; CHECK: addss
-; CHECK: fld
-define internal double @addDouble(double %a, double %b) {
-entry:
-  %add = fadd double %a, %b
-  ret double %add
-}
-; CHECK-LABEL: addDouble
-; CHECK: addsd
-; CHECK: fld
-define internal float @subFloat(float %a, float %b) {
-entry:
-  %sub = fsub float %a, %b
-  ret float %sub
-}
-; CHECK-LABEL: subFloat
-; CHECK: subss
-; CHECK: fld
-define internal double @subDouble(double %a, double %b) {
-entry:
-  %sub = fsub double %a, %b
-  ret double %sub
-}
-; CHECK-LABEL: subDouble
-; CHECK: subsd
-; CHECK: fld
-define internal float @mulFloat(float %a, float %b) {
-entry:
-  %mul = fmul float %a, %b
-  ret float %mul
-}
-; CHECK-LABEL: mulFloat
-; CHECK: mulss
-; CHECK: fld
-define internal double @mulDouble(double %a, double %b) {
-entry:
-  %mul = fmul double %a, %b
-  ret double %mul
-}
-; CHECK-LABEL: mulDouble
-; CHECK: mulsd
-; CHECK: fld
-define internal float @divFloat(float %a, float %b) {
-entry:
-  %div = fdiv float %a, %b
-  ret float %div
-}
-; CHECK-LABEL: divFloat
-; CHECK: divss
-; CHECK: fld
-define internal double @divDouble(double %a, double %b) {
-entry:
-  %div = fdiv double %a, %b
-  ret double %div
-}
-; CHECK-LABEL: divDouble
-; CHECK: divsd
-; CHECK: fld
-define internal float @remFloat(float %a, float %b) {
-entry:
-  %div = frem float %a, %b
-  ret float %div
-}
-; CHECK-LABEL: remFloat
-; CHECK: call {{.*}} R_{{.*}} fmodf
-define internal double @remDouble(double %a, double %b) {
-entry:
-  %div = frem double %a, %b
-  ret double %div
-}
-; CHECK-LABEL: remDouble
-; CHECK: call {{.*}} R_{{.*}} fmod
-define internal float @fptrunc(double %a) {
-entry:
-  %conv = fptrunc double %a to float
-  ret float %conv
-}
-; CHECK-LABEL: fptrunc
-; CHECK: cvtsd2ss
-; CHECK: fld
-define internal double @fpext(float %a) {
-entry:
-  %conv = fpext float %a to double
-  ret double %conv
-}
-; CHECK-LABEL: fpext
-; CHECK: cvtss2sd
-; CHECK: fld
-define internal i64 @doubleToSigned64(double %a) {
-entry:
-  %conv = fptosi double %a to i64
-  ret i64 %conv
-}
-; CHECK-LABEL: doubleToSigned64
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptosi_f64_i64
-define internal i64 @floatToSigned64(float %a) {
-entry:
-  %conv = fptosi float %a to i64
-  ret i64 %conv
-}
-; CHECK-LABEL: floatToSigned64
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptosi_f32_i64
-define internal i64 @doubleToUnsigned64(double %a) {
-entry:
-  %conv = fptoui double %a to i64
-  ret i64 %conv
-}
-; CHECK-LABEL: doubleToUnsigned64
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f64_i64
-define internal i64 @floatToUnsigned64(float %a) {
-entry:
-  %conv = fptoui float %a to i64
-  ret i64 %conv
-}
-; CHECK-LABEL: floatToUnsigned64
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f32_i64
-define internal i32 @doubleToSigned32(double %a) {
-entry:
-  %conv = fptosi double %a to i32
-  ret i32 %conv
-}
-; CHECK-LABEL: doubleToSigned32
-; CHECK: cvttsd2si
-define internal i32 @doubleToSigned32Const() {
-entry:
-  %conv = fptosi double 867.5309 to i32
-  ret i32 %conv
-}
-; CHECK-LABEL: doubleToSigned32Const
-; CHECK: cvttsd2si
-define internal i32 @floatToSigned32(float %a) {
-entry:
-  %conv = fptosi float %a to i32
-  ret i32 %conv
-}
-; CHECK-LABEL: floatToSigned32
-; CHECK: cvttss2si
-define internal i32 @doubleToUnsigned32(double %a) {
-entry:
-  %conv = fptoui double %a to i32
-  ret i32 %conv
-}
-; CHECK-LABEL: doubleToUnsigned32
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f64_i32
-define internal i32 @floatToUnsigned32(float %a) {
-entry:
-  %conv = fptoui float %a to i32
-  ret i32 %conv
-}
-; CHECK-LABEL: floatToUnsigned32
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_f32_i32
-define internal i32 @doubleToSigned16(double %a) {
-entry:
-  %conv = fptosi double %a to i16
-  %conv.ret_ext = sext i16 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: doubleToSigned16
-; CHECK: cvttsd2si
-; CHECK: movsx
-define internal i32 @floatToSigned16(float %a) {
-entry:
-  %conv = fptosi float %a to i16
-  %conv.ret_ext = sext i16 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: floatToSigned16
-; CHECK: cvttss2si
-; CHECK: movsx
-define internal i32 @doubleToUnsigned16(double %a) {
-entry:
-  %conv = fptoui double %a to i16
-  %conv.ret_ext = zext i16 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: doubleToUnsigned16
-; CHECK: cvttsd2si
-; CHECK: movzx
-define internal i32 @floatToUnsigned16(float %a) {
-entry:
-  %conv = fptoui float %a to i16
-  %conv.ret_ext = zext i16 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: floatToUnsigned16
-; CHECK: cvttss2si
-; CHECK: movzx
-define internal i32 @doubleToSigned8(double %a) {
-entry:
-  %conv = fptosi double %a to i8
-  %conv.ret_ext = sext i8 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: doubleToSigned8
-; CHECK: cvttsd2si
-; CHECK: movsx
-define internal i32 @floatToSigned8(float %a) {
-entry:
-  %conv = fptosi float %a to i8
-  %conv.ret_ext = sext i8 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: floatToSigned8
-; CHECK: cvttss2si
-; CHECK: movsx
-define internal i32 @doubleToUnsigned8(double %a) {
-entry:
-  %conv = fptoui double %a to i8
-  %conv.ret_ext = zext i8 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: doubleToUnsigned8
-; CHECK: cvttsd2si
-; CHECK: movzx
-define internal i32 @floatToUnsigned8(float %a) {
-entry:
-  %conv = fptoui float %a to i8
-  %conv.ret_ext = zext i8 %conv to i32
-  ret i32 %conv.ret_ext
-}
-; CHECK-LABEL: floatToUnsigned8
-; CHECK: cvttss2si
-; CHECK: movzx
-define internal i32 @doubleToUnsigned1(double %a) {
-entry:
-  %tobool = fptoui double %a to i1
-  %tobool.ret_ext = zext i1 %tobool to i32
-  ret i32 %tobool.ret_ext
-}
-; CHECK-LABEL: doubleToUnsigned1
-; CHECK: cvttsd2si
-; CHECK: and eax,0x1
-define internal i32 @floatToUnsigned1(float %a) {
-entry:
-  %tobool = fptoui float %a to i1
-  %tobool.ret_ext = zext i1 %tobool to i32
-  ret i32 %tobool.ret_ext
-}
-; CHECK-LABEL: floatToUnsigned1
-; CHECK: cvttss2si
-; CHECK: and eax,0x1
-define internal double @signed64ToDouble(i64 %a) {
-entry:
-  %conv = sitofp i64 %a to double
-  ret double %conv
-}
-; CHECK-LABEL: signed64ToDouble
-; CHECK: call {{.*}} R_{{.*}} __Sz_sitofp_i64_f64
-; CHECK: fstp QWORD
-define internal float @signed64ToFloat(i64 %a) {
-entry:
-  %conv = sitofp i64 %a to float
-  ret float %conv
-}
-; CHECK-LABEL: signed64ToFloat
-; CHECK: call {{.*}} R_{{.*}} __Sz_sitofp_i64_f32
-; CHECK: fstp DWORD
-define internal double @unsigned64ToDouble(i64 %a) {
-entry:
-  %conv = uitofp i64 %a to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned64ToDouble
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f64
-; CHECK: fstp
-define internal float @unsigned64ToFloat(i64 %a) {
-entry:
-  %conv = uitofp i64 %a to float
-  ret float %conv
-}
-; CHECK-LABEL: unsigned64ToFloat
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f32
-; CHECK: fstp
-define internal double @unsigned64ToDoubleConst() {
-entry:
-  %conv = uitofp i64 12345678901234 to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned64ToDouble
-; CHECK: mov DWORD PTR [esp+0x4],0xb3a
-; CHECK: mov DWORD PTR [esp],0x73ce2ff2
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i64_f64
-; CHECK: fstp
-define internal double @signed32ToDouble(i32 %a) {
-entry:
-  %conv = sitofp i32 %a to double
-  ret double %conv
-}
-; CHECK-LABEL: signed32ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal double @signed32ToDoubleConst() {
-entry:
-  %conv = sitofp i32 123 to double
-  ret double %conv
-}
-; CHECK-LABEL: signed32ToDoubleConst
-; CHECK: cvtsi2sd {{.*[^1]}}
-; CHECK: fld
-define internal float @signed32ToFloat(i32 %a) {
-entry:
-  %conv = sitofp i32 %a to float
-  ret float %conv
-}
-; CHECK-LABEL: signed32ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld
-define internal double @unsigned32ToDouble(i32 %a) {
-entry:
-  %conv = uitofp i32 %a to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned32ToDouble
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i32_f64
-; CHECK: fstp QWORD
-define internal float @unsigned32ToFloat(i32 %a) {
-entry:
-  %conv = uitofp i32 %a to float
-  ret float %conv
-}
-; CHECK-LABEL: unsigned32ToFloat
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_i32_f32
-; CHECK: fstp DWORD
-define internal double @signed16ToDouble(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i16
-  %conv = sitofp i16 %a.arg_trunc to double
-  ret double %conv
-}
-; CHECK-LABEL: signed16ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld QWORD
-define internal float @signed16ToFloat(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i16
-  %conv = sitofp i16 %a.arg_trunc to float
-  ret float %conv
-}
-; CHECK-LABEL: signed16ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld DWORD
-define internal double @unsigned16ToDouble(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i16
-  %conv = uitofp i16 %a.arg_trunc to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned16ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal double @unsigned16ToDoubleConst() {
-entry:
-  %conv = uitofp i16 12345 to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned16ToDoubleConst
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal float @unsigned16ToFloat(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i16
-  %conv = uitofp i16 %a.arg_trunc to float
-  ret float %conv
-}
-; CHECK-LABEL: unsigned16ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld
-define internal double @signed8ToDouble(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i8
-  %conv = sitofp i8 %a.arg_trunc to double
-  ret double %conv
-}
-; CHECK-LABEL: signed8ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal float @signed8ToFloat(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i8
-  %conv = sitofp i8 %a.arg_trunc to float
-  ret float %conv
-}
-; CHECK-LABEL: signed8ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld
-define internal double @unsigned8ToDouble(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i8
-  %conv = uitofp i8 %a.arg_trunc to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned8ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal float @unsigned8ToFloat(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i8
-  %conv = uitofp i8 %a.arg_trunc to float
-  ret float %conv
-}
-; CHECK-LABEL: unsigned8ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld
-define internal double @unsigned1ToDouble(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i1
-  %conv = uitofp i1 %a.arg_trunc to double
-  ret double %conv
-}
-; CHECK-LABEL: unsigned1ToDouble
-; CHECK: cvtsi2sd
-; CHECK: fld
-define internal float @unsigned1ToFloat(i32 %a) {
-entry:
-  %a.arg_trunc = trunc i32 %a to i1
-  %conv = uitofp i1 %a.arg_trunc to float
-  ret float %conv
-}
-; CHECK-LABEL: unsigned1ToFloat
-; CHECK: cvtsi2ss
-; CHECK: fld
-define internal float @int32BitcastToFloat(i32 %a) {
-entry:
-  %conv = bitcast i32 %a to float
-  ret float %conv
-}
-; CHECK-LABEL: int32BitcastToFloat
-; CHECK: mov
-define internal float @int32BitcastToFloatConst() {
-entry:
-  %conv = bitcast i32 8675309 to float
-  ret float %conv
-}
-; CHECK-LABEL: int32BitcastToFloatConst
-; CHECK: mov
-define internal double @int64BitcastToDouble(i64 %a) {
-entry:
-  %conv = bitcast i64 %a to double
-  ret double %conv
-}
-; CHECK-LABEL: int64BitcastToDouble
-; CHECK: mov
-define internal double @int64BitcastToDoubleConst() {
-entry:
-  %conv = bitcast i64 9035768 to double
-  ret double %conv
-}
-; CHECK-LABEL: int64BitcastToDoubleConst
-; CHECK: mov
-define internal void @fcmpEq(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp oeq float %a, %b
-  br i1 %cmp, label %if.then, label %if.end
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %if.then, %entry
-  %cmp1 = fcmp oeq double %c, %d
-  br i1 %cmp1, label %if.then2, label %if.end3
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.then2, %if.end
-  ret void
-}
-; CHECK-LABEL: fcmpEq
-; CHECK: ucomiss
-; CHECK: jne
-; CHECK-NEXT: jp
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: jne
-; CHECK-NEXT: jp
-; CHECK: call {{.*}} R_{{.*}} func
-declare void @func()
-define internal void @fcmpNe(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp une float %a, %b
-  br i1 %cmp, label %if.then, label %if.end
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %if.then, %entry
-  %cmp1 = fcmp une double %c, %d
-  br i1 %cmp1, label %if.then2, label %if.end3
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.then2, %if.end
-  ret void
-}
-; CHECK-LABEL: fcmpNe
-; CHECK: ucomiss
-; CHECK: jne
-; CHECK-NEXT: jp
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: jne
-; CHECK-NEXT: jp
-; CHECK: call {{.*}} R_{{.*}} func
-define internal void @fcmpGt(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp ogt float %a, %b
-  br i1 %cmp, label %if.then, label %if.end
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %if.then, %entry
-  %cmp1 = fcmp ogt double %c, %d
-  br i1 %cmp1, label %if.then2, label %if.end3
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.then2, %if.end
-  ret void
-}
-; CHECK-LABEL: fcmpGt
-; CHECK: ucomiss
-; CHECK: seta
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: seta
-; CHECK: call {{.*}} R_{{.*}} func
-define internal void @fcmpGe(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp ult float %a, %b
-  br i1 %cmp, label %if.end, label %if.then
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %entry, %if.then
-  %cmp1 = fcmp ult double %c, %d
-  br i1 %cmp1, label %if.end3, label %if.then2
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.end, %if.then2
-  ret void
-}
-; CHECK-LABEL: fcmpGe
-; CHECK: ucomiss
-; CHECK: setb
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: setb
-; CHECK: call {{.*}} R_{{.*}} func
-define internal void @fcmpLt(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp olt float %a, %b
-  br i1 %cmp, label %if.then, label %if.end
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %if.then, %entry
-  %cmp1 = fcmp olt double %c, %d
-  br i1 %cmp1, label %if.then2, label %if.end3
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.then2, %if.end
-  ret void
-}
-; CHECK-LABEL: fcmpLt
-; CHECK: ucomiss
-; CHECK: seta
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: seta
-; CHECK: call {{.*}} R_{{.*}} func
-define internal void @fcmpLe(float %a, float %b, double %c, double %d) {
-entry:
-  %cmp = fcmp ugt float %a, %b
-  br i1 %cmp, label %if.end, label %if.then
-if.then:                                          ; preds = %entry
-  call void @func()
-  br label %if.end
-if.end:                                           ; preds = %entry, %if.then
-  %cmp1 = fcmp ugt double %c, %d
-  br i1 %cmp1, label %if.end3, label %if.then2
-if.then2:                                         ; preds = %if.end
-  call void @func()
-  br label %if.end3
-if.end3:                                          ; preds = %if.end, %if.then2
-  ret void
-}
-; CHECK-LABEL: fcmpLe
-; CHECK: ucomiss
-; CHECK: setb
-; CHECK: call {{.*}} R_{{.*}} func
-; CHECK: ucomisd
-; CHECK: setb
-; CHECK: call {{.*}} R_{{.*}} func
-define internal i32 @fcmpFalseFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp false float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpFalseFloat
-; CHECK: mov {{.*}},0x0
-define internal i32 @fcmpFalseDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp false double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpFalseDouble
-; CHECK: mov {{.*}},0x0
-define internal i32 @fcmpOeqFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp oeq float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOeqFloat
-; CHECK: ucomiss
-; CHECK: jne
-; CHECK: jp
-define internal i32 @fcmpOeqDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp oeq double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOeqDouble
-; CHECK: ucomisd
-; CHECK: jne
-; CHECK: jp
-define internal i32 @fcmpOgtFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ogt float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOgtFloat
-; CHECK: ucomiss
-; CHECK: seta
-define internal i32 @fcmpOgtDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ogt double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOgtDouble
-; CHECK: ucomisd
-; CHECK: seta
-define internal i32 @fcmpOgeFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp oge float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOgeFloat
-; CHECK: ucomiss
-; CHECK: setae
-define internal i32 @fcmpOgeDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp oge double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOgeDouble
-; CHECK: ucomisd
-; CHECK: setae
-define internal i32 @fcmpOltFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp olt float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOltFloat
-; CHECK: ucomiss
-; CHECK: seta
-define internal i32 @fcmpOltDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp olt double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOltDouble
-; CHECK: ucomisd
-; CHECK: seta
-define internal i32 @fcmpOleFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ole float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOleFloat
-; CHECK: ucomiss
-; CHECK: setae
-define internal i32 @fcmpOleDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ole double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOleDouble
-; CHECK: ucomisd
-; CHECK: setae
-define internal i32 @fcmpOneFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp one float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOneFloat
-; CHECK: ucomiss
-; CHECK: setne
-define internal i32 @fcmpOneDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp one double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOneDouble
-; CHECK: ucomisd
-; CHECK: setne
-define internal i32 @fcmpOrdFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ord float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOrdFloat
-; CHECK: ucomiss
-; CHECK: setnp
-define internal i32 @fcmpOrdDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ord double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpOrdDouble
-; CHECK: ucomisd
-; CHECK: setnp
-define internal i32 @fcmpUeqFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ueq float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUeqFloat
-; CHECK: ucomiss
-; CHECK: sete
-define internal i32 @fcmpUeqDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ueq double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUeqDouble
-; CHECK: ucomisd
-; CHECK: sete
-define internal i32 @fcmpUgtFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ugt float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUgtFloat
-; CHECK: ucomiss
-; CHECK: setb
-define internal i32 @fcmpUgtDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ugt double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUgtDouble
-; CHECK: ucomisd
-; CHECK: setb
-define internal i32 @fcmpUgeFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp uge float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUgeFloat
-; CHECK: ucomiss
-; CHECK: setbe
-define internal i32 @fcmpUgeDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp uge double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUgeDouble
-; CHECK: ucomisd
-; CHECK: setbe
-define internal i32 @fcmpUltFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ult float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUltFloat
-; CHECK: ucomiss
-; CHECK: setb
-define internal i32 @fcmpUltDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ult double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUltDouble
-; CHECK: ucomisd
-; CHECK: setb
-define internal i32 @fcmpUleFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp ule float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUleFloat
-; CHECK: ucomiss
-; CHECK: setbe
-define internal i32 @fcmpUleDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp ule double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUleDouble
-; CHECK: ucomisd
-; CHECK: setbe
-define internal i32 @fcmpUneFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp une float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUneFloat
-; CHECK: ucomiss
-; CHECK: jne
-; CHECK: jp
-define internal i32 @fcmpUneDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp une double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUneDouble
-; CHECK: ucomisd
-; CHECK: jne
-; CHECK: jp
-define internal i32 @fcmpUnoFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp uno float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUnoFloat
-; CHECK: ucomiss
-; CHECK: setp
-define internal i32 @fcmpUnoDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp uno double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpUnoDouble
-; CHECK: ucomisd
-; CHECK: setp
-define internal i32 @fcmpTrueFloat(float %a, float %b) {
-entry:
-  %cmp = fcmp true float %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpTrueFloat
-; CHECK: mov {{.*}},0x1
-define internal i32 @fcmpTrueDouble(double %a, double %b) {
-entry:
-  %cmp = fcmp true double %a, %b
-  %cmp.ret_ext = zext i1 %cmp to i32
-  ret i32 %cmp.ret_ext
-}
-; CHECK-LABEL: fcmpTrueDouble
-; CHECK: mov {{.*}},0x1
-define internal float @loadFloat(i32 %a) {
-entry:
-  %__1 = inttoptr i32 %a to float*
-  %v0 = load float, float* %__1, align 4
-  ret float %v0
-}
-; CHECK-LABEL: loadFloat
-; CHECK: movss
-; CHECK: fld
-define internal double @loadDouble(i32 %a) {
-entry:
-  %__1 = inttoptr i32 %a to double*
-  %v0 = load double, double* %__1, align 8
-  ret double %v0
-}
-; CHECK-LABEL: loadDouble
-; CHECK: movsd
-; CHECK: fld
-define internal void @storeFloat(i32 %a, float %value) {
-entry:
-  %__2 = inttoptr i32 %a to float*
-  store float %value, float* %__2, align 4
-  ret void
-}
-; CHECK-LABEL: storeFloat
-; CHECK: movss
-; CHECK: movss
-define internal void @storeDouble(i32 %a, double %value) {
-entry:
-  %__2 = inttoptr i32 %a to double*
-  store double %value, double* %__2, align 8
-  ret void
-}
-; CHECK-LABEL: storeDouble
-; CHECK: movsd
-; CHECK: movsd
-define internal void @storeFloatConst(i32 %a) {
-entry:
-  %a.asptr = inttoptr i32 %a to float*
-  store float 0x3FF3AE1480000000, float* %a.asptr, align 4
-  ret void
-}
-; CHECK-LABEL: storeFloatConst
-; CHECK: movss
-; CHECK: movss
-define internal void @storeDoubleConst(i32 %a) {
-entry:
-  %a.asptr = inttoptr i32 %a to double*
-  store double 1.230000e+00, double* %a.asptr, align 8
-  ret void
-}
-; CHECK-LABEL: storeDoubleConst
-; CHECK: movsd
-; CHECK: movsd
-define internal float @selectFloatVarVar(float %a, float %b) {
-entry:
-  %cmp = fcmp olt float %a, %b
-  %cond = select i1 %cmp, float %a, float %b
-  ret float %cond
-}
-; CHECK-LABEL: selectFloatVarVar
-; CHECK: ucomiss
-; CHECK: seta
-; CHECK: fld
-define internal double @selectDoubleVarVar(double %a, double %b) {
-entry:
-  %cmp = fcmp olt double %a, %b
-  %cond = select i1 %cmp, double %a, double %b
-  ret double %cond
-}
-; CHECK-LABEL: selectDoubleVarVar
-; CHECK: ucomisd
-; CHECK: seta
-; CHECK: fld
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -150,6 +150,11 @@ entry:
 ; CHECK: sqrtss xmm{{.*}}
 ; CHECK: sqrtss xmm{{.*}}
 ; CHECK: sqrtss xmm{{.*}},DWORD PTR
+; ARM32-LABEL: test_sqrt_float
+; ARM32: vsqrt.f32
+; ARM32: vsqrt.f32
+; ARM32: vsqrt.f32
+; ARM32: vadd.f32
 define float @test_sqrt_float_mergeable_load(float %x, i32 %iptr) {
 entry:
@@ -164,6 +169,9 @@ entry:
 ; current folding only handles load + arithmetic op. The sqrt inst
 ; is considered an intrinsic call and not an arithmetic op.
 ; CHECK: sqrtss xmm{{.*}}
+; ARM32-LABEL: test_sqrt_float_mergeable_load
+; ARM32: vldr s{{.*}}
+; ARM32: vsqrt.f32
 define double @test_sqrt_double(double %x, i32 %iptr) {
 entry:
@@ -177,6 +185,11 @@ entry:
 ; CHECK: sqrtsd xmm{{.*}}
 ; CHECK: sqrtsd xmm{{.*}}
 ; CHECK: sqrtsd xmm{{.*}},QWORD PTR
+; ARM32-LABEL: test_sqrt_double
+; ARM32: vsqrt.f64
+; ARM32: vsqrt.f64
+; ARM32: vsqrt.f64
+; ARM32: vadd.f64
 define double @test_sqrt_double_mergeable_load(double %x, i32 %iptr) {
 entry:
@@ -188,6 +201,9 @@ entry:
 }
 ; CHECK-LABEL: test_sqrt_double_mergeable_load
 ; CHECK: sqrtsd xmm{{.*}}
+; ARM32-LABEL: test_sqrt_double_mergeable_load
+; ARM32: vldr d{{.*}}
+; ARM32: vsqrt.f64
 define float @test_sqrt_ignored(float %x, double %y) {
 entry: