Add the ARM32 FP register table entries, simple arith, and args.

Lower some instructions, without much guarantee of correctness. *Running* generated code will be risky because the register allocator isn't aware of register aliasing. Fill in v{add,div,mul,sub}.f{32,64}, vmov, vldr and vsqrt.f{32,64}. I tried to make the nacl-other-intrinsics test not explode, so I added vsqrt too. That was pretty easy for sqrt, but then fabs tests also exploded. Those are not truly fixed but are currently "fixed" by adding a FakeDef to satisfy liveness. Propagate float/double arguments to the right register in lowerArguments and lowerCall, and propagate to s0/d0/q0 for lowerReturn. May need to double check the calling convention. Currently can't test call-ret because vpush/vpop for prologues and epilogues isn't done. Legalize FP immediates to make the nacl-other-intrinsics sqrt test happy. Use the correct type of load: vldr (the .32 and .64 suffixes are optional) instead of ldr{b,h,,d}. Whether or not the float/vector instructions can be predicated is a bit interesting. The float/double ones can, but the SIMD versions cannot. E.g., "vadd<cond>.f32 s0, s0, s1" is okay, but "vadd<cond>.f32 q0, q0, q1" is not okay. For now, just omit conditions from instructions that may end up being reused for SIMD. Split up the fp.pnacl.ll test into multiple ones so that parts of lowering can be tested incrementally. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1266263003 .

Add the ARM32 FP register table entries, simple arith, and args.
86ebec12 · Jan Voung · f4fbf7fd · 86ebec12 · 86ebec12 · 86ebec12
Commit 86ebec12 authored Aug 09, 2015 by Jan Voung
13 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -29,11 +29,12 @@ namespace {
 const struct TypeARM32Attributes_ {
  const char *WidthString;    // b, h, <blank>, or d
+  const char *VecWidthString; // i8, i16, i32, f32, f64
  int8_t SExtAddrOffsetBits;
  int8_t ZExtAddrOffsetBits;
 } TypeARM32Attributes[] = {
-#define X(tag, elementty, width, sbits, ubits)                                 \
+#define X(tag, elementty, int_width, vec_width, sbits, ubits)                  \
-  { width, sbits, ubits }                                                      \
+  { int_width, vec_width, sbits, ubits }                                       \
  ,
    ICETYPEARM32_TABLE
 #undef X
@@ -66,6 +67,10 @@ const char *InstARM32::getWidthString(Type Ty) {
  return TypeARM32Attributes[Ty].WidthString;
 }
+const char *InstARM32::getVecWidthString(Type Ty) {
+  return TypeARM32Attributes[Ty].VecWidthString;
+}
 const char *InstARM32Pred::predString(CondARM32::Cond Pred) {
  return InstARM32CondAttributes[Pred].EmitString;
 }
@@ -94,6 +99,18 @@ void InstARM32Pred::emitUnaryopGPR(const char *Opcode,
  Inst->getSrc(0)->emit(Func);
 }
+void InstARM32Pred::emitUnaryopFP(const char *Opcode, const InstARM32Pred *Inst,
+                                  const Cfg *Func) {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 1);
+  Type SrcTy = Inst->getSrc(0)->getType();
+  Str << "\t" << Opcode << Inst->getPredicate() << getVecWidthString(SrcTy)
+      << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+}
 void InstARM32Pred::emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                                const Cfg *Func) {
  if (!BuildDefs::dump())
@@ -123,6 +140,21 @@ void InstARM32Pred::emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
  Inst->getSrc(1)->emit(Func);
 }
+void InstARM32::emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
+                                const Cfg *Func) {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 2);
+  Str << "\t" << Opcode << getVecWidthString(Inst->getDest()->getType())
+      << "\t";
+  Inst->getDest()->emit(Func);
+  Str << ", ";
+  Inst->getSrc(0)->emit(Func);
+  Str << ", ";
+  Inst->getSrc(1)->emit(Func);
+}
 void InstARM32Pred::emitFourAddr(const char *Opcode, const InstARM32Pred *Inst,
                                 const Cfg *Func) {
  if (!BuildDefs::dump())
@@ -304,12 +336,6 @@ IceString InstARM32Label::getName(const Cfg *Func) const {
  return ".L" + Func->getFunctionName() + "$local$__" + std::to_string(Number);
 }
-InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-                           CondARM32::Cond Predicate)
-    : InstARM32Pred(Func, InstARM32::Ldr, 1, Dest, Predicate) {
-  addSource(Mem);
-}
 InstARM32Pop::InstARM32Pop(Cfg *Func, const VarList &Dests)
    : InstARM32(Func, InstARM32::Pop, 0, nullptr), Dests(Dests) {
  // Track modifications to Dests separately via FakeDefs.
@@ -363,8 +389,14 @@ template <> const char *InstARM32Rbit::Opcode = "rbit";
 template <> const char *InstARM32Rev::Opcode = "rev";
 template <> const char *InstARM32Sxt::Opcode = "sxt"; // still requires b/h
 template <> const char *InstARM32Uxt::Opcode = "uxt"; // still requires b/h
+// FP
+template <> const char *InstARM32Vsqrt::Opcode = "vsqrt";
 // Mov-like ops
+template <> const char *InstARM32Ldr::Opcode = "ldr";
 template <> const char *InstARM32Mov::Opcode = "mov";
+// FP
+template <> const char *InstARM32Vldr::Opcode = "vldr";
+template <> const char *InstARM32Vmov::Opcode = "vmov";
 // Three-addr ops
 template <> const char *InstARM32Adc::Opcode = "adc";
 template <> const char *InstARM32Add::Opcode = "add";
@@ -381,6 +413,11 @@ template <> const char *InstARM32Sbc::Opcode = "sbc";
 template <> const char *InstARM32Sdiv::Opcode = "sdiv";
 template <> const char *InstARM32Sub::Opcode = "sub";
 template <> const char *InstARM32Udiv::Opcode = "udiv";
+// FP
+template <> const char *InstARM32Vadd::Opcode = "vadd";
+template <> const char *InstARM32Vdiv::Opcode = "vdiv";
+template <> const char *InstARM32Vmul::Opcode = "vmul";
+template <> const char *InstARM32Vsub::Opcode = "vsub";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
 template <> const char *InstARM32Mls::Opcode = "mls";
@@ -403,19 +440,19 @@ template <> void InstARM32Mov::emit(const Cfg *Func) const {
  assert(getSrcSize() == 1);
  Variable *Dest = getDest();
  if (Dest->hasReg()) {
-    IceString Opcode = "mov";
+    IceString ActualOpcode = Opcode;
    Operand *Src0 = getSrc(0);
    if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
      if (!Src0V->hasReg()) {
        // Always use the whole stack slot. A 32-bit load has a larger range
        // of offsets than 16-bit, etc.
-        Opcode = IceString("ldr");
+        ActualOpcode = IceString("ldr");
      }
    } else {
      if (llvm::isa<OperandARM32Mem>(Src0))
-        Opcode = IceString("ldr") + getWidthString(Dest->getType());
+        ActualOpcode = IceString("ldr") + getWidthString(Dest->getType());
    }
-    Str << "\t" << Opcode << getPredicate() << "\t";
+    Str << "\t" << ActualOpcode << getPredicate() << "\t";
    getDest()->emit(Func);
    Str << ", ";
    getSrc(0)->emit(Func);
@@ -436,6 +473,64 @@ template <> void InstARM32Mov::emitIAS(const Cfg *Func) const {
  llvm_unreachable("Not yet implemented");
 }
+template <> void InstARM32Vldr::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Str << "\t"<< Opcode << getPredicate() << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+}
+template <> void InstARM32Vldr::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+template <> void InstARM32Vmov::emit(const Cfg *Func) const {
+  if (!BuildDefs::dump())
+    return;
+  assert(CondARM32::AL == getPredicate());
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Variable *Dest = getDest();
+  if (Dest->hasReg()) {
+    IceString ActualOpcode = Opcode;
+    Operand *Src0 = getSrc(0);
+    if (const auto *Src0V = llvm::dyn_cast<Variable>(Src0)) {
+      if (!Src0V->hasReg()) {
+        ActualOpcode = IceString("vldr");
+      }
+    } else {
+      if (llvm::isa<OperandARM32Mem>(Src0))
+        ActualOpcode = IceString("vldr");
+    }
+    Str << "\t" << ActualOpcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+  } else {
+    Variable *Src0 = llvm::cast<Variable>(getSrc(0));
+    assert(Src0->hasReg());
+    Str << "\t"
+           "vstr"
+           "\t";
+    Src0->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+  }
+}
+template <> void InstARM32Vmov::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
 void InstARM32Br::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
@@ -547,37 +642,25 @@ void InstARM32Label::dump(const Cfg *Func) const {
  Str << getName(Func) << ":";
 }
-void InstARM32Ldr::emit(const Cfg *Func) const {
+template <> void InstARM32Ldr::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Func->getContext()->getStrEmit();
  assert(getSrcSize() == 1);
  assert(getDest()->hasReg());
  Type Ty = getSrc(0)->getType();
-  Str << "\t"
+  Str << "\t"<< Opcode << getWidthString(Ty) << getPredicate() << "\t";
-      << "ldr" << getWidthString(Ty) << getPredicate() << "\t";
  getDest()->emit(Func);
  Str << ", ";
  getSrc(0)->emit(Func);
 }
-void InstARM32Ldr::emitIAS(const Cfg *Func) const {
+template <> void InstARM32Ldr::emitIAS(const Cfg *Func) const {
  assert(getSrcSize() == 1);
  (void)Func;
  llvm_unreachable("Not yet implemented");
 }
-void InstARM32Ldr::dump(const Cfg *Func) const {
-  if (!BuildDefs::dump())
-    return;
-  Ostream &Str = Func->getContext()->getStrDump();
-  dumpDest(Func);
-  Str << " = ";
-  dumpOpcodePred(Str, "ldr", getDest()->getType());
-  Str << " ";
-  dumpSources(Func);
-}
 template <> void InstARM32Movw::emit(const Cfg *Func) const {
  if (!BuildDefs::dump())
    return;

--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -320,12 +320,24 @@ public:
    Tst,
    Udiv,
    Umull,
-    Uxt
+    Uxt,
+    Vadd,
+    Vdiv,
+    Vldr,
+    Vmov,
+    Vmul,
+    Vsqrt,
+    Vsub
  };
  static const char *getWidthString(Type Ty);
+  static const char *getVecWidthString(Type Ty);
  static CondARM32::Cond getOppositeCondition(CondARM32::Cond Cond);
+  /// Shared emit routines for common forms of instructions.
+  static void emitThreeAddrFP(const char *Opcode, const InstARM32 *Inst,
+                              const Cfg *Func);
  void dump(const Cfg *Func) const override;
 protected:
@@ -357,6 +369,8 @@ public:
  /// Shared emit routines for common forms of instructions.
  static void emitUnaryopGPR(const char *Opcode, const InstARM32Pred *Inst,
                             const Cfg *Func, bool NeedsWidthSuffix);
+  static void emitUnaryopFP(const char *Opcode, const InstARM32Pred *Inst,
+                            const Cfg *Func);
  static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
                          const Cfg *Func);
  static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
@@ -420,6 +434,50 @@ private:
  static const char *Opcode;
 };
+/// Instructions of the form x := op(y), for vector/FP.
+template <InstARM32::InstKindARM32 K>
+class InstARM32UnaryopFP : public InstARM32Pred {
+  InstARM32UnaryopFP() = delete;
+  InstARM32UnaryopFP(const InstARM32UnaryopFP &) = delete;
+  InstARM32UnaryopFP &operator=(const InstARM32UnaryopFP &) = delete;
+public:
+  static InstARM32UnaryopFP *create(Cfg *Func, Variable *Dest, Variable *Src,
+                                    CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32UnaryopFP>())
+        InstARM32UnaryopFP(Func, Dest, Src, Predicate);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    emitUnaryopFP(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+private:
+  InstARM32UnaryopFP(Cfg *Func, Variable *Dest, Operand *Src,
+                     CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 1, Dest, Predicate) {
+    addSource(Src);
+  }
+  static const char *Opcode;
+};
 /// Instructions of the form x := x op y.
 template <InstARM32::InstKindARM32 K>
 class InstARM32TwoAddrGPR : public InstARM32Pred {
@@ -559,7 +617,56 @@ private:
  bool SetFlags;
 };
-// Instructions of the form x := a op1 (y op2 z). E.g., multiply accumulate.
+/// Instructions of the form x := y op z, for vector/FP.  We leave these as
+/// unconditional: "ARM deprecates the conditional execution of any instruction
+/// encoding provided by the Advanced SIMD Extension that is not also provided
+/// by the Floating-point (VFP) extension".  They do not set flags.
+template <InstARM32::InstKindARM32 K>
+class InstARM32ThreeAddrFP : public InstARM32 {
+  InstARM32ThreeAddrFP() = delete;
+  InstARM32ThreeAddrFP(const InstARM32ThreeAddrFP &) = delete;
+  InstARM32ThreeAddrFP &operator=(const InstARM32ThreeAddrFP &) = delete;
+public:
+  /// Create a vector/FP binary-op instruction like vadd and vsub.
+  /// Everything must be a register.
+  static InstARM32ThreeAddrFP *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                      Variable *Src1) {
+    return new (Func->allocate<InstARM32ThreeAddrFP>())
+        InstARM32ThreeAddrFP(Func, Dest, Src0, Src1);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    emitThreeAddrFP(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!BuildDefs::dump())
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = ";
+    Str << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+private:
+  InstARM32ThreeAddrFP(Cfg *Func, Variable *Dest, Variable *Src0,
+                       Variable *Src1)
+      : InstARM32(Func, K, 2, Dest) {
+    addSource(Src0);
+    addSource(Src1);
+  }
+  static const char *Opcode;
+};
+/// Instructions of the form x := a op1 (y op2 z). E.g., multiply accumulate.
 template <InstARM32::InstKindARM32 K>
 class InstARM32FourAddrGPR : public InstARM32Pred {
  InstARM32FourAddrGPR() = delete;
@@ -608,7 +715,7 @@ private:
  static const char *Opcode;
 };
-// Instructions of the form x cmpop y (setting flags).
+/// Instructions of the form x cmpop y (setting flags).
 template <InstARM32::InstKindARM32 K>
 class InstARM32CmpLike : public InstARM32Pred {
  InstARM32CmpLike() = delete;
@@ -666,10 +773,19 @@ typedef InstARM32ThreeAddrGPR<InstARM32::Sbc> InstARM32Sbc;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sdiv> InstARM32Sdiv;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sub> InstARM32Sub;
 typedef InstARM32ThreeAddrGPR<InstARM32::Udiv> InstARM32Udiv;
+typedef InstARM32ThreeAddrFP<InstARM32::Vadd> InstARM32Vadd;
+typedef InstARM32ThreeAddrFP<InstARM32::Vdiv> InstARM32Vdiv;
+typedef InstARM32ThreeAddrFP<InstARM32::Vmul> InstARM32Vmul;
+typedef InstARM32ThreeAddrFP<InstARM32::Vsub> InstARM32Vsub;
+typedef InstARM32Movlike<InstARM32::Ldr> InstARM32Ldr;
 /// Move instruction (variable <- flex). This is more of a pseudo-inst.
 /// If var is a register, then we use "mov". If var is stack, then we use
 /// "str" to store to the stack.
 typedef InstARM32Movlike<InstARM32::Mov> InstARM32Mov;
+/// Represents various vector mov instruction forms (simple single source,
+/// single dest forms only, not the 2 GPR <-> 1 D reg forms, etc.).
+typedef InstARM32Movlike<InstARM32::Vmov> InstARM32Vmov;
+typedef InstARM32Movlike<InstARM32::Vldr> InstARM32Vldr;
 /// MovT leaves the bottom bits alone so dest is also a source.
 /// This helps indicate that a previous MovW setting dest is not dead code.
 typedef InstARM32TwoAddrGPR<InstARM32::Movt> InstARM32Movt;
@@ -683,6 +799,7 @@ typedef InstARM32UnaryopGPR<InstARM32::Rev, false> InstARM32Rev;
 // but we aren't using that for now, so just model as a Unaryop.
 typedef InstARM32UnaryopGPR<InstARM32::Sxt, true> InstARM32Sxt;
 typedef InstARM32UnaryopGPR<InstARM32::Uxt, true> InstARM32Uxt;
+typedef InstARM32UnaryopFP<InstARM32::Vsqrt> InstARM32Vsqrt;
 typedef InstARM32FourAddrGPR<InstARM32::Mla> InstARM32Mla;
 typedef InstARM32FourAddrGPR<InstARM32::Mls> InstARM32Mls;
 typedef InstARM32CmpLike<InstARM32::Cmp> InstARM32Cmp;
@@ -838,29 +955,6 @@ private:
  InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
 };
-/// Load instruction.
-class InstARM32Ldr : public InstARM32Pred {
-  InstARM32Ldr() = delete;
-  InstARM32Ldr(const InstARM32Ldr &) = delete;
-  InstARM32Ldr &operator=(const InstARM32Ldr &) = delete;
-public:
-  /// Dest must be a register.
-  static InstARM32Ldr *create(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-                              CondARM32::Cond Predicate) {
-    return new (Func->allocate<InstARM32Ldr>())
-        InstARM32Ldr(Func, Dest, Mem, Predicate);
-  }
-  void emit(const Cfg *Func) const override;
-  void emitIAS(const Cfg *Func) const override;
-  void dump(const Cfg *Func) const override;
-  static bool classof(const Inst *Inst) { return isClassof(Inst, Ldr); }
-private:
-  InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
-               CondARM32::Cond Predicate);
-};
 /// Pop into a list of GPRs. Technically this can be predicated, but we don't
 /// need that functionality.
 class InstARM32Pop : public InstARM32 {
@@ -1003,8 +1097,12 @@ private:
 // already have default implementations.  Without this, there is the
 // possibility of ODR violations and link errors.
+template <> void InstARM32Ldr::emit(const Cfg *Func) const;
+template <> void InstARM32Mov::emit(const Cfg *Func) const;
 template <> void InstARM32Movw::emit(const Cfg *Func) const;
 template <> void InstARM32Movt::emit(const Cfg *Func) const;
+template <> void InstARM32Vldr::emit(const Cfg *Func) const;
+template <> void InstARM32Vmov::emit(const Cfg *Func) const;
 } // end of namespace Ice

--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -21,13 +21,13 @@
 namespace Ice {
-namespace RegARM32 {
+class RegARM32 {
+public:
-/// An enum of every register. The enum value may not match the encoding
+  /// An enum of every register. The enum value may not match the encoding
-/// used to binary encode register operands in instructions.
+  /// used to binary encode register operands in instructions.
-enum AllRegisters {
+  enum AllRegisters {
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
  val,
    REGARM32_TABLE
 #undef X
@@ -35,28 +35,76 @@ enum AllRegisters {
 #define X(val, init) val init,
    REGARM32_TABLE_BOUNDS
 #undef X
-};
+  };
-/// An enum of GPR Registers. The enum value does match the encoding used
+  /// An enum of GPR Registers. The enum value does match the encoding used
-/// to binary encode register operands in instructions.
+  /// to binary encode register operands in instructions.
-enum GPRRegister {
+  enum GPRRegister {
 #define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
-          isFP)                                                                \
+          isFP32, isFP64, isVec128)                                            \
-  Encoded_##val encode,
+  Encoded_##val = encode,
    REGARM32_GPR_TABLE
 #undef X
        Encoded_Not_GPR = -1
-};
+  };
+  /// An enum of FP32 S-Registers. The enum value does match the encoding used
+  /// to binary encode register operands in instructions.
+  enum SRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_FP32_TABLE
+#undef X
+        Encoded_Not_SReg = -1
+  };
+  /// An enum of FP64 D-Registers. The enum value does match the encoding used
+  /// to binary encode register operands in instructions.
+  enum DRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_FP64_TABLE
+#undef X
+        Encoded_Not_DReg = -1
+  };
-// TODO(jvoung): Floating point and vector registers...
+  /// An enum of 128-bit Q-Registers. The enum value does match the encoding
-// Need to model overlap and difference in encoding too.
+  /// used to binary encode register operands in instructions.
+  enum QRegister {
+#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt,    \
+          isFP32, isFP64, isVec128)                                            \
+  Encoded_##val = encode,
+    REGARM32_VEC128_TABLE
+#undef X
+        Encoded_Not_QReg = -1
+  };
-static inline GPRRegister getEncodedGPR(int32_t RegNum) {
+  static inline GPRRegister getEncodedGPR(int32_t RegNum) {
-  assert(Reg_GPR_First <= RegNum && RegNum <= Reg_GPR_Last);
+    assert(Reg_GPR_First <= RegNum);
+    assert(RegNum <= Reg_GPR_Last);
    return GPRRegister(RegNum - Reg_GPR_First);
-}
+  }
-} // end of namespace RegARM32
+  static inline SRegister getEncodedSReg(int32_t RegNum) {
+    assert(Reg_SREG_First <= RegNum);
+    assert(RegNum <= Reg_SREG_Last);
+    return SRegister(RegNum - Reg_SREG_First);
+  }
+  static inline DRegister getEncodedDReg(int32_t RegNum) {
+    assert(Reg_DREG_First <= RegNum);
+    assert(RegNum <= Reg_DREG_Last);
+    return DRegister(RegNum - Reg_DREG_First);
+  }
+  static inline QRegister getEncodedQReg(int32_t RegNum) {
+    assert(Reg_QREG_First <= RegNum);
+    assert(RegNum <= Reg_QREG_Last);
+    return QRegister(RegNum - Reg_QREG_First);
+  }
+};
 } // end of namespace Ice

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -190,7 +190,7 @@ protected:
  }
  void _adds(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Add::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -300,7 +300,7 @@ protected:
  }
  void _orrs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Orr::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -334,7 +334,7 @@ protected:
  }
  void _sbcs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -352,7 +352,7 @@ protected:
  }
  void _subs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
-    const bool SetFlags = true;
+    constexpr bool SetFlags = true;
    Context.insert(
        InstARM32Sub::create(Func, Dest, Src0, Src1, Pred, SetFlags));
  }
@@ -381,6 +381,41 @@ protected:
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred));
  }
+  void _vadd(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vadd::create(Func, Dest, Src0, Src1));
+  }
+  void _vdiv(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vdiv::create(Func, Dest, Src0, Src1));
+  }
+  void _vldr(Variable *Dest, OperandARM32Mem *Src,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vldr::create(Func, Dest, Src, Pred));
+  }
+  // There are a whole bunch of vmov variants, to transfer within
+  // S/D/Q registers, between core integer registers and S/D,
+  // and from small immediates into S/D.
+  // For integer -> S/D/Q there is a variant which takes two integer
+  // registers to fill a D, or to fill two consecutive S registers.
+  // Vmov can also be used to insert-element. E.g.,
+  //    "vmov.8 d0[1], r0"
+  // but insert-element is a "two-address" operation where only part of the
+  // register is modified. This cannot model that.
+  //
+  // This represents the simple single source, single dest variants only.
+  void _vmov(Variable *Dest, Operand *Src0) {
+    constexpr CondARM32::Cond Pred = CondARM32::AL;
+    Context.insert(InstARM32Vmov::create(Func, Dest, Src0, Pred));
+  }
+  void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1));
+  }
+  void _vsqrt(Variable *Dest, Variable *Src,
+              CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Vsqrt::create(Func, Dest, Src, Pred));
+  }
+  void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
+    Context.insert(InstARM32Vsub::create(Func, Dest, Src0, Src1));
+  }
  /// Run a pass through stack variables and ensure that the offsets are legal.
  /// If the offset is not legal, use a new base register that accounts for
@@ -417,16 +452,20 @@ protected:
    CallingConv &operator=(const CallingConv &) = delete;
  public:
-    CallingConv() : NumGPRRegsUsed(0) {}
+    CallingConv() {}
    ~CallingConv() = default;
    bool I64InRegs(std::pair<int32_t, int32_t> *Regs);
    bool I32InReg(int32_t *Reg);
+    bool FPInReg(Type Ty, int32_t *Reg);
    static constexpr uint32_t ARM32_MAX_GPR_ARG = 4;
+    // Units of S registers still available to S/D/Q arguments.
+    static constexpr uint32_t ARM32_MAX_FP_REG_UNITS = 16;
  private:
-    uint32_t NumGPRRegsUsed;
+    uint32_t NumGPRRegsUsed = 0;
+    uint32_t NumFPRegUnits = 0;
  };
 private:

--- a/tests_lit/llvm2ice_tests/fp.arith.ll
+++ b/tests_lit/llvm2ice_tests/fp.arith.ll
+; This tries to be a comprehensive test of f32 and f64 arith operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+; RUN: %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
+; RUN:   -i %s --args -Om1 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --need=allow_dump \
+; RUN:   --command FileCheck --check-prefix ARM32 %s
+define internal float @addFloat(float %a, float %b) {
+entry:
+  %add = fadd float %a, %b
+  ret float %add
+}
+; CHECK-LABEL: addFloat
+; CHECK: addss
+; CHECK: fld
+; ARM32-LABEL: addFloat
+; ARM32: vadd.f32 s{{[0-9]+}}, s
+define internal double @addDouble(double %a, double %b) {
+entry:
+  %add = fadd double %a, %b
+  ret double %add
+}
+; CHECK-LABEL: addDouble
+; CHECK: addsd
+; CHECK: fld
+; ARM32-LABEL: addDouble
+; ARM32: vadd.f64 d{{[0-9]+}}, d
+define internal float @subFloat(float %a, float %b) {
+entry:
+  %sub = fsub float %a, %b
+  ret float %sub
+}
+; CHECK-LABEL: subFloat
+; CHECK: subss
+; CHECK: fld
+; ARM32-LABEL: subFloat
+; ARM32: vsub.f32 s{{[0-9]+}}, s
+define internal double @subDouble(double %a, double %b) {
+entry:
+  %sub = fsub double %a, %b
+  ret double %sub
+}
+; CHECK-LABEL: subDouble
+; CHECK: subsd
+; CHECK: fld
+; ARM32-LABEL: subDouble
+; ARM32: vsub.f64 d{{[0-9]+}}, d
+define internal float @mulFloat(float %a, float %b) {
+entry:
+  %mul = fmul float %a, %b
+  ret float %mul
+}
+; CHECK-LABEL: mulFloat
+; CHECK: mulss
+; CHECK: fld
+; ARM32-LABEL: mulFloat
+; ARM32: vmul.f32 s{{[0-9]+}}, s
+define internal double @mulDouble(double %a, double %b) {
+entry:
+  %mul = fmul double %a, %b
+  ret double %mul
+}
+; CHECK-LABEL: mulDouble
+; CHECK: mulsd
+; CHECK: fld
+; ARM32-LABEL: mulDouble
+; ARM32: vmul.f64 d{{[0-9]+}}, d
+define internal float @divFloat(float %a, float %b) {
+entry:
+  %div = fdiv float %a, %b
+  ret float %div
+}
+; CHECK-LABEL: divFloat
+; CHECK: divss
+; CHECK: fld
+; ARM32-LABEL: divFloat
+; ARM32: vdiv.f32 s{{[0-9]+}}, s
+define internal double @divDouble(double %a, double %b) {
+entry:
+  %div = fdiv double %a, %b
+  ret double %div
+}
+; CHECK-LABEL: divDouble
+; CHECK: divsd
+; CHECK: fld
+; ARM32-LABEL: divDouble
+; ARM32: vdiv.f64 d{{[0-9]+}}, d
+define internal float @remFloat(float %a, float %b) {
+entry:
+  %div = frem float %a, %b
+  ret float %div
+}
+; CHECK-LABEL: remFloat
+; CHECK: call {{.*}} R_{{.*}} fmodf
+; ARM32-LABEL: remFloat
+; ARM32: bl {{.*}} fmodf
+define internal double @remDouble(double %a, double %b) {
+entry:
+  %div = frem double %a, %b
+  ret double %div
+}
+; CHECK-LABEL: remDouble
+; CHECK: call {{.*}} R_{{.*}} fmod
+; ARM32-LABEL: remDouble
+; ARM32: bl {{.*}} fmod
--- a/tests_lit/llvm2ice_tests/fp.call_ret.ll
+++ b/tests_lit/llvm2ice_tests/fp.call_ret.ll
+; This tries to be a comprehensive test of f32 and f64 call/return ops.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -O2 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; RUN: %if --need=target_X8632 --command %p2i --filetype=obj --disassemble \
+; RUN:   --target x8632 -i %s --args -Om1 \
+; RUN:   | %if --need=target_X8632 --command FileCheck %s
+; Can't test on ARM yet. Need to use several vpush {contiguous FP regs},
+; instead of push {any GPR list}.
+define internal i32 @doubleArgs(double %a, i32 %b, double %c) {
+entry:
+  ret i32 %b
+}
+; CHECK-LABEL: doubleArgs
+; CHECK:      mov eax,DWORD PTR [esp+0xc]
+; CHECK-NEXT: ret
+; ARM32-LABEL: doubleArgs
+define internal i32 @floatArgs(float %a, i32 %b, float %c) {
+entry:
+  ret i32 %b
+}
+; CHECK-LABEL: floatArgs
+; CHECK:      mov eax,DWORD PTR [esp+0x8]
+; CHECK-NEXT: ret
+define internal i32 @passFpArgs(float %a, double %b, float %c, double %d, float %e, double %f) {
+entry:
+  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double %b)
+  %call1 = call i32 @ignoreFpArgsNoInline(float %c, i32 123, double %d)
+  %call2 = call i32 @ignoreFpArgsNoInline(float %e, i32 123, double %f)
+  %add = add i32 %call1, %call
+  %add3 = add i32 %add, %call2
+  ret i32 %add3
+}
+; CHECK-LABEL: passFpArgs
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+declare i32 @ignoreFpArgsNoInline(float %x, i32 %y, double %z)
+define internal i32 @passFpConstArg(float %a, double %b) {
+entry:
+  %call = call i32 @ignoreFpArgsNoInline(float %a, i32 123, double 2.340000e+00)
+  ret i32 %call
+}
+; CHECK-LABEL: passFpConstArg
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: call {{.*}} R_{{.*}} ignoreFpArgsNoInline
+define internal i32 @passFp32ConstArg(float %a) {
+entry:
+  %call = call i32 @ignoreFp32ArgsNoInline(float %a, i32 123, float 2.0)
+  ret i32 %call
+}
+; CHECK-LABEL: passFp32ConstArg
+; CHECK: mov DWORD PTR [esp+0x4],0x7b
+; CHECK: movss DWORD PTR [esp+0x8]
+; CHECK: call {{.*}} R_{{.*}} ignoreFp32ArgsNoInline
+declare i32 @ignoreFp32ArgsNoInline(float %x, i32 %y, float %z)
+define internal float @returnFloatArg(float %a) {
+entry:
+  ret float %a
+}
+; CHECK-LABEL: returnFloatArg
+; CHECK: fld DWORD PTR [esp
+define internal double @returnDoubleArg(double %a) {
+entry:
+  ret double %a
+}
+; CHECK-LABEL: returnDoubleArg
+; CHECK: fld QWORD PTR [esp
+define internal float @returnFloatConst() {
+entry:
+  ret float 0x3FF3AE1480000000
+}
+; CHECK-LABEL: returnFloatConst
+; CHECK: fld
+define internal double @returnDoubleConst() {
+entry:
+  ret double 1.230000e+00
+}
+; CHECK-LABEL: returnDoubleConst
+; CHECK: fld
--- a/tests_lit/llvm2ice_tests/fp.cmp.ll
+++ b/tests_lit/llvm2ice_tests/fp.cmp.ll
--- a/tests_lit/llvm2ice_tests/fp.convert.ll
+++ b/tests_lit/llvm2ice_tests/fp.convert.ll
--- a/tests_lit/llvm2ice_tests/fp.load_store.ll
+++ b/tests_lit/llvm2ice_tests/fp.load_store.ll
+; This tries to be a comprehensive test of f32 and f64 load/store operations.
+; The CHECK lines are only checking for basic instruction patterns
+; that should be present regardless of the optimization level, so
+; there are no special OPTM1 match lines.
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
+define internal float @loadFloat(i32 %a) {
+entry:
+  %__1 = inttoptr i32 %a to float*
+  %v0 = load float, float* %__1, align 4
+  ret float %v0
+}
+; CHECK-LABEL: loadFloat
+; CHECK: movss
+; CHECK: fld
+define internal double @loadDouble(i32 %a) {
+entry:
+  %__1 = inttoptr i32 %a to double*
+  %v0 = load double, double* %__1, align 8
+  ret double %v0
+}
+; CHECK-LABEL: loadDouble
+; CHECK: movsd
+; CHECK: fld
+define internal void @storeFloat(i32 %a, float %value) {
+entry:
+  %__2 = inttoptr i32 %a to float*
+  store float %value, float* %__2, align 4
+  ret void
+}
+; CHECK-LABEL: storeFloat
+; CHECK: movss
+; CHECK: movss
+define internal void @storeDouble(i32 %a, double %value) {
+entry:
+  %__2 = inttoptr i32 %a to double*
+  store double %value, double* %__2, align 8
+  ret void
+}
+; CHECK-LABEL: storeDouble
+; CHECK: movsd
+; CHECK: movsd
+define internal void @storeFloatConst(i32 %a) {
+entry:
+  %a.asptr = inttoptr i32 %a to float*
+  store float 0x3FF3AE1480000000, float* %a.asptr, align 4
+  ret void
+}
+; CHECK-LABEL: storeFloatConst
+; CHECK: movss
+; CHECK: movss
+define internal void @storeDoubleConst(i32 %a) {
+entry:
+  %a.asptr = inttoptr i32 %a to double*
+  store double 1.230000e+00, double* %a.asptr, align 8
+  ret void
+}
+; CHECK-LABEL: storeDoubleConst
+; CHECK: movsd
+; CHECK: movsd
--- a/tests_lit/llvm2ice_tests/fp.pnacl.ll
+++ b/tests_lit/llvm2ice_tests/fp.pnacl.ll
--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -150,6 +150,11 @@ entry:
 ; CHECK: sqrtss xmm{{.*}}
 ; CHECK: sqrtss xmm{{.*}}
 ; CHECK: sqrtss xmm{{.*}},DWORD PTR
+; ARM32-LABEL: test_sqrt_float
+; ARM32: vsqrt.f32
+; ARM32: vsqrt.f32
+; ARM32: vsqrt.f32
+; ARM32: vadd.f32
 define float @test_sqrt_float_mergeable_load(float %x, i32 %iptr) {
 entry:
@@ -164,6 +169,9 @@ entry:
 ; current folding only handles load + arithmetic op. The sqrt inst
 ; is considered an intrinsic call and not an arithmetic op.
 ; CHECK: sqrtss xmm{{.*}}
+; ARM32-LABEL: test_sqrt_float_mergeable_load
+; ARM32: vldr s{{.*}}
+; ARM32: vsqrt.f32
 define double @test_sqrt_double(double %x, i32 %iptr) {
 entry:
@@ -177,6 +185,11 @@ entry:
 ; CHECK: sqrtsd xmm{{.*}}
 ; CHECK: sqrtsd xmm{{.*}}
 ; CHECK: sqrtsd xmm{{.*}},QWORD PTR
+; ARM32-LABEL: test_sqrt_double
+; ARM32: vsqrt.f64
+; ARM32: vsqrt.f64
+; ARM32: vsqrt.f64
+; ARM32: vadd.f64
 define double @test_sqrt_double_mergeable_load(double %x, i32 %iptr) {
 entry:
@@ -188,6 +201,9 @@ entry:
 }
 ; CHECK-LABEL: test_sqrt_double_mergeable_load
 ; CHECK: sqrtsd xmm{{.*}}
+; ARM32-LABEL: test_sqrt_double_mergeable_load
+; ARM32: vldr d{{.*}}
+; ARM32: vsqrt.f64
 define float @test_sqrt_ignored(float %x, double %y) {
 entry: