Commit ba6a67c9 by John Porto

Subzero. Enables (most) crosstests for ARM32.

This patch enables many crosstests for ARM32. Very limited vector support is implemented (essentially, whatever it takes to compile the .ll files that contain vector operations.) Atomics as well as vector crosstests are still disabled. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1359193003 .
parent 188eae5c
...@@ -382,11 +382,10 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime ...@@ -382,11 +382,10 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
-e x8664,native,sse2 \ -e x8664,native,sse2 \
-e x8664,native,sse4.1,test_vector_ops \ -e x8664,native,sse4.1,test_vector_ops \
-e x8664,native,sse2,test_global \ -e x8664,native,sse2,test_global \
-i arm32,native,neon,simple_loop \ -i arm32,native,neon \
-i arm32,native,neon,mem_intrin \ -e arm32,native,neon,test_sync_atomic \
-i arm32,native,neon,test_bitmanip \ -e arm32,native,neon,test_vector_ops \
-i arm32,native,neon,test_stacksave \ -e arm32,native,neon,test_select
-i arm32,native,neon,test_strengthreduce
PNACL_BIN_PATH=$(PNACL_BIN_PATH) \ PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
$(LLVM_SRC_PATH)/utils/lit/lit.py -sv crosstest/Output $(LLVM_SRC_PATH)/utils/lit/lit.py -sv crosstest/Output
endif endif
......
...@@ -139,8 +139,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -139,8 +139,8 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
++Failures; ++Failures;
std::cout << "test" << Funcs[f].Name std::cout << "test" << Funcs[f].Name
<< (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1 << (CHAR_BIT * sizeof(TypeUnsigned)) << "(" << Value1
<< ", " << Value2 << "): sz=" << (unsigned)ResultSz << ", " << Value2 << "): sz=" << (uint64)ResultSz
<< " llc=" << (unsigned)ResultLlc << "\n"; << " llc=" << (uint64)ResultLlc << "\n";
} }
} }
} }
...@@ -154,6 +154,8 @@ const static size_t MaxTestsPerFunc = 100000; ...@@ -154,6 +154,8 @@ const static size_t MaxTestsPerFunc = 100000;
template <typename TypeUnsignedLabel, typename TypeSignedLabel> template <typename TypeUnsignedLabel, typename TypeSignedLabel>
void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned; typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned; typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
typedef typename Vectors<TypeUnsignedLabel>::ElementTy ElementTypeUnsigned; typedef typename Vectors<TypeUnsignedLabel>::ElementTy ElementTypeUnsigned;
...@@ -230,6 +232,7 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -230,6 +232,7 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
} }
#endif // ARM32
} }
template <typename Type> template <typename Type>
...@@ -305,6 +308,8 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -305,6 +308,8 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) { void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
static const float NegInf = -1.0 / 0.0; static const float NegInf = -1.0 / 0.0;
static const float PosInf = 1.0 / 0.0; static const float PosInf = 1.0 / 0.0;
static const float Nan = 0.0 / 0.0; static const float Nan = 0.0 / 0.0;
...@@ -363,6 +368,7 @@ void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -363,6 +368,7 @@ void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
} }
#endif // ARM32
} }
#ifdef X8664_STACK_HACK #ifdef X8664_STACK_HACK
......
...@@ -36,12 +36,12 @@ void caller_vvvvv(void) { ...@@ -36,12 +36,12 @@ void caller_vvvvv(void) {
CALL_AS_TYPE(callee_vvvvv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5); CALL_AS_TYPE(callee_vvvvv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5);
} }
void caller_vlvlivfvdviv(void) { void caller_vlvilvfvdviv(void) {
v4f32 arg1 = {0, 1, 2, 3}; v4f32 arg1 = {0, 1, 2, 3};
int64 arg2 = 4; int64 arg2 = 4;
v4f32 arg3 = {6, 7, 8, 9}; v4f32 arg3 = {6, 7, 8, 9};
int64 arg4 = 10; int arg4 = 10;
int arg5 = 11; int64 arg5 = 11;
v4f32 arg6 = {12, 13, 14, 15}; v4f32 arg6 = {12, 13, 14, 15};
float arg7 = 16; float arg7 = 16;
v4f32 arg8 = {17, 18, 19, 20}; v4f32 arg8 = {17, 18, 19, 20};
...@@ -50,7 +50,7 @@ void caller_vlvlivfvdviv(void) { ...@@ -50,7 +50,7 @@ void caller_vlvlivfvdviv(void) {
int arg11 = 26; int arg11 = 26;
v4f32 arg12 = {27, 28, 29, 30}; v4f32 arg12 = {27, 28, 29, 30};
CALL_AS_TYPE(callee_vlvlivfvdviv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5, CALL_AS_TYPE(callee_vlvilvfvdviv_Ty, Callee)(arg1, arg2, arg3, arg4, arg5,
arg6, arg7, arg8, arg9, arg10, arg6, arg7, arg8, arg9, arg10,
arg11, arg12); arg11, arg12);
} }
...@@ -66,6 +66,8 @@ void __attribute__((noinline)) callee_i(int arg1) { ...@@ -66,6 +66,8 @@ void __attribute__((noinline)) callee_i(int arg1) {
void __attribute__((noinline)) void __attribute__((noinline))
callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) { callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
switch (ArgNum) { switch (ArgNum) {
HANDLE_ARG(1); HANDLE_ARG(1);
HANDLE_ARG(2); HANDLE_ARG(2);
...@@ -73,24 +75,28 @@ callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) { ...@@ -73,24 +75,28 @@ callee_vvvvv(v4si32 arg1, v4si32 arg2, v4si32 arg3, v4si32 arg4, v4si32 arg5) {
HANDLE_ARG(4); HANDLE_ARG(4);
HANDLE_ARG(5); HANDLE_ARG(5);
} }
#endif // ARM32
} }
void __attribute__((noinline)) void __attribute__((noinline))
callee_vlvlivfvdviv(v4f32 arg1, int64 arg2, v4f32 arg3, int64 arg4, int arg5, callee_vlvilvfvdviv(v4f32 arg1, int64 arg2, v4f32 arg3, int arg4, int64 arg5,
v4f32 arg6, float arg7, v4f32 arg8, double arg9, v4f32 arg6, float arg7, v4f32 arg8, double arg9,
v4f32 arg10, int arg11, v4f32 arg12) { v4f32 arg10, int arg11, v4f32 arg12) {
switch (ArgNum) { switch (ArgNum) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
HANDLE_ARG(1); HANDLE_ARG(1);
HANDLE_ARG(2);
HANDLE_ARG(3); HANDLE_ARG(3);
HANDLE_ARG(6);
HANDLE_ARG(8);
HANDLE_ARG(10);
HANDLE_ARG(12);
#endif // ARM32
HANDLE_ARG(2);
HANDLE_ARG(4); HANDLE_ARG(4);
HANDLE_ARG(5); HANDLE_ARG(5);
HANDLE_ARG(6);
HANDLE_ARG(7); HANDLE_ARG(7);
HANDLE_ARG(8);
HANDLE_ARG(9); HANDLE_ARG(9);
HANDLE_ARG(10);
HANDLE_ARG(11); HANDLE_ARG(11);
HANDLE_ARG(12);
} }
} }
...@@ -20,7 +20,7 @@ ...@@ -20,7 +20,7 @@
/* caller, callee, argc */ \ /* caller, callee, argc */ \
X(caller_i, callee_i, 1) \ X(caller_i, callee_i, 1) \
X(caller_vvvvv, callee_vvvvv, 5) \ X(caller_vvvvv, callee_vvvvv, 5) \
X(caller_vlvlivfvdviv, callee_vlvlivfvdviv, 12) \ X(caller_vlvilvfvdviv, callee_vlvilvfvdviv, 12) \
// #define X(caller, callee, argc) // #define X(caller, callee, argc)
#endif // TEST_CALLING_CONV_DEF #endif // TEST_CALLING_CONV_DEF
...@@ -31,7 +31,7 @@ void caller_vvvvv(); ...@@ -31,7 +31,7 @@ void caller_vvvvv();
typedef void(callee_vvvvv_Ty)(v4si32, v4si32, v4si32, v4si32, v4si32); typedef void(callee_vvvvv_Ty)(v4si32, v4si32, v4si32, v4si32, v4si32);
callee_vvvvv_Ty callee_vvvvv; callee_vvvvv_Ty callee_vvvvv;
void caller_vlvlivfvdviv(); void caller_vlvilvfvdviv();
typedef void(callee_vlvlivfvdviv_Ty)(v4f32, int64, v4f32, int64, int, v4f32, typedef void(callee_vlvilvfvdviv_Ty)(v4f32, int64, v4f32, int, int64, v4f32,
float, v4f32, double, v4f32, int, v4f32); float, v4f32, double, v4f32, int, v4f32);
callee_vlvlivfvdviv_Ty callee_vlvlivfvdviv; callee_vlvilvfvdviv_Ty callee_vlvilvfvdviv;
...@@ -92,6 +92,8 @@ void testValue(FromType Val, size_t &TotalTests, size_t &Passes, ...@@ -92,6 +92,8 @@ void testValue(FromType Val, size_t &TotalTests, size_t &Passes,
template <typename FromType, typename ToType> template <typename FromType, typename ToType>
void testVector(size_t &TotalTests, size_t &Passes, size_t &Failures, void testVector(size_t &TotalTests, size_t &Passes, size_t &Failures,
const char *FromTypeString, const char *ToTypeString) { const char *FromTypeString, const char *ToTypeString) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
const static size_t NumElementsInType = Vectors<FromType>::NumElements; const static size_t NumElementsInType = Vectors<FromType>::NumElements;
PRNG Index; PRNG Index;
static const float NegInf = -1.0 / 0.0; static const float NegInf = -1.0 / 0.0;
...@@ -109,6 +111,7 @@ void testVector(size_t &TotalTests, size_t &Passes, size_t &Failures, ...@@ -109,6 +111,7 @@ void testVector(size_t &TotalTests, size_t &Passes, size_t &Failures,
} }
COMPARE_VEC(cast, FromType, ToType, Value, FromTypeString, ToTypeString); COMPARE_VEC(cast, FromType, ToType, Value, FromTypeString, ToTypeString);
} }
#endif // ARM32
} }
#ifdef X8664_STACK_HACK #ifdef X8664_STACK_HACK
......
...@@ -116,6 +116,8 @@ void testsScalar(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -116,6 +116,8 @@ void testsScalar(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) { void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
typedef v4si32 (*FuncTypeVector)(v4f32, v4f32); typedef v4si32 (*FuncTypeVector)(v4f32, v4f32);
static struct { static struct {
const char *Name; const char *Name;
...@@ -157,6 +159,7 @@ void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -157,6 +159,7 @@ void testsVector(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
} }
#endif // ARM32
} }
#ifdef X8664_STACK_HACK #ifdef X8664_STACK_HACK
......
...@@ -125,6 +125,8 @@ const static size_t MaxTestsPerFunc = 100000; ...@@ -125,6 +125,8 @@ const static size_t MaxTestsPerFunc = 100000;
template <typename TypeUnsignedLabel, typename TypeSignedLabel> template <typename TypeUnsignedLabel, typename TypeSignedLabel>
void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned; typedef typename Vectors<TypeUnsignedLabel>::Ty TypeUnsigned;
typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned; typedef typename Vectors<TypeSignedLabel>::Ty TypeSigned;
typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned); typedef TypeUnsigned (*FuncTypeUnsigned)(TypeUnsigned, TypeUnsigned);
...@@ -181,6 +183,7 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -181,6 +183,7 @@ void testsVecInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
} }
#endif // ARM32
} }
// Return true on wraparound // Return true on wraparound
...@@ -199,6 +202,8 @@ template <typename T> bool incrementI1Vector(typename Vectors<T>::Ty &Vect) { ...@@ -199,6 +202,8 @@ template <typename T> bool incrementI1Vector(typename Vectors<T>::Ty &Vect) {
template <typename T> template <typename T>
void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) { void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
#ifndef ARM32
// TODO(jpp): remove this once vector support is implemented.
typedef typename Vectors<T>::Ty Ty; typedef typename Vectors<T>::Ty Ty;
typedef Ty (*FuncType)(Ty, Ty); typedef Ty (*FuncType)(Ty, Ty);
static struct { static struct {
...@@ -266,6 +271,7 @@ void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -266,6 +271,7 @@ void testsVecI1(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
} }
#endif // ARM32
} }
#ifdef X8664_STACK_HACK #ifdef X8664_STACK_HACK
......
...@@ -122,7 +122,8 @@ def main(): ...@@ -122,7 +122,8 @@ def main():
bitcode_nonfinal = os.path.join(args.dir, base + '.' + key + '.bc') bitcode_nonfinal = os.path.join(args.dir, base + '.' + key + '.bc')
bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll') bitcode = os.path.join(args.dir, base + '.' + key + '.pnacl.ll')
shellcmd(['{bin}/pnacl-clang'.format(bin=bindir), shellcmd(['{bin}/pnacl-clang'.format(bin=bindir),
('-O2' if args.clang_opt else '-O0'), '-c', arg, ('-O2' if args.clang_opt else '-O0'),
('-DARM32' if args.target == 'arm32' else ''), '-c', arg,
'-o', bitcode_nonfinal]) '-o', bitcode_nonfinal])
shellcmd(['{bin}/pnacl-opt'.format(bin=bindir), shellcmd(['{bin}/pnacl-opt'.format(bin=bindir),
'-pnacl-abi-simplify-preopt', '-pnacl-abi-simplify-preopt',
...@@ -185,12 +186,16 @@ def main(): ...@@ -185,12 +186,16 @@ def main():
# configuration. In order to run the crosstests we play nasty, dangerous # configuration. In order to run the crosstests we play nasty, dangerous
# tricks with the stack pointer. # tricks with the stack pointer.
needs_stack_hack = (args.target == 'x8664') needs_stack_hack = (args.target == 'x8664')
stack_hack_params = [] target_params = []
if needs_stack_hack: if needs_stack_hack:
shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c ' shellcmd('{bin}/clang -g -o stack_hack.x8664.{key}.o -c '
'stack_hack.x8664.c'.format(bin=bindir, key=key)) 'stack_hack.x8664.c'.format(bin=bindir, key=key))
stack_hack_params.append('-DX8664_STACK_HACK') target_params.append('-DX8664_STACK_HACK')
stack_hack_params.append('stack_hack.x8664.{key}.o'.format(key=key)) target_params.append('stack_hack.x8664.{key}.o'.format(key=key))
if args.target == 'arm32':
target_params.append('-DARM32')
target_params.append('-static')
# Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++. # Set compiler to clang, clang++, pnacl-clang, or pnacl-clang++.
compiler = '{bin}/{prefix}{cc}'.format( compiler = '{bin}/{prefix}{cc}'.format(
...@@ -204,7 +209,7 @@ def main(): ...@@ -204,7 +209,7 @@ def main():
'-lm', '-lpthread', '-lm', '-lpthread',
'-Wl,--defsym=__Sz_AbsoluteZero=0'] + '-Wl,--defsym=__Sz_AbsoluteZero=0'] +
target_info.cross_headers) target_info.cross_headers)
shellcmd([compiler] + stack_hack_params + [args.driver] + objs + shellcmd([compiler] + target_params + [args.driver] + objs +
['-o', os.path.join(args.dir, args.output)] + sb_native_args) ['-o', os.path.join(args.dir, args.output)] + sb_native_args)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -350,23 +350,24 @@ ...@@ -350,23 +350,24 @@
// the # of offset bits allowed as part of an addressing mode (for sign or zero // the # of offset bits allowed as part of an addressing mode (for sign or zero
// extending load/stores). // extending load/stores).
#define ICETYPEARM32_TABLE \ #define ICETYPEARM32_TABLE \
/* tag, element type, int_width, vec_width, addr bits sext, zext */ \ /* tag, element type, int_width, vec_width, addr bits sext, zext, \
X(IceType_void, IceType_void, "" , "" , 0 , 0) \ reg-reg addr allowed */ \
X(IceType_i1, IceType_void, "b", "" , 8 , 12) \ X(IceType_void, IceType_void, "" , "" , 0 , 0 , 0) \
X(IceType_i8, IceType_void, "b", "" , 8 , 12) \ X(IceType_i1, IceType_void, "b", "" , 8 , 12, 1) \
X(IceType_i16, IceType_void, "h", "" , 8 , 8) \ X(IceType_i8, IceType_void, "b", "" , 8 , 12, 1) \
X(IceType_i32, IceType_void, "" , "" , 12, 12) \ X(IceType_i16, IceType_void, "h", "" , 8 , 8 , 1) \
X(IceType_i64, IceType_void, "d", "" , 8 , 8) \ X(IceType_i32, IceType_void, "" , "" , 12, 12, 1) \
X(IceType_f32, IceType_void, "" , ".f32", 10, 10) \ X(IceType_i64, IceType_void, "d", "" , 8 , 8 , 1) \
X(IceType_f64, IceType_void, "" , ".f64", 10, 10) \ X(IceType_f32, IceType_void, "" , ".f32", 8, 8 , 0) \
X(IceType_v4i1, IceType_i32 , "" , ".i32", 0 , 0) \ X(IceType_f64, IceType_void, "" , ".f64", 8, 8 , 0) \
X(IceType_v8i1, IceType_i16 , "" , ".i16", 0 , 0) \ X(IceType_v4i1, IceType_i32 , "" , ".i32", 0 , 0 , 1) \
X(IceType_v16i1, IceType_i8 , "" , ".i8" , 0 , 0) \ X(IceType_v8i1, IceType_i16 , "" , ".i16", 0 , 0 , 1) \
X(IceType_v16i8, IceType_i8 , "" , ".i8" , 0 , 0) \ X(IceType_v16i1, IceType_i8 , "" , ".i8" , 0 , 0 , 1) \
X(IceType_v8i16, IceType_i16 , "" , ".i16", 0 , 0) \ X(IceType_v16i8, IceType_i8 , "" , ".i8" , 0 , 0 , 1) \
X(IceType_v4i32, IceType_i32 , "" , ".i32", 0 , 0) \ X(IceType_v8i16, IceType_i16 , "" , ".i16", 0 , 0 , 1) \
X(IceType_v4f32, IceType_f32 , "" , ".f32", 0 , 0) X(IceType_v4i32, IceType_i32 , "" , ".i32", 0 , 0 , 1) \
//#define X(tag, elementty, int_width, vec_width, sbits, ubits) X(IceType_v4f32, IceType_f32 , "" , ".f32", 0 , 0 , 1)
//#define X(tag, elementty, int_width, vec_width, sbits, ubits, rraddr)
// Shifter types for Data-processing operands as defined in section A5.1.2. // Shifter types for Data-processing operands as defined in section A5.1.2.
#define ICEINSTARM32SHIFT_TABLE \ #define ICEINSTARM32SHIFT_TABLE \
......
...@@ -320,12 +320,11 @@ public: ...@@ -320,12 +320,11 @@ public:
Udiv, Udiv,
Umull, Umull,
Uxt, Uxt,
Vabs,
Vadd, Vadd,
Vcmp, Vcmp,
Vcvt, Vcvt,
Vdiv, Vdiv,
Vldr,
Vmov,
Vmrs, Vmrs,
Vmul, Vmul,
Vsqrt, Vsqrt,
...@@ -780,13 +779,6 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>; ...@@ -780,13 +779,6 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>; using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>; using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>;
using InstARM32Ldr = InstARM32Movlike<InstARM32::Ldr>; using InstARM32Ldr = InstARM32Movlike<InstARM32::Ldr>;
/// Move instruction (variable <- flex). This is more of a pseudo-inst. If var
/// is a register, then we use "mov". If var is stack, then we use "str" to
/// store to the stack.
using InstARM32Mov = InstARM32Movlike<InstARM32::Mov>;
/// Represents various vector mov instruction forms (simple single source,
/// single dest forms only, not the 2 GPR <-> 1 D reg forms, etc.).
using InstARM32Vldr = InstARM32Movlike<InstARM32::Vldr>;
/// MovT leaves the bottom bits alone so dest is also a source. This helps /// MovT leaves the bottom bits alone so dest is also a source. This helps
/// indicate that a previous MovW setting dest is not dead code. /// indicate that a previous MovW setting dest is not dead code.
using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>; using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
...@@ -1120,90 +1112,47 @@ private: ...@@ -1120,90 +1112,47 @@ private:
}; };
/// Handles (some of) vmov's various formats. /// Handles (some of) vmov's various formats.
class InstARM32Vmov final : public InstARM32Pred { class InstARM32Mov final : public InstARM32Pred {
InstARM32Vmov() = delete; InstARM32Mov() = delete;
InstARM32Vmov(const InstARM32Vmov &) = delete; InstARM32Mov(const InstARM32Mov &) = delete;
InstARM32Vmov &operator=(const InstARM32Vmov &) = delete; InstARM32Mov &operator=(const InstARM32Mov &) = delete;
public: public:
/// RegisterPair is used to group registers in static InstARM32Mov *create(Cfg *Func, Variable *Dest, Operand *Src,
///
/// vmov D, (R, R)
///
/// and
///
/// vmov (R, R), D
struct RegisterPair {
explicit RegisterPair(Variable *V0, Variable *V1) : _0(V0), _1(V1) {
assert(V0->getType() == IceType_i32);
assert(V1->getType() == IceType_i32);
}
Variable *_0;
Variable *_1;
};
static InstARM32Vmov *create(Cfg *Func, Variable *Dest, Operand *Src,
CondARM32::Cond Predicate) { CondARM32::Cond Predicate) {
return new (Func->allocate<InstARM32Vmov>()) return new (Func->allocate<InstARM32Mov>())
InstARM32Vmov(Func, Dest, Src, Predicate); InstARM32Mov(Func, Dest, Src, Predicate);
}
static InstARM32Vmov *create(Cfg *Func, const RegisterPair &Dests,
Variable *Src, CondARM32::Cond Predicate) {
return new (Func->allocate<InstARM32Vmov>())
InstARM32Vmov(Func, Dests, Src, Predicate);
}
static InstARM32Vmov *create(Cfg *Func, Variable *Dest,
const RegisterPair &Srcs,
CondARM32::Cond Predicate) {
return new (Func->allocate<InstARM32Vmov>())
InstARM32Vmov(Func, Dest, Srcs, Predicate);
} }
bool isRedundantAssign() const override { bool isRedundantAssign() const override {
return Dest1 == nullptr && getSrcSize() == 1 && return !isMultiDest() && !isMultiSource() &&
checkForRedundantAssign(getDest(), getSrc(0)); checkForRedundantAssign(getDest(), getSrc(0));
} }
bool isSimpleAssign() const override { return true; } bool isSimpleAssign() const override { return true; }
void emit(const Cfg *Func) const override; void emit(const Cfg *Func) const override;
void emitIAS(const Cfg *Func) const override; void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override; void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) { return isClassof(Inst, Vmov); } static bool classof(const Inst *Inst) { return isClassof(Inst, Mov); }
private:
InstARM32Vmov(Cfg *Func, Variable *Dest, Operand *Src,
CondARM32::Cond Predicate)
: InstARM32Pred(Func, InstARM32::Vmov, 1, Dest, Predicate) {
addSource(Src);
}
InstARM32Vmov(Cfg *Func, const RegisterPair &Dests, Variable *Src,
CondARM32::Cond Predicate)
: InstARM32Pred(Func, InstARM32::Vmov, 1, Dests._0, Predicate),
Dest1(Dests._1) {
addSource(Src);
}
InstARM32Vmov(Cfg *Func, Variable *Dest, const RegisterPair &Srcs,
CondARM32::Cond Predicate)
: InstARM32Pred(Func, InstARM32::Vmov, 2, Dest, Predicate) {
addSource(Srcs._0);
addSource(Srcs._1);
}
bool isMultiDest() const { bool isMultiDest() const {
assert(getDest() != nullptr); assert(getDest() != nullptr);
return Dest1 != nullptr; return llvm::isa<Variable64On32>(getDest());
} }
bool isMultiSource() const { bool isMultiSource() const {
assert(getSrcSize() >= 1); assert(getSrcSize() == 1);
return getSrcSize() > 1; return llvm::isa<Variable64On32>(getSrc(0));
}
private:
InstARM32Mov(Cfg *Func, Variable *Dest, Operand *Src,
CondARM32::Cond Predicate)
: InstARM32Pred(Func, InstARM32::Mov, 1, Dest, Predicate) {
addSource(Src);
} }
void emitMultiDestSingleSource(const Cfg *Func) const; void emitMultiDestSingleSource(const Cfg *Func) const;
void emitSingleDestMultiSource(const Cfg *Func) const; void emitSingleDestMultiSource(const Cfg *Func) const;
void emitSingleDestSingleSource(const Cfg *Func) const; void emitSingleDestSingleSource(const Cfg *Func) const;
Variable *Dest1 = nullptr;
}; };
class InstARM32Vcmp final : public InstARM32Pred { class InstARM32Vcmp final : public InstARM32Pred {
...@@ -1246,15 +1195,33 @@ private: ...@@ -1246,15 +1195,33 @@ private:
InstARM32Vmrs(Cfg *Func, CondARM32::Cond Predicate); InstARM32Vmrs(Cfg *Func, CondARM32::Cond Predicate);
}; };
class InstARM32Vabs final : public InstARM32Pred {
InstARM32Vabs() = delete;
InstARM32Vabs(const InstARM32Vabs &) = delete;
InstARM32Vabs &operator=(const InstARM32Vabs &) = delete;
public:
static InstARM32Vabs *create(Cfg *Func, Variable *Dest, Variable *Src,
CondARM32::Cond Predicate) {
return new (Func->allocate<InstARM32Vabs>())
InstARM32Vabs(Func, Dest, Src, Predicate);
}
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Inst) { return isClassof(Inst, Vabs); }
private:
InstARM32Vabs(Cfg *Func, Variable *Dest, Variable *Src,
CondARM32::Cond Predicate);
};
// Declare partial template specializations of emit() methods that already have // Declare partial template specializations of emit() methods that already have
// default implementations. Without this, there is the possibility of ODR // default implementations. Without this, there is the possibility of ODR
// violations and link errors. // violations and link errors.
template <> void InstARM32Ldr::emit(const Cfg *Func) const; template <> void InstARM32Ldr::emit(const Cfg *Func) const;
template <> void InstARM32Mov::emit(const Cfg *Func) const;
template <> void InstARM32Movw::emit(const Cfg *Func) const; template <> void InstARM32Movw::emit(const Cfg *Func) const;
template <> void InstARM32Movt::emit(const Cfg *Func) const; template <> void InstARM32Movt::emit(const Cfg *Func) const;
template <> void InstARM32Vldr::emit(const Cfg *Func) const;
} // end of namespace Ice } // end of namespace Ice
......
...@@ -189,7 +189,6 @@ protected: ...@@ -189,7 +189,6 @@ protected:
// The following are helpers that insert lowered ARM32 instructions with // The following are helpers that insert lowered ARM32 instructions with
// minimal syntactic overhead, so that the lowering code can look as close to // minimal syntactic overhead, so that the lowering code can look as close to
// assembly as practical. // assembly as practical.
void _add(Variable *Dest, Variable *Src0, Operand *Src1, void _add(Variable *Dest, Variable *Src0, Operand *Src1,
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, Pred)); Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, Pred));
...@@ -246,6 +245,10 @@ protected: ...@@ -246,6 +245,10 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred)); Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred));
} }
/// _ldr, for all your memory to Variable data moves. It handles all types
/// (integer, floating point, and vectors.) Addr needs to be valid for Dest's
/// type (e.g., no immediates for vector loads, and no index registers for fp
/// loads.)
void _ldr(Variable *Dest, OperandARM32Mem *Addr, void _ldr(Variable *Dest, OperandARM32Mem *Addr,
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Ldr::create(Func, Dest, Addr, Pred)); Context.insert(InstARM32Ldr::create(Func, Dest, Addr, Pred));
...@@ -266,14 +269,17 @@ protected: ...@@ -266,14 +269,17 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Mls::create(Func, Dest, Src0, Src1, Acc, Pred)); Context.insert(InstARM32Mls::create(Func, Dest, Src0, Src1, Acc, Pred));
} }
/// If Dest=nullptr is passed in, then a new variable is created, marked as /// _mov, for all your Variable to Variable data movement needs. It handles
/// infinite register allocation weight, and returned through the in/out Dest /// all types (integer, floating point, and vectors), as well as moves between
/// argument. /// Core and VFP registers. This is not a panacea: you must obey the (weird,
void _mov(Variable *&Dest, Operand *Src0, /// confusing, non-uniform) rules for data moves in ARM.
CondARM32::Cond Pred = CondARM32::AL, void _mov(Variable *Dest, Operand *Src0,
int32_t RegNum = Variable::NoRegister) { CondARM32::Cond Pred = CondARM32::AL) {
if (Dest == nullptr) // _mov used to be unique in the sense that it would create a temporary
Dest = makeReg(Src0->getType(), RegNum); // automagically if Dest was nullptr. It won't do that anymore, so we keep
// an assert around just in case there is some untested code path where Dest
// is nullptr.
assert(Dest != nullptr);
Context.insert(InstARM32Mov::create(Func, Dest, Src0, Pred)); Context.insert(InstARM32Mov::create(Func, Dest, Src0, Pred));
} }
void _mov_nonkillable(Variable *Dest, Operand *Src0, void _mov_nonkillable(Variable *Dest, Operand *Src0,
...@@ -348,6 +354,8 @@ protected: ...@@ -348,6 +354,8 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Sdiv::create(Func, Dest, Src0, Src1, Pred)); Context.insert(InstARM32Sdiv::create(Func, Dest, Src0, Src1, Pred));
} }
/// _str, for all your Variable to memory transfers. Addr has the same
/// restrictions that it does in _ldr.
void _str(Variable *Value, OperandARM32Mem *Addr, void _str(Variable *Value, OperandARM32Mem *Addr,
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Str::create(Func, Value, Addr, Pred)); Context.insert(InstARM32Str::create(Func, Value, Addr, Pred));
...@@ -387,6 +395,10 @@ protected: ...@@ -387,6 +395,10 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred)); Context.insert(InstARM32Uxt::create(Func, Dest, Src0, Pred));
} }
void _vabs(Variable *Dest, Variable *Src,
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Vabs::create(Func, Dest, Src, Pred));
}
void _vadd(Variable *Dest, Variable *Src0, Variable *Src1) { void _vadd(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert(InstARM32Vadd::create(Func, Dest, Src0, Src1)); Context.insert(InstARM32Vadd::create(Func, Dest, Src0, Src1));
} }
...@@ -397,10 +409,6 @@ protected: ...@@ -397,10 +409,6 @@ protected:
void _vdiv(Variable *Dest, Variable *Src0, Variable *Src1) { void _vdiv(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert(InstARM32Vdiv::create(Func, Dest, Src0, Src1)); Context.insert(InstARM32Vdiv::create(Func, Dest, Src0, Src1));
} }
void _vldr(Variable *Dest, OperandARM32Mem *Src,
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Vldr::create(Func, Dest, Src, Pred));
}
void _vcmp(Variable *Src0, Variable *Src1, void _vcmp(Variable *Src0, Variable *Src1,
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Vcmp::create(Func, Src0, Src1, Pred)); Context.insert(InstARM32Vcmp::create(Func, Src0, Src1, Pred));
...@@ -408,33 +416,6 @@ protected: ...@@ -408,33 +416,6 @@ protected:
void _vmrs(CondARM32::Cond Pred = CondARM32::AL) { void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Vmrs::create(Func, Pred)); Context.insert(InstARM32Vmrs::create(Func, Pred));
} }
// There are a whole bunch of vmov variants, to transfer within S/D/Q
// registers, between core integer registers and S/D, and from small
// immediates into S/D. For integer -> S/D/Q there is a variant which takes
// two integer register to fill a D, or to fill two consecutive S registers.
// Vmov can also be used to insert-element. E.g.,
// "vmov.8 d0[1], r0"
// but insert-element is a "two-address" operation where only part of the
// register is modified. This cannot model that.
//
// This represents the simple single source, single dest variants only.
void _vmov(Variable *Dest, Operand *Src0,
CondARM32::Cond Pred = CondARM32::AL) {
Context.insert(InstARM32Vmov::create(Func, Dest, Src0, Pred));
}
// This represents the single source, multi dest variant.
void _vmov(InstARM32Vmov::RegisterPair Dests, Variable *Src0) {
constexpr CondARM32::Cond Pred = CondARM32::AL;
Context.insert(InstARM32Vmov::create(Func, Dests, Src0, Pred));
// The Vmov instruction created above does not define Dests._1. Therefore
// we add a Dest._1 = FakeDef pseudo instruction.
Context.insert(InstFakeDef::create(Func, Dests._1));
}
// This represents the multi source, single dest variant.
void _vmov(Variable *Dest, InstARM32Vmov::RegisterPair Srcs) {
constexpr CondARM32::Cond Pred = CondARM32::AL;
Context.insert(InstARM32Vmov::create(Func, Dest, Srcs, Pred));
}
void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) { void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1)); Context.insert(InstARM32Vmul::create(Func, Dest, Src0, Src1));
} }
...@@ -451,10 +432,11 @@ protected: ...@@ -451,10 +432,11 @@ protected:
/// offset, such that the addressing mode offset bits are now legal. /// offset, such that the addressing mode offset bits are now legal.
void legalizeStackSlots(); void legalizeStackSlots();
/// Returns true if the given Offset can be represented in a stack ldr/str. /// Returns true if the given Offset can be represented in a stack ldr/str.
bool isLegalVariableStackOffset(int32_t Offset) const; bool isLegalVariableStackOffset(Type Ty, int32_t Offset) const;
/// Assuming Var needs its offset legalized, define a new base register /// Assuming Var needs its offset legalized, define a new base register
/// centered on the given Var's offset and use it. /// centered on the given Var's offset plus StackAdjust, and use it.
StackVariable *legalizeVariableSlot(Variable *Var, Variable *OrigBaseReg); StackVariable *legalizeVariableSlot(Variable *Var, int32_t StackAdjust,
Variable *OrigBaseReg);
TargetARM32Features CPUFeatures; TargetARM32Features CPUFeatures;
bool UsesFramePointer = false; bool UsesFramePointer = false;
......
...@@ -91,13 +91,11 @@ entry: ...@@ -91,13 +91,11 @@ entry:
; ARM32-LABEL: pass64BitArg ; ARM32-LABEL: pass64BitArg
; ARM32: sub sp, {{.*}} #16 ; ARM32: sub sp, {{.*}} #16
; ARM32: str {{.*}}, [sp, #4]
; ARM32: str {{.*}}, [sp] ; ARM32: str {{.*}}, [sp]
; ARM32: movw r2, #123 ; ARM32: movw r2, #123
; ARM32: bl {{.*}} ignore64BitArgNoInline ; ARM32: bl {{.*}} ignore64BitArgNoInline
; ARM32: add sp, {{.*}} #16 ; ARM32: add sp, {{.*}} #16
; ARM32: sub sp, {{.*}} #16 ; ARM32: sub sp, {{.*}} #16
; ARM32: str {{.*}}, [sp, #4]
; ARM32: str {{.*}}, [sp] ; ARM32: str {{.*}}, [sp]
; ARM32: {{mov|ldr}} r0 ; ARM32: {{mov|ldr}} r0
; ARM32: {{mov|ldr}} r1 ; ARM32: {{mov|ldr}} r1
...@@ -105,7 +103,6 @@ entry: ...@@ -105,7 +103,6 @@ entry:
; ARM32: bl {{.*}} ignore64BitArgNoInline ; ARM32: bl {{.*}} ignore64BitArgNoInline
; ARM32: add sp, {{.*}} #16 ; ARM32: add sp, {{.*}} #16
; ARM32: sub sp, {{.*}} #16 ; ARM32: sub sp, {{.*}} #16
; ARM32: str {{.*}}, [sp, #4]
; ARM32: str {{.*}}, [sp] ; ARM32: str {{.*}}, [sp]
; ARM32: {{mov|ldr}} r0 ; ARM32: {{mov|ldr}} r0
; ARM32: {{mov|ldr}} r1 ; ARM32: {{mov|ldr}} r1
...@@ -147,9 +144,9 @@ entry: ...@@ -147,9 +144,9 @@ entry:
; ARM32-LABEL: pass64BitConstArg ; ARM32-LABEL: pass64BitConstArg
; ARM32: sub sp, {{.*}} #16 ; ARM32: sub sp, {{.*}} #16
; ARM32: movw [[REG1:r.*]], {{.*}} ; 0xbeef ; ARM32: movw [[REG1:r.*]], {{.*}} ; 0xbeef
; ARM32: movt [[REG1:r.*]], {{.*}} ; 0xdead ; ARM32: movt [[REG1]], {{.*}} ; 0xdead
; ARM32: movw [[REG2:r.*]], {{.*}} ; 0x5678 ; ARM32: movw [[REG2:r.*]], {{.*}} ; 0x5678
; ARM32: movt [[REG2:r.*]], {{.*}} ; 0x1234 ; ARM32: movt [[REG2]], {{.*}} ; 0x1234
; ARM32: str [[REG1]], [sp, #4] ; ARM32: str [[REG1]], [sp, #4]
; ARM32: str [[REG2]], [sp] ; ARM32: str [[REG2]], [sp]
; ARM32: {{mov|ldr}} r0 ; ARM32: {{mov|ldr}} r0
...@@ -438,12 +435,13 @@ entry: ...@@ -438,12 +435,13 @@ entry:
; OPTM1: je ; OPTM1: je
; ARM32-LABEL: shl64BitSigned ; ARM32-LABEL: shl64BitSigned
; ARM32: sub [[REG3:r.*]], [[REG2:r.*]], #32 ; ARM32: rsb [[T0:r[0-9]+]], r2, #32
; ARM32: lsl [[REG1:r.*]], {{r.*}}, [[REG2]] ; ARM32: lsr [[T1:r[0-9]+]], r0, [[T0]]
; ARM32: orr [[REG1]], [[REG1]], [[REG0:r.*]], lsl [[REG3]] ; ARM32: orr [[T2:r[0-9]+]], [[T1]], r1, lsl r2
; ARM32: rsb [[REG4:r.*]], [[REG2]], #32 ; ARM32: sub [[T3:r[0-9]+]], r2, #32
; ARM32: orr [[REG1]], [[REG1]], [[REG0]], lsr [[REG4]] ; ARM32: cmp [[T3]], #0
; ARM32: lsl {{.*}}, [[REG0]], [[REG2]] ; ARM32: lslge [[T2]], r0, [[T3]]
; ARM32: lsl r{{[0-9]+}}, r0, r2
define internal i32 @shl64BitSignedTrunc(i64 %a, i64 %b) { define internal i32 @shl64BitSignedTrunc(i64 %a, i64 %b) {
entry: entry:
...@@ -484,11 +482,12 @@ entry: ...@@ -484,11 +482,12 @@ entry:
; OPTM1: je ; OPTM1: je
; ARM32-LABEL: shl64BitUnsigned ; ARM32-LABEL: shl64BitUnsigned
; ARM32: sub
; ARM32: lsl
; ARM32: orr
; ARM32: rsb ; ARM32: rsb
; ARM32: lsr
; ARM32: orr ; ARM32: orr
; ARM32: sub
; ARM32: cmp
; ARM32: lslge
; ARM32: lsl ; ARM32: lsl
define internal i64 @shr64BitSigned(i64 %a, i64 %b) { define internal i64 @shr64BitSigned(i64 %a, i64 %b) {
...@@ -511,12 +510,13 @@ entry: ...@@ -511,12 +510,13 @@ entry:
; OPTM1: sar {{.*}},0x1f ; OPTM1: sar {{.*}},0x1f
; ARM32-LABEL: shr64BitSigned ; ARM32-LABEL: shr64BitSigned
; ARM32: rsb ; ARM32: lsr [[T0:r[0-9]+]], r0, r2
; ARM32: lsr ; ARM32: rsb [[T1:r[0-9]+]], r2, #32
; ARM32: orr ; ARM32: orr r0, [[T0]], r1, lsl [[T1]]
; ARM32: subs ; ARM32: sub [[T2:r[0-9]+]], r2, #32
; ARM32: orrpl ; ARM32: cmp [[T2]], #0
; ARM32: asr ; ARM32: asrge r0, r1, [[T2]]
; ARM32: asr r{{[0-9]+}}, r1, r2
define internal i32 @shr64BitSignedTrunc(i64 %a, i64 %b) { define internal i32 @shr64BitSignedTrunc(i64 %a, i64 %b) {
entry: entry:
...@@ -538,11 +538,12 @@ entry: ...@@ -538,11 +538,12 @@ entry:
; OPTM1: sar {{.*}},0x1f ; OPTM1: sar {{.*}},0x1f
; ARM32-LABEL: shr64BitSignedTrunc ; ARM32-LABEL: shr64BitSignedTrunc
; ARM32: rsb
; ARM32: lsr ; ARM32: lsr
; ARM32: rsb
; ARM32: orr ; ARM32: orr
; ARM32: subs ; ARM32: sub
; ARM32: orrpl ; ARM32: cmp
; ARM32: asrge
define internal i64 @shr64BitUnsigned(i64 %a, i64 %b) { define internal i64 @shr64BitUnsigned(i64 %a, i64 %b) {
entry: entry:
...@@ -562,11 +563,12 @@ entry: ...@@ -562,11 +563,12 @@ entry:
; OPTM1: je ; OPTM1: je
; ARM32-LABEL: shr64BitUnsigned ; ARM32-LABEL: shr64BitUnsigned
; ARM32: rsb
; ARM32: lsr ; ARM32: lsr
; ARM32: rsb
; ARM32: orr ; ARM32: orr
; ARM32: sub ; ARM32: sub
; ARM32: orr ; ARM32: cmp
; ARM32: lsrge
; ARM32: lsr ; ARM32: lsr
define internal i32 @shr64BitUnsignedTrunc(i64 %a, i64 %b) { define internal i32 @shr64BitUnsignedTrunc(i64 %a, i64 %b) {
...@@ -588,11 +590,12 @@ entry: ...@@ -588,11 +590,12 @@ entry:
; OPTM1: je ; OPTM1: je
; ARM32-LABEL: shr64BitUnsignedTrunc ; ARM32-LABEL: shr64BitUnsignedTrunc
; ARM32: rsb
; ARM32: lsr ; ARM32: lsr
; ARM32: rsb
; ARM32: orr ; ARM32: orr
; ARM32: sub ; ARM32: sub
; ARM32: orr ; ARM32: cmp
; ARM32: lsrge
define internal i64 @and64BitSigned(i64 %a, i64 %b) { define internal i64 @and64BitSigned(i64 %a, i64 %b) {
entry: entry:
......
...@@ -54,7 +54,7 @@ entry: ...@@ -54,7 +54,7 @@ entry:
; ARM32-LABEL: cast_d2ll_const ; ARM32-LABEL: cast_d2ll_const
; ARM32-DAG: movw [[ADDR:r[0-9]+]], #:lower16:.L$ ; ARM32-DAG: movw [[ADDR:r[0-9]+]], #:lower16:.L$
; ARM32-DAG: movt [[ADDR]], #:upper16:.L$ ; ARM32-DAG: movt [[ADDR]], #:upper16:.L$
; ARM32-DAG: vldr [[DREG:d[0-9]+]], {{\[}}[[ADDR]], #0{{\]}} ; ARM32-DAG: vldr [[DREG:d[0-9]+]], {{\[}}[[ADDR]]{{\]}}
; ARM32: vmov r{{[0-9]+}}, r{{[0-9]+}}, [[DREG]] ; ARM32: vmov r{{[0-9]+}}, r{{[0-9]+}}, [[DREG]]
define internal double @cast_ll2d(i64 %ll) { define internal double @cast_ll2d(i64 %ll) {
......
...@@ -99,7 +99,7 @@ entry: ...@@ -99,7 +99,7 @@ entry:
; ARM32-LABEL: doubleToSigned32Const ; ARM32-LABEL: doubleToSigned32Const
; ARM32-DAG: movw [[ADDR:r[0-9]+]], #:lower16:.L$ ; ARM32-DAG: movw [[ADDR:r[0-9]+]], #:lower16:.L$
; ARM32-DAG: movt [[ADDR]], #:upper16:.L$ ; ARM32-DAG: movt [[ADDR]], #:upper16:.L$
; ARM32-DAG: vldr [[DREG:d[0-9]+]], {{\[}}[[ADDR]], #0{{\]}} ; ARM32-DAG: vldr [[DREG:d[0-9]+]], {{\[}}[[ADDR]]{{\]}}
; ARM32-DAG: vcvt.s32.f64 [[REG:s[0-9]+]], [[DREG]] ; ARM32-DAG: vcvt.s32.f64 [[REG:s[0-9]+]], [[DREG]]
; ARM32-DAF: vmov {{r[0-9]+}}, [[REG]] ; ARM32-DAF: vmov {{r[0-9]+}}, [[REG]]
......
...@@ -183,8 +183,8 @@ entry: ...@@ -183,8 +183,8 @@ entry:
; CHECK-NEXT: mov {{.*}} [esp+0x14] ; CHECK-NEXT: mov {{.*}} [esp+0x14]
; CHECK: ret ; CHECK: ret
; ARM32-LABEL: test_returning64_even_arg2 ; ARM32-LABEL: test_returning64_even_arg2
; ARM32-NEXT: ldr r0, [sp] ; ARM32-DAG: ldr r0, [sp]
; ARM32-NEXT: ldr r1, [sp, #4] ; ARM32-DAG: ldr r1, [sp, #4]
; ARM32-NEXT: bx lr ; ARM32-NEXT: bx lr
define i64 @test_returning64_even_arg2b(i64 %arg0, i32 %arg1, i32 %arg1b, i64 %arg2) { define i64 @test_returning64_even_arg2b(i64 %arg0, i32 %arg1, i32 %arg1b, i64 %arg2) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment