Subzero. ARM32. Vector casts.

This CL un-scalarizes all vector cast operations in Subzero. BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076 R=eholk@chromium.org Review URL: https://codereview.chromium.org/1878943009 .

Subzero. ARM32. Vector casts.
e88c7deb · John Porto · 15e77d46 · e88c7deb · e88c7deb · e88c7deb
Commit e88c7deb authored Apr 14, 2016 by John Porto
14 changed files
--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -224,13 +224,17 @@ bool encodeAdvSIMDExpandImm(IValueT Value, Type ElmtTy, IValueT &Op,
    return false;
  Imm8 = Value;
  switch (ElmtTy) {
+  case IceType_i8:
+    Op = 0;
+    Cmode = 14; // 0b1110
+    return true;
  case IceType_i16:
    Op = 0;
-    Cmode = 8; // 100:0
+    Cmode = 8; // 0b1000
    return true;
  case IceType_i32:
    Op = 0;
-    Cmode = 0; // 000:0
+    Cmode = 0; // 0b0000
    return true;
  default:
    return false;
@@ -1215,6 +1219,33 @@ void AssemblerARM32::emitSIMDqqq(IValueT Opcode, Type ElmtTy,
                  isFloatingType(ElmtTy), OpcodeName);
 }
+void AssemblerARM32::emitSIMDShiftqqc(IValueT Opcode, const Operand *OpQd,
+                                      const Operand *OpQm, const IValueT Imm6,
+                                      const char *OpcodeName) {
+  // Emits a Q-register SIMD shift-by-immediate: Imm6 is OR-ed into Opcode
+  // starting at bit 16, and the Qn operand slot is passed as register 0.
+  // OpcodeName is only forwarded to encodeQRegister.
+  constexpr IValueT Imm6Shift = 16;
+  constexpr bool UseQRegs = true;
+  constexpr bool IsFloatTy = false;
+  const IValueT DestQ = encodeQRegister(OpQd, "Qd", OpcodeName);
+  const IValueT UnusedQn = 0;
+  const IValueT SrcQ = encodeQRegister(OpQm, "Qm", OpcodeName);
+  const IValueT ShiftOpcode = Opcode | (Imm6 << Imm6Shift);
+  emitSIMDBase(ShiftOpcode, mapQRegToDReg(DestQ), mapQRegToDReg(UnusedQn),
+               mapQRegToDReg(SrcQ), UseQRegs, IsFloatTy);
+}
+void AssemblerARM32::emitSIMDCvtqq(IValueT Opcode, const Operand *OpQd,
+                                   const Operand *OpQm,
+                                   const char *OpcodeName) {
+  // Emits a Q-register VCVT between f32 and 32-bit integer vectors. The fixed
+  // bits below are combined with the caller-supplied Opcode (the "op" field);
+  // the Qn operand slot is passed as register 0.
+  constexpr IValueT FixedBits =
+      B24 | B23 | B21 | B20 | B19 | B17 | B16 | B10 | B9;
+  constexpr bool UseQRegs = true;
+  constexpr bool IsFloatTy = false;
+  constexpr IValueT UnusedQn = 0;
+  const IValueT FullOpcode = FixedBits | Opcode;
+  const IValueT DestQ = encodeQRegister(OpQd, "Qd", OpcodeName);
+  const IValueT SrcQ = encodeQRegister(OpQm, "Qm", OpcodeName);
+  emitSIMDBase(FullOpcode, mapQRegToDReg(DestQ), mapQRegToDReg(UnusedQn),
+               mapQRegToDReg(SrcQ), UseQRegs, IsFloatTy);
+}
 void AssemblerARM32::emitVFPddd(CondARM32::Cond Cond, IValueT Opcode,
                                IValueT Dd, IValueT Dn, IValueT Dm) {
  assert(Dd < RegARM32::getNumDRegs());
@@ -2557,6 +2588,50 @@ void AssemblerARM32::vcvtus(const Operand *OpSd, const Operand *OpSm,
  emitVFPsss(Cond, VcvtsiOpcode, Sd, S0, Sm);
 }
+void AssemblerARM32::vcvtqsi(const Operand *OpQd, const Operand *OpQm) {
+  // VCVT (between floating-point and integer, Advanced SIMD)
+  //      - ARM Section A8.8.305, encoding A1:
+  //   vcvt<c>.s32.f32 <Qd>, <Qm>
+  //
+  // 111100111D11ss11dddd011ooQM0mmmm where Ddddd=Qd, Mmmmm=Qm, and 10=op.
+  constexpr IValueT OpBits = B8; // op field (bits 8:7) = 0b10
+  constexpr const char *Mnemonic = "vcvt.s32.f32";
+  emitSIMDCvtqq(OpBits, OpQd, OpQm, Mnemonic);
+}
+void AssemblerARM32::vcvtqsu(const Operand *OpQd, const Operand *OpQm) {
+  // VCVT (between floating-point and integer, Advanced SIMD)
+  //      - ARM Section A8.8.305, encoding A1:
+  //   vcvt<c>.u32.f32 <Qd>, <Qm>
+  //
+  // 111100111D11ss11dddd011ooQM0mmmm where Ddddd=Qd, Mmmmm=Qm, and 11=op.
+  constexpr IValueT OpBits = B8 | B7; // op field (bits 8:7) = 0b11
+  constexpr const char *Mnemonic = "vcvt.u32.f32";
+  emitSIMDCvtqq(OpBits, OpQd, OpQm, Mnemonic);
+}
+void AssemblerARM32::vcvtqis(const Operand *OpQd, const Operand *OpQm) {
+  // VCVT (between floating-point and integer, Advanced SIMD)
+  //      - ARM Section A8.8.305, encoding A1:
+  //   vcvt<c>.f32.s32 <Qd>, <Qm>
+  //
+  // 111100111D11ss11dddd011ooQM0mmmm where Ddddd=Qd, Mmmmm=Qm, and 00=op.
+  constexpr IValueT OpBits = 0; // op field (bits 8:7) = 0b00
+  constexpr const char *Mnemonic = "vcvt.f32.s32";
+  emitSIMDCvtqq(OpBits, OpQd, OpQm, Mnemonic);
+}
+void AssemblerARM32::vcvtqus(const Operand *OpQd, const Operand *OpQm) {
+  // VCVT (between floating-point and integer, Advanced SIMD)
+  //      - ARM Section A8.8.305, encoding A1:
+  //   vcvt<c>.f32.u32 <Qd>, <Qm>
+  //
+  // 111100111D11ss11dddd011ooQM0mmmm where Ddddd=Qd, Mmmmm=Qm, and 01=op.
+  constexpr IValueT OpBits = B7; // op field (bits 8:7) = 0b01
+  constexpr const char *Mnemonic = "vcvt.f32.u32";
+  emitSIMDCvtqq(OpBits, OpQd, OpQm, Mnemonic);
+}
 void AssemblerARM32::emitVFPds(CondARM32::Cond Cond, IValueT Opcode, IValueT Dd,
                               IValueT Sm) {
  assert(Dd < RegARM32::getNumDRegs());
@@ -3267,6 +3342,70 @@ void AssemblerARM32::vshlqi(Type ElmtTy, const Operand *OpQd,
  emitSIMDqqq(VshlOpcode, ElmtTy, OpQd, OpQn, OpQm, Vshl);
 }
+namespace {
+enum SIMDShiftType { ST_Vshl, ST_Vshr };
+// Computes the imm6 field for a SIMD shift-by-immediate of ElmtTy elements.
+// With W = element bit width, the result is masked to 2*W - 1. For ST_Vshl
+// the value (Imm - W) masked that way is congruent to W + Imm; for ST_Vshr it
+// is 2*W - Imm. Asserts that 0 < Imm < W and that ElmtTy is i8, i16 or i32.
+IValueT encodeSIMDShiftImm6(SIMDShiftType Shift, Type ElmtTy,
+                            const ConstantInteger32 *Imm6) {
+  const IValueT ShiftAmount = Imm6->getValue();
+  assert(ShiftAmount > 0);
+  const SizeT ElmtBits = getScalarIntBitWidth(ElmtTy);
+  assert(ShiftAmount < ElmtBits);
+  assert(ElmtTy == IceType_i8 || ElmtTy == IceType_i16 ||
+         ElmtTy == IceType_i32);
+  const IValueT LeftEncoding = ShiftAmount - ElmtBits;
+  const IValueT RightEncoding = 2 * ElmtBits - ShiftAmount;
+  const IValueT Selected = (Shift == ST_Vshl) ? LeftEncoding : RightEncoding;
+  return Selected & (2 * ElmtBits - 1);
+}
+} // end of anonymous namespace
+void AssemblerARM32::vshlqc(Type ElmtTy, const Operand *OpQd,
+                            const Operand *OpQm,
+                            const ConstantInteger32 *Imm6) {
+  // VSHL (immediate) - ARM section A8.8.395, encoding A1:
+  //   vshl Qd, Qm, #Imm
+  //
+  // 1111001U1Diiiiiidddd0101LQM1mmmm where Ddddd=Qd, Mmmmm=Qm, iiiiii=Imm6,
+  // 0=U, 1=Q, 0=L.
+  assert(isScalarIntegerType(ElmtTy) &&
+         "vshl expects vector with integer element type");
+  constexpr IValueT ShiftBits = B23 | B10 | B8 | B4;
+  constexpr const char *Mnemonic = "vshl";
+  const IValueT EncodedImm6 = encodeSIMDShiftImm6(ST_Vshl, ElmtTy, Imm6);
+  emitSIMDShiftqqc(ShiftBits, OpQd, OpQm, EncodedImm6, Mnemonic);
+}
+void AssemblerARM32::vshrqic(Type ElmtTy, const Operand *OpQd,
+                             const Operand *OpQm,
+                             const ConstantInteger32 *Imm6) {
+  // VSHR (signed) - ARM section A8.8.398, encoding A1:
+  //   vshr Qd, Qm, #Imm
+  //
+  // 1111001U1Diiiiiidddd0000LQM1mmmm where Ddddd=Qd, Mmmmm=Qm, iiiiii=Imm6,
+  // 0=U, 1=Q, 0=L.
+  assert(isScalarIntegerType(ElmtTy) &&
+         "vshr expects vector with integer element type");
+  constexpr IValueT ShiftBits = B23 | B4;
+  constexpr const char *Mnemonic = "vshr";
+  const IValueT EncodedImm6 = encodeSIMDShiftImm6(ST_Vshr, ElmtTy, Imm6);
+  emitSIMDShiftqqc(ShiftBits, OpQd, OpQm, EncodedImm6, Mnemonic);
+}
+void AssemblerARM32::vshrquc(Type ElmtTy, const Operand *OpQd,
+                             const Operand *OpQm,
+                             const ConstantInteger32 *Imm6) {
+  // VSHR (unsigned) - ARM section A8.8.398, encoding A1:
+  //   vshr Qd, Qm, #Imm
+  //
+  // 1111001U1Diiiiiidddd0000LQM1mmmm where Ddddd=Qd, Mmmmm=Qm, iiiiii=Imm6,
+  // 1=U, 1=Q, 0=L.
+  assert(isScalarIntegerType(ElmtTy) &&
+         "vshr expects vector with integer element type");
+  constexpr const char *Vshr = "vshr";
+  // B24 is the U bit: it must be set for the unsigned (logical) shift.
+  // Without it this would emit the same signed shift as vshrqic.
+  constexpr IValueT VshrOpcode = B24 | B23 | B4;
+  emitSIMDShiftqqc(VshrOpcode, OpQd, OpQm,
+                   encodeSIMDShiftImm6(ST_Vshr, ElmtTy, Imm6), Vshr);
+}
 void AssemblerARM32::vshlqu(Type ElmtTy, const Operand *OpQd,
                            const Operand *OpQm, const Operand *OpQn) {
  // VSHL - ARM section A8.8.396, encoding A1:

--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -381,6 +381,14 @@ public:
  // vcvt<c>.u32.f32
  void vcvtus(const Operand *OpSd, const Operand *OpSm, CondARM32::Cond Cond);
+  void vcvtqsi(const Operand *OpQd, const Operand *OpQm);
+  void vcvtqsu(const Operand *OpQd, const Operand *OpQm);
+  void vcvtqis(const Operand *OpQd, const Operand *OpQm);
+  void vcvtqus(const Operand *OpQd, const Operand *OpQm);
  void vdivd(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
             CondARM32::Cond Cond);
@@ -511,6 +519,15 @@ public:
  void vshlqu(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
              const Operand *OpQn);
+  // Vector shifts by an immediate amount (Imm6) using Q registers: vshlqc is a
+  // left shift, vshrqic a signed right shift, vshrquc an unsigned right shift.
+  void vshlqc(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+              const ConstantInteger32 *Imm6);
+  void vshrqic(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+               const ConstantInteger32 *Imm6);
+  void vshrquc(Type ElmtTy, const Operand *OpQd, const Operand *OpQm,
+               const ConstantInteger32 *Imm6);
  void vsqrtd(const Operand *OpDd, const Operand *OpDm, CondARM32::Cond Cond);
  void vsqrts(const Operand *OpSd, const Operand *OpSm, CondARM32::Cond Cond);
@@ -739,6 +756,20 @@ private:
                   const Operand *OpQn, const Operand *OpQm,
                   const char *OpcodeName);
+  // Implements various forms of vector (SIMD) shifts using Q registers.
+  // Implements pattern 111100101Diiiiiidddd010101M1mmmm where Dddd=Qd, Mmmm=Qm,
+  // iiiiii=Imm6, and Opcode is unioned into the pattern.
+  void emitSIMDShiftqqc(IValueT Opcode, const Operand *OpQd,
+                        const Operand *OpQm, const IValueT Imm6,
+                        const char *OpcodeName);
+  // Implements various forms of vector (SIMD) casts between (signed and
+  // unsigned) integer and floating point types (f32). Implements pattern
+  // 111100111D11ss11dddd011ooQM0mmmm where Dddd=Qd, Mmmm=Qm, 10=ss, op=00, 1=Q,
+  // and Opcode is unioned into the pattern.
+  void emitSIMDCvtqq(IValueT Opcode, const Operand *OpQd, const Operand *OpQm,
+                     const char *CvtName);
  // Pattern cccctttxxxxnnnn0000iiiiiiiiiiii where cccc=Cond, nnnn=Rn,
  // ttt=Instruction type (derived from OpSrc1), iiiiiiiiiiii is derived from
  // OpSrc1, and xxxx=Opcode.

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -835,10 +835,48 @@ template <> void InstARM32Vshl::emitIAS(const Cfg *Func) const {
    switch (Sign) {
    case InstARM32::FS_None: // defaults to unsigned.
    case InstARM32::FS_Unsigned:
+      if (const auto *Imm6 = llvm::dyn_cast<ConstantInteger32>(getSrc(1))) {
+        Asm->vshlqc(ElmtTy, Dest, getSrc(0), Imm6);
+      } else {
        Asm->vshlqu(ElmtTy, Dest, getSrc(0), getSrc(1));
+      }
      break;
    case InstARM32::FS_Signed:
+      if (const auto *Imm6 = llvm::dyn_cast<ConstantInteger32>(getSrc(1))) {
+        Asm->vshlqc(ElmtTy, Dest, getSrc(0), Imm6);
+      } else {
        Asm->vshlqi(ElmtTy, Dest, getSrc(0), getSrc(1));
+      }
+      break;
+    }
+  } break;
+  }
+}
+template <> void InstARM32Vshr::emitIAS(const Cfg *Func) const {
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  const Variable *Dest = getDest();
+  const Type DestTy = Dest->getType();
+  switch (DestTy) {
+  default:
+    llvm::report_fatal_error("Vshr not defined on type " +
+                             typeStdString(Dest->getType()));
+  case IceType_v4i1:
+  case IceType_v8i1:
+  case IceType_v16i1:
+  case IceType_v16i8:
+  case IceType_v8i16:
+  case IceType_v4i32: {
+    const Type ElmtTy = typeElementType(DestTy);
+    const auto *Imm6 = llvm::cast<ConstantInteger32>(getSrc(1));
+    assert(Sign != InstARM32::FS_None);
+    switch (Sign) {
+    case InstARM32::FS_None: // defaults to unsigned.
+    case InstARM32::FS_Unsigned:
+      Asm->vshrquc(ElmtTy, Dest, getSrc(0), Imm6);
+      break;
+    case InstARM32::FS_Signed:
+      Asm->vshrqic(ElmtTy, Dest, getSrc(0), Imm6);
      break;
    }
  } break;
@@ -1466,6 +1504,7 @@ template <> const char *InstARM32Vmul::Opcode = "vmul";
 template <> const char *InstARM32Vorr::Opcode = "vorr";
 template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
 template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
+template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshr>::Opcode = "vshr";
 template <> const char *InstARM32Vsub::Opcode = "vsub";
 // Four-addr ops
 template <> const char *InstARM32Mla::Opcode = "mla";
@@ -2452,6 +2491,14 @@ const char *vcvtVariantSuffix(const InstARM32Vcvt::VcvtVariant Variant) {
    return ".f64.f32";
  case InstARM32Vcvt::D2s:
    return ".f32.f64";
+  case InstARM32Vcvt::Vs2si:
+    return ".s32.f32";
+  case InstARM32Vcvt::Vs2ui:
+    return ".u32.f32";
+  case InstARM32Vcvt::Vsi2s:
+    return ".f32.s32";
+  case InstARM32Vcvt::Vui2s:
+    return ".f32.u32";
  }
  llvm::report_fatal_error("Invalid VcvtVariant enum.");
 }
@@ -2503,6 +2550,18 @@ void InstARM32Vcvt::emitIAS(const Cfg *Func) const {
  case D2s:
    Asm->vcvtsd(getDest(), getSrc(0), getPredicate());
    break;
+  case Vs2si:
+    Asm->vcvtqsi(getDest(), getSrc(0));
+    break;
+  case Vs2ui:
+    Asm->vcvtqsu(getDest(), getSrc(0));
+    break;
+  case Vsi2s:
+    Asm->vcvtqis(getDest(), getSrc(0));
+    break;
+  case Vui2s:
+    Asm->vcvtqus(getDest(), getSrc(0));
+    break;
  }
  assert(!Asm->needsTextFixup());
 }
@@ -2913,6 +2972,7 @@ template class InstARM32FourAddrFP<InstARM32::Vmls>;
 template class InstARM32ThreeAddrFP<InstARM32::Vmul>;
 template class InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
 template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vshl>;
+template class InstARM32ThreeAddrSignAwareFP<InstARM32::Vshr>;
 template class InstARM32ThreeAddrFP<InstARM32::Vsub>;
 template class InstARM32LoadBase<InstARM32::Ldr>;

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -438,6 +438,7 @@ public:
    Vneg,
    Vorr,
    Vshl,
+    Vshr,
    Vsqrt,
    Vsub
  };
@@ -822,12 +823,18 @@ public:
        InstARM32ThreeAddrSignAwareFP(Func, Dest, Src0, Src1);
  }
+  static InstARM32ThreeAddrSignAwareFP *
+  create(Cfg *Func, Variable *Dest, Variable *Src0, ConstantInteger32 *Src1) {
+    // Overload taking a ConstantInteger32 as the second source operand; the
+    // instruction is constructed in storage obtained from Func.
+    auto *Storage = Func->allocate<InstARM32ThreeAddrSignAwareFP>();
+    return new (Storage)
+        InstARM32ThreeAddrSignAwareFP(Func, Dest, Src0, Src1);
+  }
  void emitIAS(const Cfg *Func) const override;
  void setSignType(InstARM32::FPSign SignType) { this->Sign = SignType; }
 private:
  InstARM32ThreeAddrSignAwareFP(Cfg *Func, Variable *Dest, Variable *Src0,
-                                Variable *Src1)
+                                Operand *Src1)
      : InstARM32ThreeAddrFP<K>(Func, Dest, Src0, Src1) {}
 };
@@ -993,6 +1000,7 @@ using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
 using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
 using InstARM32Vorr = InstARM32ThreeAddrFP<InstARM32::Vorr>;
 using InstARM32Vshl = InstARM32ThreeAddrSignAwareFP<InstARM32::Vshl>;
+using InstARM32Vshr = InstARM32ThreeAddrSignAwareFP<InstARM32::Vshr>;
 using InstARM32Vsub = InstARM32ThreeAddrFP<InstARM32::Vsub>;
 using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
 using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
@@ -1358,7 +1366,22 @@ class InstARM32Vcvt final : public InstARM32Pred {
  InstARM32Vcvt &operator=(const InstARM32Vcvt &) = delete;
 public:
-  enum VcvtVariant { S2si, S2ui, Si2s, Ui2s, D2si, D2ui, Si2d, Ui2d, S2d, D2s };
+  enum VcvtVariant {
+    // Pre-existing variants (order is significant; do not reorder).
+    S2si, S2ui, Si2s, Ui2s,
+    D2si, D2ui, Si2d, Ui2d,
+    S2d, D2s,
+    // Vector variants, emitted through the Q-register vcvtq* assembler
+    // methods: f32 -> s32, f32 -> u32, s32 -> f32, u32 -> f32.
+    Vs2si, Vs2ui, Vsi2s, Vui2s,
+  };
  static InstARM32Vcvt *create(Cfg *Func, Variable *Dest, Variable *Src,
                               VcvtVariant Variant, CondARM32::Cond Predicate) {
    return new (Func->allocate<InstARM32Vcvt>())

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -894,6 +894,14 @@ protected:
  InstARM32Vshl *_vshl(Variable *Dest, Variable *Src0, Variable *Src1) {
    return Context.insert<InstARM32Vshl>(Dest, Src0, Src1);
  }
+  void _vshl(Variable *Dest, Variable *Src0, ConstantInteger32 *Src1) {
+    // Immediate-shift form: the inserted instruction is always tagged as
+    // unsigned.
+    auto *Shl = Context.insert<InstARM32Vshl>(Dest, Src0, Src1);
+    Shl->setSignType(InstARM32::FS_Unsigned);
+  }
+  InstARM32Vshr *_vshr(Variable *Dest, Variable *Src0,
+                       ConstantInteger32 *Src1) {
+    // Inserts a vector shift-right by an immediate; the sign type is not set
+    // here, so the caller can set it on the returned instruction.
+    auto *Shr = Context.insert<InstARM32Vshr>(Dest, Src0, Src1);
+    return Shr;
+  }
  void _vsqrt(Variable *Dest, Variable *Src,
              CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vsqrt>(Dest, Src, Pred);

--- a/tests_lit/assembler/arm32/cmp-vec.ll
+++ b/tests_lit/assembler/arm32/cmp-vec.ll
@@ -36,7 +36,7 @@ entry:
 define internal <4 x i32> @cmpEq4f32(<4 x float> %a, <4 x float> %b) {
 ; ASM-LABEL:cmpEq4f32:
-; DIS-LABEL:00000240 <cmpEq4f32>:
+; DIS-LABEL:00000180 <cmpEq4f32>:
 entry:
  %cmp = fcmp oeq <4 x float> %a, %b
@@ -45,7 +45,7 @@ entry:
 ; ASM:        vcmp.f32 s0, s1
 ; ASM:        vcmp.f32 s0, s1
 ; ASM:        vcmp.f32 s0, s1
-; DIS:  27c:  eeb40a60
+; DIS:  1bc:  eeb40a60
  %cmp.ret_ext = zext <4 x i1> %cmp to <4 x i32>
  ret <4 x i32> %cmp.ret_ext

--- a/tests_lit/assembler/arm32/vcvt.f32.s32.ll
+++ b/tests_lit/assembler/arm32/vcvt.f32.s32.ll
@@ -36,3 +36,17 @@ entry:
  ret float %v
 }
+define internal <4 x float> @IntVecToFloatVec(<4 x i32> %a) {
+; ASM-LABEL: IntVecToFloatVec:
+; DIS-LABEL: 00000030 <IntVecToFloatVec>:
+; IASM-LABEL: IntVecToFloatVec:
+  %v = sitofp <4 x i32> %a to <4 x float>
+; ASM:         vcvt.f32.s32    q0, q0
+; DIS:     40: f3bb0640
+; IASM-NOT:    vcvt.f32.s32
+  ret <4 x float> %v
+}
--- a/tests_lit/assembler/arm32/vcvt.f32.u32.ll
+++ b/tests_lit/assembler/arm32/vcvt.f32.u32.ll
@@ -36,3 +36,17 @@ entry:
  ret float %v
 }
+define internal <4 x float> @UIntVecToFloatVec(<4 x i32> %a) {
+; ASM-LABEL: UIntVecToFloatVec:
+; DIS-LABEL: 00000030 <UIntVecToFloatVec>:
+; IASM-LABEL: UIntVecToFloatVec:
+  %v = uitofp <4 x i32> %a to <4 x float>
+; ASM:         vcvt.f32.u32    q0, q0
+; DIS:     40: f3bb06c0
+; IASM-NOT:    vcvt.f32.u32
+  ret <4 x float> %v
+}
--- a/tests_lit/assembler/arm32/vcvt.s32.f32.ll
+++ b/tests_lit/assembler/arm32/vcvt.s32.f32.ll
@@ -36,3 +36,17 @@ entry:
  ret i32 %v
 }
+define internal <4 x i32> @FloatVecToIntVec(<4 x float> %a) {
+; ASM-LABEL: FloatVecToIntVec:
+; DIS-LABEL: 00000030 <FloatVecToIntVec>:
+; IASM-LABEL: FloatVecToIntVec:
+  %v = fptosi <4 x float> %a to <4 x i32>
+; ASM:         vcvt.s32.f32    q0, q0
+; DIS:     40: f3bb0740
+; IASM-NOT:    vcvt.s32.f32
+  ret <4 x i32> %v
+}
--- a/tests_lit/assembler/arm32/vcvt.u32.f32.ll
+++ b/tests_lit/assembler/arm32/vcvt.u32.f32.ll
@@ -35,3 +35,17 @@ entry:
  ret i32 %v
 }
+define internal <4 x i32> @FloatVecToUIntVec(<4 x float> %a) {
+; ASM-LABEL: FloatVecToUIntVec:
+; DIS-LABEL: 00000030 <FloatVecToUIntVec>:
+; IASM-LABEL: FloatVecToUIntVec:
+  %v = fptoui <4 x float> %a to <4 x i32>
+; ASM:         vcvt.u32.f32    q0, q0
+; DIS:     40: f3bb07c0
+; IASM-NOT:    vcvt.u32.f32
+  ret <4 x i32> %v
+}
--- a/tests_lit/assembler/arm32/vec-move.ll
+++ b/tests_lit/assembler/arm32/vec-move.ll
@@ -23,17 +23,19 @@
 ; RUN:   | FileCheck %s --check-prefix=DIS
-define internal <4 x float> @testMoveVector(<4 x i32> %a) {
+define internal <4 x float> @testMoveVector(<4 x i32> %a, <4 x i32> %b) {
 ; ASM-LABEL: testMoveVector:
 ; DIS-LABEL:{{.+}} <testMoveVector>:
 ; IASM-LABEL: testMoveVector:
 entry:
-  %0 = sitofp <4 x i32> %a to <4 x float>
+  %0 = bitcast <4 x i32> %b to <4 x float>
  ret <4 x float> %0
 ; ASM:  vmov.f32        q0, q1
-; DIS:  3c:     eef03a40
+; The integrated assembler emits a vorr instead of a vmov.
+; DIS:  0:     f2220152
 ; IASM-NOT: vmov.f32    q0, q1
+; IASM-NOT: vorr        q0, q1, q1
 }
--- a/tests_lit/assembler/arm32/vec-sh-imm.ll
+++ b/tests_lit/assembler/arm32/vec-sh-imm.ll
+; Show that we know how to translate vshl and vshr with immediate shift amounts.
+; We abuse sign extension of vectors of i1 because that's the only way to force
+; Subzero to emit these instructions.
+; NOTE: We use -O2 to get rid of memory stores.
+; REQUIRES: allow_dump
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=ASM
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -O2 \
+; RUN:   | FileCheck %s --check-prefix=IASM
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -O2 | FileCheck %s --check-prefix=DIS
+define internal <4 x i32> @SextV4I1(<4 x i32> %a) {
+; ASM-LABEL:SextV4I1
+; DIS-LABEL:00000000 <SextV4I1>:
+; IASM-LABEL:SextV4I1:
+  %trunc = trunc <4 x i32> %a to <4 x i1>
+  %sext = sext <4 x i1> %trunc to <4 x i32>
+  ret <4 x i32> %sext
+; ASM:         vshl.u32 {{.*}}, #31
+; ASM-NEXT:    vshr.s32 {{.*}}, #31
+; DIS:      0: f2bf0550
+; DIS-NEXT: 4: f2a10050
+; IASM-NOT:    vshl.u32 {{.*}}, #31
+; IASM-NOT:    vshr.s32 {{.*}}, #31
+}
+define internal <8 x i16> @SextV8I1(<8 x i16> %a) {
+; ASM-LABEL:SextV8I1
+; DIS-LABEL:00000010 <SextV8I1>:
+; IASM-LABEL:SextV8I1:
+  %trunc = trunc <8 x i16> %a to <8 x i1>
+  %sext = sext <8 x i1> %trunc to <8 x i16>
+  ret <8 x i16> %sext
+; ASM:      vshl.u16 {{.*}}, #15
+; ASM-NEXT: vshr.s16 {{.*}}, #15
+; DIS:      10: f29f0550
+; DIS-NEXT: 14: f2910050
+; IASM-NOT: vshl.u16 {{.*}}, #15
+; IASM-NOT: vshr.s16 {{.*}}, #15
+}
+define internal <16 x i8> @SextV16I1(<16 x i8> %a) {
+; ASM-LABEL:SextV16I1
+; DIS-LABEL:00000020 <SextV16I1>:
+; IASM-LABEL:SextV16I1:
+  %trunc = trunc <16 x i8> %a to <16 x i1>
+  %sext = sext <16 x i1> %trunc to <16 x i8>
+  ret <16 x i8> %sext
+; ASM:      vshl.u8 {{.*}}, #7
+; ASM-NEXT: vshr.s8 {{.*}}, #7
+; DIS:      20: f28f0550
+; DIS-NEXT: 24: f2890050
+; IASM-NOT: vshl.u8 {{.*}}, #7
+; IASM-NOT: vshr.s8 {{.*}}, #7
+}
--- a/tests_lit/llvm2ice_tests/vector-cast.ll
+++ b/tests_lit/llvm2ice_tests/vector-cast.ll
 ; This file tests casting / conversion operations that apply to vector types.
 ; bitcast operations are in vector-bitcast.ll.
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
+; RUN: %p2i -i %s --target=x8632 --filetype=obj --disassemble --args -O2 \
-; RUN: %p2i -i %s --filetype=obj --disassemble --args -Om1 | FileCheck %s
+; RUN:     | FileCheck %s --check-prefix=X8632 --check-prefix=CHECK
+; RUN: %p2i -i %s --target=x8632 --filetype=obj --disassemble --args -Om1 \
+; RUN:     | FileCheck %s --check-prefix=X8632 --check-prefix=CHECK
+; RUN: %p2i -i %s --target=arm32 --filetype=obj --disassemble --args -O2 \
+; RUN:     | FileCheck %s --check-prefix=ARM32 --check-prefix=CHECK
+; RUN: %p2i -i %s --target=arm32 --filetype=obj --disassemble --args -Om1 \
+; RUN:     | FileCheck %s --check-prefix=ARM32 --check-prefix=CHECK
 ; sext operations
@@ -12,12 +19,14 @@ entry:
  ret <16 x i8> %res
 ; CHECK-LABEL: test_sext_v16i1_to_v16i8
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqb
+; X8632: pcmpeqb
-; CHECK: psubb
+; X8632: psubb
-; CHECK: pand
+; X8632: pand
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpgtb
+; X8632: pcmpgtb
+; ARM32: vshl.s8
+; ARM32-NEXT: vshr.s8
 }
 define internal <8 x i16> @test_sext_v8i1_to_v8i16(<8 x i1> %arg) {
@@ -26,8 +35,10 @@ entry:
  ret <8 x i16> %res
 ; CHECK-LABEL: test_sext_v8i1_to_v8i16
-; CHECK: psllw {{.*}},0xf
+; X8632: psllw {{.*}},0xf
-; CHECK: psraw {{.*}},0xf
+; X8632: psraw {{.*}},0xf
+; ARM32: vshl.s16
+; ARM32-NEXT: vshr.s16
 }
 define internal <4 x i32> @test_sext_v4i1_to_v4i32(<4 x i1> %arg) {
@@ -36,8 +47,10 @@ entry:
  ret <4 x i32> %res
 ; CHECK-LABEL: test_sext_v4i1_to_v4i32
-; CHECK: pslld {{.*}},0x1f
+; X8632: pslld {{.*}},0x1f
-; CHECK: psrad {{.*}},0x1f
+; X8632: psrad {{.*}},0x1f
+; ARM32: vshl.s32
+; ARM32-NEXT: vshr.s32
 }
 ; zext operations
@@ -48,10 +61,12 @@ entry:
  ret <16 x i8> %res
 ; CHECK-LABEL: test_zext_v16i1_to_v16i8
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqb
+; X8632: pcmpeqb
-; CHECK: psubb
+; X8632: psubb
-; CHECK: pand
+; X8632: pand
+; ARM32: vmov.i8 [[S:.*]], #1
+; ARM32-NEXT: vand {{.*}}, [[S]]
 }
 define internal <8 x i16> @test_zext_v8i1_to_v8i16(<8 x i1> %arg) {
@@ -60,10 +75,12 @@ entry:
  ret <8 x i16> %res
 ; CHECK-LABEL: test_zext_v8i1_to_v8i16
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqw
+; X8632: pcmpeqw
-; CHECK: psubw
+; X8632: psubw
-; CHECK: pand
+; X8632: pand
+; ARM32: vmov.i16 [[S:.*]], #1
+; ARM32-NEXT: vand {{.*}}, [[S]]
 }
 define internal <4 x i32> @test_zext_v4i1_to_v4i32(<4 x i1> %arg) {
@@ -72,10 +89,12 @@ entry:
  ret <4 x i32> %res
 ; CHECK-LABEL: test_zext_v4i1_to_v4i32
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqd
+; X8632: pcmpeqd
-; CHECK: psubd
+; X8632: psubd
-; CHECK: pand
+; X8632: pand
+; ARM32: vmov.i32 [[S:.*]], #1
+; ARM32-NEXT: vand {{.*}}, [[S]]
 }
 ; trunc operations
@@ -86,10 +105,10 @@ entry:
  ret <16 x i1> %res
 ; CHECK-LABEL: test_trunc_v16i8_to_v16i1
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqb
+; X8632: pcmpeqb
-; CHECK: psubb
+; X8632: psubb
-; CHECK: pand
+; X8632: pand
 }
 define internal <8 x i1> @test_trunc_v8i16_to_v8i1(<8 x i16> %arg) {
@@ -98,10 +117,10 @@ entry:
  ret <8 x i1> %res
 ; CHECK-LABEL: test_trunc_v8i16_to_v8i1
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqw
+; X8632: pcmpeqw
-; CHECK: psubw
+; X8632: psubw
-; CHECK: pand
+; X8632: pand
 }
 define internal <4 x i1> @test_trunc_v4i32_to_v4i1(<4 x i32> %arg) {
@@ -110,10 +129,10 @@ entry:
  ret <4 x i1> %res
 ; CHECK-LABEL: test_trunc_v4i32_to_v4i1
-; CHECK: pxor
+; X8632: pxor
-; CHECK: pcmpeqd
+; X8632: pcmpeqd
-; CHECK: psubd
+; X8632: psubd
-; CHECK: pand
+; X8632: pand
 }
 ; fpto[us]i operations
@@ -124,7 +143,8 @@ entry:
  ret <4 x i32> %res
 ; CHECK-LABEL: test_fptosi_v4f32_to_v4i32
-; CHECK: cvttps2dq
+; X8632: cvttps2dq
+; ARM32: vcvt.s32.f32
 }
 define internal <4 x i32> @test_fptoui_v4f32_to_v4i32(<4 x float> %arg) {
@@ -133,7 +153,8 @@ entry:
  ret <4 x i32> %res
 ; CHECK-LABEL: test_fptoui_v4f32_to_v4i32
-; CHECK: call {{.*}} R_{{.*}} __Sz_fptoui_4xi32_f32
+; X8632: call {{.*}} R_{{.*}} __Sz_fptoui_4xi32_f32
+; ARM32: vcvt.u32.f32
 }
 ; [su]itofp operations
@@ -144,7 +165,8 @@ entry:
  ret <4 x float> %res
 ; CHECK-LABEL: test_sitofp_v4i32_to_v4f32
-; CHECK: cvtdq2ps
+; X8632: cvtdq2ps
+; ARM32: vcvt.f32.s32
 }
 define internal <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
@@ -153,5 +175,6 @@ entry:
  ret <4 x float> %res
 ; CHECK-LABEL: test_uitofp_v4i32_to_v4f32
-; CHECK: call {{.*}} R_{{.*}} __Sz_uitofp_4xi32_4xf32
+; X8632: call {{.*}} R_{{.*}} __Sz_uitofp_4xi32_4xf32
+; ARM32: vcvt.f32.u32
 }