Commit f6951fa3 by Nicolas Capens Committed by Nicolas Capens

Optimize common vector shuffle patterns for ARM32.

Use VDUP for replicating a single element. Use VZIP for interleaving vectors.
Use VMOV Dd, Dm for rearranging quadword vectors.

Bug: b/67106219
Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
parent 416cfb9f
...@@ -3418,6 +3418,97 @@ void AssemblerARM32::vmlap(Type ElmtTy, const Operand *OpQd, ...@@ -3418,6 +3418,97 @@ void AssemblerARM32::vmlap(Type ElmtTy, const Operand *OpQd,
emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy); emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
} }
void AssemblerARM32::vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
IValueT Idx) {
// Broadcast ("duplicate") scalar element <Idx> of the vector in OpQn into
// every lane of the destination vector OpQd.
//
// VDUP (scalar) - ARMv7-A/R section A8.6.302, encoding A1:
// VDUP<c>.<size> <Qd>, <Dm[x]>
//
// 111100111D11iiiidddd11000QM0mmmm where Ddddd=<Qd>, Mmmmm=<Dm>, and
// iiii=imm4 encodes <size> and [x].
constexpr const char *Vdup = "vdup";
const IValueT VdupOpcode = B25 | B24 | B23 | B21 | B20 | B11 | B10;
// Each Q register maps onto a pair of consecutive D registers; Dd/Dn are
// the first (even-numbered) D register of each pair.
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vdup));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vdup));
constexpr bool UseQRegs = true;
// Float lanes are replicated as raw 32-bit data (shared i32/f32 case below).
constexpr bool IsFloatTy = false;
// imm4 encodes both the element size and the element index within a single
// D register:
//   i8 : imm4 = xxx1, index in bits [3:1] (8 elements per D register)
//   i16: imm4 = xx10, index in bits [3:2] (4 elements per D register)
//   i32: imm4 = x100, index in bit  [3]   (2 elements per D register)
IValueT Imm4 = 0;
// Lower selects which D register of the source Q pair contains element Idx.
bool Lower = true;
switch (ElmtTy) {
case IceType_i8:
assert(Idx < 16);
Lower = Idx < 8;
Imm4 = 1 | ((Idx & 0x7) << 1);
break;
case IceType_i16:
assert(Idx < 8);
Lower = Idx < 4;
Imm4 = 2 | ((Idx & 0x3) << 2);
break;
case IceType_i32:
case IceType_f32:
assert(Idx < 4);
Lower = Idx < 2;
Imm4 = 4 | ((Idx & 0x1) << 3);
break;
default:
assert(false && "vdup only supports 8, 16, and 32-bit elements");
break;
}
// Imm4 is passed in the Vn operand position (bits 19:16 of the encoding);
// the source D register is Dn for the lower half, Dn + 1 for the upper.
emitSIMDBase(VdupOpcode, Dd, Imm4, Dn + (Lower ? 0 : 1), UseQRegs, IsFloatTy);
}
void AssemblerARM32::vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) {
// Pseudo-instruction which interleaves ("zips") the elements of the lower
// halves of two quadword registers into the destination:
//   Qd = { Qn[0], Qm[0], Qn[1], Qm[1], ... }
// Vzip - ARMv7-A/R section A8.6.410, encoding A1:
// VZIP<c>.<size> <Dd>, <Dm>
//
// 111100111D11ss10dddd00011QM0mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
// ss=<size>
assert(ElmtTy != IceType_i64 && "vzip on i64 vector not allowed");
constexpr const char *Vzip = "vzip";
// Dd/Dn/Dm are the first (even) D register of each Q register pair.
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vzip));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vzip));
const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vzip));
constexpr bool UseQRegs = false;
constexpr bool IsFloatTy = false;
// VMOV Dd, Dm, encoded as VORR Dd, Dm, Dm:
// 111100100D10mmmmdddd0001MQM1mmmm
constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
// Copy lower half of second source to upper half of destination.
// (Safe even if Qd aliases Qm: the low half read here is consumed first.)
emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloatTy);
// Copy lower half of first source to lower half of destination; elide the
// move when they are already the same register.
if (Dd != Dn)
emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloatTy);
// The element size field <ss> occupies bits 19:18 of the encoding.
constexpr IValueT ElmtShift = 18;
const IValueT ElmtSize = encodeElmtType(ElmtTy);
assert(Utils::IsUint(2, ElmtSize));
if (ElmtTy != IceType_i32 && ElmtTy != IceType_f32) {
constexpr IValueT VzipOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B8 | B7;
// Zip the lower and upper half of destination.
emitSIMDBase(VzipOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
IsFloatTy);
} else {
// For 32-bit elements a doubleword VZIP is the same operation as VTRN
// (the ARM ARM treats VZIP.32 <Dd>, <Dm> as VTRN.32), so emit VTRN here.
constexpr IValueT VtrnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B7;
emitSIMDBase(VtrnOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
IsFloatTy);
}
}
void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn, void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) { const Operand *OpQm) {
// VMUL (floating-point) - ARM section A8.8.351, encoding A1: // VMUL (floating-point) - ARM section A8.8.351, encoding A1:
...@@ -3448,6 +3539,110 @@ void AssemblerARM32::vmvnq(const Operand *OpQd, const Operand *OpQm) { ...@@ -3448,6 +3539,110 @@ void AssemblerARM32::vmvnq(const Operand *OpQd, const Operand *OpQm) {
mapQRegToDReg(Qm), UseQRegs, IsFloat); mapQRegToDReg(Qm), UseQRegs, IsFloat);
} }
void AssemblerARM32::vmovlq(const Operand *OpQd, const Operand *OpQn,
                            const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the lower
  // half of the second operand into the lower half of the destination:
  //   Qd.lo = Qm.lo, Qd.hi = Qn.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- lower half of second source; a move into
  // the same register is elided.
  if (Dd != Dm)
    emitSIMDBase(VmovOpcode, Dd, Dm, Dm, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of first source.
  // (Dd + 1 != Dn + 1 simplifies to Dd != Dn.)
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovhq(const Operand *OpQd, const Operand *OpQn,
                            const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the high
  // half of the second operand into the high half of the destination:
  //   Qd.lo = Qn.lo, Qd.hi = Qm.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- lower half of first source; a move into
  // the same register is elided.
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of second source.
  // (Dd + 1 != Dm + 1 simplifies to Dd != Dm.)
  if (Dd != Dm)
    emitSIMDBase(VmovOpcode, Dd + 1, Dm + 1, Dm + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovhlq(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the high
  // half of the second operand into the lower half of the destination:
  //   Qd.lo = Qm.hi, Qd.hi = Qn.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- upper half of second source; a move into
  // the same register is elided.  (Dd is even and Dm + 1 odd, so this write
  // cannot clobber the Dn + 1 read below.)
  if (Dd != Dm + 1)
    emitSIMDBase(VmovOpcode, Dd, Dm + 1, Dm + 1, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of first source.
  // (Dd + 1 != Dn + 1 simplifies to Dd != Dn.)
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovlhq(const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) {
// Pseudo-instruction to copy the first source operand and insert the lower
// half of the second operand into the high half of the destination.
// VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
// VMOV<c> <Dd>, <Dm>
//
// 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
constexpr const char *Vmov = "vmov";
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
constexpr bool UseQRegs = false;
constexpr bool IsFloat = false;
const IValueT VmovOpcode = B25 | B21 | B8 | B4;
if (Dd + 1 != Dm)
emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloat);
if (Dd != Dn)
emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
}
void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd, void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
const Operand *OpQm) { const Operand *OpQm) {
// VNEG - ARM section A8.8.355, encoding A1: // VNEG - ARM section A8.8.355, encoding A1:
......
...@@ -546,6 +546,13 @@ public: ...@@ -546,6 +546,13 @@ public:
void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm); const Operand *OpQm);
// Vector element replication.
void vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, IValueT Idx);
// Vector interleave lower halves.
void vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm);
// Float vector multiply. // Float vector multiply.
void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
...@@ -554,6 +561,11 @@ public: ...@@ -554,6 +561,11 @@ public:
void vmvnq(const Operand *OpQd, const Operand *OpQm); void vmvnq(const Operand *OpQd, const Operand *OpQm);
void vmovlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovhlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovlhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vnegqs(const Operand *OpQd, const Operand *OpQm); void vnegqs(const Operand *OpQd, const Operand *OpQm);
void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm); void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
......
...@@ -997,35 +997,45 @@ public: ...@@ -997,35 +997,45 @@ public:
return Indexes[Pos]; return Indexes[Pos];
} }
inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t getIndexValue(SizeT Pos) const { return getIndex(Pos)->getValue(); }
int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3) const {
  // Returns true iff this 4-element shuffle has exactly the indexes i0..i3.
  static constexpr SizeT ExpectedNumElements = 4;
  assert(ExpectedNumElements == getNumIndexes());
  const int32_t Expected[ExpectedNumElements] = {i0, i1, i2, i3};
  for (SizeT Pos = 0; Pos < ExpectedNumElements; ++Pos) {
    if (getIndexValue(Pos) != Expected[Pos])
      return false;
  }
  return true;
}
bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
int32_t i5, int32_t i6, int32_t i7) const {
static constexpr SizeT ExpectedNumElements = 8; static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == getNumIndexes()); assert(ExpectedNumElements == getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7; getIndexValue(6) == i6 && getIndexValue(7) == i7;
} }
inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
int32_t i4, int32_t i5, int32_t i6, int32_t i7, int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i10, int32_t i11, int32_t i12, int32_t i13,
int32_t i12, int32_t i13, int32_t i14, int32_t i14, int32_t i15) const {
int32_t i15) const {
static constexpr SizeT ExpectedNumElements = 16; static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == getNumIndexes()); assert(ExpectedNumElements == getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 && getIndexValue(6) == i6 && getIndexValue(7) == i7 &&
getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 && getIndexValue(8) == i8 && getIndexValue(9) == i9 &&
getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 && getIndexValue(10) == i10 && getIndexValue(11) == i11 &&
getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 && getIndexValue(12) == i12 && getIndexValue(13) == i13 &&
getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15; getIndexValue(14) == i14 && getIndexValue(15) == i15;
} }
bool isMemoryWrite() const override { return false; } bool isMemoryWrite() const override { return false; }
......
...@@ -903,6 +903,82 @@ template <> void InstARM32Vmvn::emitIAS(const Cfg *Func) const { ...@@ -903,6 +903,82 @@ template <> void InstARM32Vmvn::emitIAS(const Cfg *Func) const {
} }
} }
template <> void InstARM32Vmovl::emitIAS(const Cfg *Func) const {
  // Encode the vmovl pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovlq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovlq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovh::emitIAS(const Cfg *Func) const {
  // Encode the vmovh pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovhq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovhq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovhl::emitIAS(const Cfg *Func) const {
  // Encode the vmovhl pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovhlq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovhlq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovlh::emitIAS(const Cfg *Func) const {
  // Encode the vmovlh pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovlhq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovlhq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const { template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest(); const Variable *Dest = getDest();
...@@ -1168,6 +1244,15 @@ template <> void InstARM32Vmlap::emitIAS(const Cfg *Func) const { ...@@ -1168,6 +1244,15 @@ template <> void InstARM32Vmlap::emitIAS(const Cfg *Func) const {
assert(!Asm->needsTextFixup()); assert(!Asm->needsTextFixup());
} }
template <> void InstARM32Vzip::emitIAS(const Cfg *Func) const {
  // Interleave the lower halves of the two sources into Dest.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = Dest->getType();
  Asm->vzip(typeElementType(DestTy), Dest, getSrc(0), getSrc(1));
  assert(!Asm->needsTextFixup());
}
template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const { template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest(); const Variable *Dest = getDest();
...@@ -1425,6 +1510,12 @@ InstARM32Vstr1::InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem, ...@@ -1425,6 +1510,12 @@ InstARM32Vstr1::InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem,
this->Size = Size; this->Size = Size;
} }
InstARM32Vdup::InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src,
IValueT Idx)
// One-source instruction: Dest receives element Idx of Src broadcast to
// every lane; always unconditional (CondARM32::AL).
: InstARM32Pred(Func, InstARM32::Vdup, 1, Dest, CondARM32::AL), Idx(Idx) {
addSource(Src);
}
InstARM32Trap::InstARM32Trap(Cfg *Func) InstARM32Trap::InstARM32Trap(Cfg *Func)
: InstARM32(Func, InstARM32::Trap, 0, nullptr) {} : InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
...@@ -1775,6 +1866,10 @@ template <> const char *InstARM32Vmla::Opcode = "vmla"; ...@@ -1775,6 +1866,10 @@ template <> const char *InstARM32Vmla::Opcode = "vmla";
template <> const char *InstARM32Vmls::Opcode = "vmls"; template <> const char *InstARM32Vmls::Opcode = "vmls";
template <> const char *InstARM32Vmul::Opcode = "vmul"; template <> const char *InstARM32Vmul::Opcode = "vmul";
template <> const char *InstARM32Vmvn::Opcode = "vmvn"; template <> const char *InstARM32Vmvn::Opcode = "vmvn";
template <> const char *InstARM32Vmovl::Opcode = "vmovl";
template <> const char *InstARM32Vmovh::Opcode = "vmovh";
template <> const char *InstARM32Vmovhl::Opcode = "vmovhl";
template <> const char *InstARM32Vmovlh::Opcode = "vmovlh";
template <> const char *InstARM32Vorr::Opcode = "vorr"; template <> const char *InstARM32Vorr::Opcode = "vorr";
template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg"; template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl"; template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
...@@ -1790,6 +1885,7 @@ template <> ...@@ -1790,6 +1885,7 @@ template <>
const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh"; const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh";
template <> template <>
const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap"; const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap";
template <> const char *InstARM32ThreeAddrFP<InstARM32::Vzip>::Opcode = "vzip";
// Four-addr ops // Four-addr ops
template <> const char *InstARM32Mla::Opcode = "mla"; template <> const char *InstARM32Mla::Opcode = "mla";
template <> const char *InstARM32Mls::Opcode = "mls"; template <> const char *InstARM32Mls::Opcode = "mls";
...@@ -2805,6 +2901,43 @@ void InstARM32Vstr1::dump(const Cfg *Func) const { ...@@ -2805,6 +2901,43 @@ void InstARM32Vstr1::dump(const Cfg *Func) const {
getSrc(0)->dump(Func); getSrc(0)->dump(Func);
} }
void InstARM32Vdup::emit(const Cfg *Func) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
Type Ty = getSrc(0)->getType();
const char *Opcode = "vdup";
Str << "\t" << Opcode;
Str << getPredicate() << "." << getWidthString(Ty) << getVecElmtBitsize(Ty);
Str << "\t";
getSrc(0)->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
Str << ", " << Idx;
}
void InstARM32Vdup::emitIAS(const Cfg *Func) const {
assert(getSrcSize() == 1);
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Operand *Dest = getDest();
const Operand *Src = getSrc(0);
Type DestTy = Dest->getType();
Asm->vdup(typeElementType(DestTy), Dest, Src, Idx);
}
void InstARM32Vdup::dump(const Cfg *Func) const {
// Debug dump: "<dest> = vdup.<ty> <sources>, <idx>".
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrDump();
dumpDest(Func);
Str << " = ";
dumpOpcodePred(Str, "vdup", getDest()->getType());
Str << " ";
dumpSources(Func);
// The replicated element index is carried on the instruction, not as a
// source operand, so it is printed separately.
Str << ", " << Idx;
}
void InstARM32Trap::emit(const Cfg *Func) const { void InstARM32Trap::emit(const Cfg *Func) const {
if (!BuildDefs::dump()) if (!BuildDefs::dump())
return; return;
...@@ -3386,6 +3519,7 @@ template class InstARM32LoadBase<InstARM32::Ldr>; ...@@ -3386,6 +3519,7 @@ template class InstARM32LoadBase<InstARM32::Ldr>;
template class InstARM32LoadBase<InstARM32::Ldrex>; template class InstARM32LoadBase<InstARM32::Ldrex>;
template class InstARM32LoadBase<InstARM32::Vldr1d>; template class InstARM32LoadBase<InstARM32::Vldr1d>;
template class InstARM32LoadBase<InstARM32::Vldr1q>; template class InstARM32LoadBase<InstARM32::Vldr1q>;
template class InstARM32ThreeAddrFP<InstARM32::Vzip>;
template class InstARM32TwoAddrGPR<InstARM32::Movt>; template class InstARM32TwoAddrGPR<InstARM32::Movt>;
template class InstARM32UnaryopGPR<InstARM32::Movw, false>; template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
......
...@@ -434,12 +434,17 @@ public: ...@@ -434,12 +434,17 @@ public:
Vcmp, Vcmp,
Vcvt, Vcvt,
Vdiv, Vdiv,
Vdup,
Veor, Veor,
Vldr1d, Vldr1d,
Vldr1q, Vldr1q,
Vmla, Vmla,
Vmlap, Vmlap,
Vmls, Vmls,
Vmovl,
Vmovh,
Vmovhl,
Vmovlh,
Vmrs, Vmrs,
Vmul, Vmul,
Vmulh, Vmulh,
...@@ -453,7 +458,8 @@ public: ...@@ -453,7 +458,8 @@ public:
Vshr, Vshr,
Vsqrt, Vsqrt,
Vstr1, Vstr1,
Vsub Vsub,
Vzip
}; };
static constexpr size_t InstSize = sizeof(uint32_t); static constexpr size_t InstSize = sizeof(uint32_t);
...@@ -1020,6 +1026,10 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>; ...@@ -1020,6 +1026,10 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>; using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>; using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>; using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
using InstARM32Vmovl = InstARM32ThreeAddrFP<InstARM32::Vmovl>;
using InstARM32Vmovh = InstARM32ThreeAddrFP<InstARM32::Vmovh>;
using InstARM32Vmovhl = InstARM32ThreeAddrFP<InstARM32::Vmovhl>;
using InstARM32Vmovlh = InstARM32ThreeAddrFP<InstARM32::Vmovlh>;
using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>; using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>; using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>; using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
...@@ -1036,6 +1046,7 @@ using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>; ...@@ -1036,6 +1046,7 @@ using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>; using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>; using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>; using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
using InstARM32Vzip = InstARM32ThreeAddrFP<InstARM32::Vzip>;
/// MovT leaves the bottom bits alone so dest is also a source. This helps /// MovT leaves the bottom bits alone so dest is also a source. This helps
/// indicate that a previous MovW setting dest is not dead code. /// indicate that a previous MovW setting dest is not dead code.
using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>; using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
...@@ -1374,6 +1385,30 @@ private: ...@@ -1374,6 +1385,30 @@ private:
SizeT Size; SizeT Size;
}; };
/// Vector element duplication/replication instruction: broadcasts one element
/// of the source vector register into every lane of the destination.
class InstARM32Vdup final : public InstARM32Pred {
InstARM32Vdup() = delete;
InstARM32Vdup(const InstARM32Vdup &) = delete;
InstARM32Vdup &operator=(const InstARM32Vdup &) = delete;
public:
/// Value must be a register.
/// \p Idx is the lane of \p Src to replicate into \p Dest.
static InstARM32Vdup *create(Cfg *Func, Variable *Dest, Variable *Src,
IValueT Idx) {
return new (Func->allocate<InstARM32Vdup>())
InstARM32Vdup(Func, Dest, Src, Idx);
}
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Instr) { return isClassof(Instr, Vdup); }
private:
InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, IValueT Idx);
// Lane index within the source vector that gets broadcast.
const IValueT Idx;
};
class InstARM32Trap : public InstARM32 { class InstARM32Trap : public InstARM32 {
InstARM32Trap() = delete; InstARM32Trap() = delete;
InstARM32Trap(const InstARM32Trap &) = delete; InstARM32Trap(const InstARM32Trap &) = delete;
......
...@@ -5357,7 +5357,7 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { ...@@ -5357,7 +5357,7 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Func->setError("Unexpected size for LoadSubVector"); Func->setError("Unexpected size for LoadSubVector");
return; return;
} }
_mov(Dest, T); // FIXME: necessary? _mov(Dest, T);
return; return;
} }
case Intrinsics::StoreSubVector: { case Intrinsics::StoreSubVector: {
...@@ -5975,8 +5975,121 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) { ...@@ -5975,8 +5975,121 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
const Type DestTy = Dest->getType(); const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy); auto *T = makeReg(DestTy);
auto *Src0 = Instr->getSrc(0);
auto *Src1 = Instr->getSrc(1);
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
bool Replicate = true;
for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
Replicate = false;
}
}
if (Replicate) {
Variable *Src0Var = legalizeToReg(Src0);
_vdup(T, Src0Var, Instr->getIndexValue(0));
_mov(Dest, T);
return;
}
switch (DestTy) { switch (DestTy) {
case IceType_v8i1:
case IceType_v8i16: {
static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
Variable *Src0R = legalizeToReg(Src0);
_vqmovn2(T, Src0R, Src0R, false, false);
_mov(Dest, T);
return;
}
} break;
case IceType_v16i1:
case IceType_v16i8: {
static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
23)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
} break;
case IceType_v4i1:
case IceType_v4i32:
case IceType_v4f32: {
static constexpr SizeT ExpectedNumElements = 4;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 4, 1, 5)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 1, 4, 5)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vmovlh(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(2, 3, 2, 3)) {
Variable *Src0R = legalizeToReg(Src0);
_vmovhl(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(2, 3, 6, 7)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vmovhl(T, Src1R, Src0R);
_mov(Dest, T);
return;
}
} break;
default: default:
break; break;
// TODO(jpp): figure out how to properly lower this without scalarization. // TODO(jpp): figure out how to properly lower this without scalarization.
...@@ -5984,10 +6097,6 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) { ...@@ -5984,10 +6097,6 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
// Unoptimized shuffle. Perform a series of inserts and extracts. // Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T); Context.insert<InstFakeDef>(T);
auto *Src0 = Instr->getSrc(0);
auto *Src1 = Instr->getSrc(1);
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) { for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I); auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue(); const SizeT Elem = Index->getValue();
......
...@@ -885,6 +885,9 @@ protected: ...@@ -885,6 +885,9 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred); Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred);
} }
void _vdup(Variable *Dest, Variable *Src, int Idx) {
// Broadcast element Idx of Src into every lane of Dest (VDUP).
Context.insert<InstARM32Vdup>(Dest, Src, Idx);
}
void _veor(Variable *Dest, Variable *Src0, Variable *Src1) { void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Veor>(Dest, Src0, Src1); Context.insert<InstARM32Veor>(Dest, Src0, Src1);
} }
...@@ -908,6 +911,18 @@ protected: ...@@ -908,6 +911,18 @@ protected:
void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) { void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmls>(Dest, Src0, Src1); Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
} }
void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src1.lo, Dest.hi = Src0.hi (see AssemblerARM32::vmovlq).
Context.insert<InstARM32Vmovl>(Dest, Src0, Src1);
}
void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src0.lo, Dest.hi = Src1.hi (see AssemblerARM32::vmovhq).
Context.insert<InstARM32Vmovh>(Dest, Src0, Src1);
}
void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src1.hi, Dest.hi = Src0.hi (see AssemblerARM32::vmovhlq).
Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1);
}
void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src0.lo, Dest.hi = Src1.lo (see AssemblerARM32::vmovlhq).
Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1);
}
void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) { void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmul>(Dest, Src0, Src1); Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
} }
...@@ -966,6 +981,9 @@ protected: ...@@ -966,6 +981,9 @@ protected:
void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) { void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vsub>(Dest, Src0, Src1); Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
} }
void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) {
// Interleave the elements of the lower halves of Src0 and Src1 into Dest:
// Dest = { Src0[0], Src1[0], Src0[1], Src1[1], ... }.
Context.insert<InstARM32Vzip>(Dest, Src0, Src1);
}
// Iterates over the CFG and determines the maximum outgoing stack arguments // Iterates over the CFG and determines the maximum outgoing stack arguments
// bytes. This information is later used during addProlog() to pre-allocate // bytes. This information is later used during addProlog() to pre-allocate
......
...@@ -6304,22 +6304,22 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6304,22 +6304,22 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
break; break;
} }
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
const SizeT Index4 = Instr->getIndex(4)->getValue(); const SizeT Index4 = Instr->getIndexValue(4);
const SizeT Index5 = Instr->getIndex(5)->getValue(); const SizeT Index5 = Instr->getIndexValue(5);
const SizeT Index6 = Instr->getIndex(6)->getValue(); const SizeT Index6 = Instr->getIndexValue(6);
const SizeT Index7 = Instr->getIndex(7)->getValue(); const SizeT Index7 = Instr->getIndexValue(7);
const SizeT Index8 = Instr->getIndex(8)->getValue(); const SizeT Index8 = Instr->getIndexValue(8);
const SizeT Index9 = Instr->getIndex(9)->getValue(); const SizeT Index9 = Instr->getIndexValue(9);
const SizeT Index10 = Instr->getIndex(10)->getValue(); const SizeT Index10 = Instr->getIndexValue(10);
const SizeT Index11 = Instr->getIndex(11)->getValue(); const SizeT Index11 = Instr->getIndexValue(11);
const SizeT Index12 = Instr->getIndex(12)->getValue(); const SizeT Index12 = Instr->getIndexValue(12);
const SizeT Index13 = Instr->getIndex(13)->getValue(); const SizeT Index13 = Instr->getIndexValue(13);
const SizeT Index14 = Instr->getIndex(14)->getValue(); const SizeT Index14 = Instr->getIndexValue(14);
const SizeT Index15 = Instr->getIndex(15)->getValue(); const SizeT Index15 = Instr->getIndexValue(15);
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7, Index3, Index4, Index5, Index6, Index7,
...@@ -6376,14 +6376,14 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6376,14 +6376,14 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
break; break;
} }
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
const SizeT Index4 = Instr->getIndex(4)->getValue(); const SizeT Index4 = Instr->getIndexValue(4);
const SizeT Index5 = Instr->getIndex(5)->getValue(); const SizeT Index5 = Instr->getIndexValue(5);
const SizeT Index6 = Instr->getIndex(6)->getValue(); const SizeT Index6 = Instr->getIndexValue(6);
const SizeT Index7 = Instr->getIndex(7)->getValue(); const SizeT Index7 = Instr->getIndexValue(7);
#define TO_BYTE_INDEX(I) ((I) << 1) #define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb( lowerShuffleVector_UsingPshufb(
...@@ -6403,10 +6403,10 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6403,10 +6403,10 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
case IceType_v4f32: { case IceType_v4f32: {
static constexpr SizeT ExpectedNumElements = 4; static constexpr SizeT ExpectedNumElements = 4;
assert(ExpectedNumElements == Instr->getNumIndexes()); assert(ExpectedNumElements == Instr->getNumIndexes());
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
Variable *T = nullptr; Variable *T = nullptr;
switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) { switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
#define CASE_SRCS_IN(S0, S1, S2, S3) \ #define CASE_SRCS_IN(S0, S1, S2, S3) \
...@@ -6611,8 +6611,7 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6611,8 +6611,7 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
InstExtractElement::create(Func, ExtElmt, Src0, Index)); InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else { } else {
lowerExtractElement(InstExtractElement::create( lowerExtractElement(InstExtractElement::create(
Func, ExtElmt, Src1, Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
Ctx->getConstantInt32(Index->getValue() - NumElements)));
} }
auto *NewT = makeReg(DestTy); auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt, lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment