Commit d6cf6b38 by Eric Holk

ARM32 Vector lowering - scalarize select

With this change, we pass the select crosstest. Since this would have introduced a three-argument version of scalarizeInstruction, I decided to generalize it using templates.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/1683243003 .
parent b35615b4
...@@ -448,7 +448,8 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime ...@@ -448,7 +448,8 @@ check-xtest: $(OBJDIR)/pnacl-sz make_symlink runtime
-i x8664,sandbox,sse4.1,Om1 \ -i x8664,sandbox,sse4.1,Om1 \
-i arm32,neon \ -i arm32,neon \
-e arm32,neon,test_vector_ops \ -e arm32,neon,test_vector_ops \
-e arm32,neon,test_select -e arm32,nonsfi \
-e arm32,neon,test_vector_ops
PNACL_BIN_PATH=$(PNACL_BIN_PATH) \ PNACL_BIN_PATH=$(PNACL_BIN_PATH) \
$(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS) $(LLVM_SRC_PATH)/utils/lit/lit.py -sv $(CHECK_XTEST_TESTS)
endif endif
......
...@@ -313,6 +313,7 @@ TargetLowering::AutoBundle::~AutoBundle() { ...@@ -313,6 +313,7 @@ TargetLowering::AutoBundle::~AutoBundle() {
} }
void TargetLowering::genTargetHelperCalls() { void TargetLowering::genTargetHelperCalls() {
Utils::BoolFlagSaver _(GeneratingTargetHelpers, true);
for (CfgNode *Node : Func->getNodes()) { for (CfgNode *Node : Func->getNodes()) {
Context.init(Node); Context.init(Node);
while (!Context.atEnd()) { while (!Context.atEnd()) {
...@@ -711,10 +712,9 @@ void TargetLowering::scalarizeArithmetic(InstArithmetic::OpKind Kind, ...@@ -711,10 +712,9 @@ void TargetLowering::scalarizeArithmetic(InstArithmetic::OpKind Kind,
Variable *Dest, Operand *Src0, Variable *Dest, Operand *Src0,
Operand *Src1) { Operand *Src1) {
scalarizeInstruction( scalarizeInstruction(
Dest, Src0, Src1, Dest, [this, Kind](Variable *Dest, Operand *Src0, Operand *Src1) {
[this, Kind](Variable *Dest, Variable *Src0, Variable *Src1) {
return Context.insert<InstArithmetic>(Kind, Dest, Src0, Src1); return Context.insert<InstArithmetic>(Kind, Dest, Src0, Src1);
}); }, Src0, Src1);
} }
void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C, void TargetLowering::emitWithoutPrefix(const ConstantRelocatable *C,
......
...@@ -325,6 +325,10 @@ private: ...@@ -325,6 +325,10 @@ private:
// locking/unlocking) to prevent nested bundles. // locking/unlocking) to prevent nested bundles.
bool AutoBundling = false; bool AutoBundling = false;
/// This indicates whether we are in the genTargetHelperCalls phase, and
/// therefore can do things like scalarization.
bool GeneratingTargetHelpers = false;
// _bundle_lock(), and _bundle_unlock(), were made private to force subtargets // _bundle_lock(), and _bundle_unlock(), were made private to force subtargets
// to use the AutoBundle helper. // to use the AutoBundle helper.
void void
...@@ -469,39 +473,42 @@ protected: ...@@ -469,39 +473,42 @@ protected:
/// Generalizes scalarizeArithmetic to support other instruction types. /// Generalizes scalarizeArithmetic to support other instruction types.
/// ///
/// MakeInstruction is a function-like object with signature /// insertScalarInstruction is a function-like object with signature
/// (Variable *Dest, Variable *Src0, Variable *Src1) -> Instr *. /// (Variable *Dest, Variable *Src0, Variable *Src1) -> Instr *.
template <typename F> template <typename... Operands,
void scalarizeInstruction(Variable *Dest, Operand *Src0, Operand *Src1, typename F = std::function<Inst *(Variable *, Operands *...)>>
F &&MakeInstruction) { void scalarizeInstruction(Variable *Dest, F insertScalarInstruction,
Operands *... Srcs) {
assert(GeneratingTargetHelpers &&
"scalarizeInstruction called during incorrect phase");
const Type DestTy = Dest->getType(); const Type DestTy = Dest->getType();
assert(isVectorType(DestTy)); assert(isVectorType(DestTy));
const Type DestElementTy = typeElementType(DestTy); const Type DestElementTy = typeElementType(DestTy);
const SizeT NumElements = typeNumElements(DestTy); const SizeT NumElements = typeNumElements(DestTy);
const Type Src0ElementTy = typeElementType(Src0->getType());
const Type Src1ElementTy = typeElementType(Src1->getType());
assert(NumElements == typeNumElements(Src0->getType()));
assert(NumElements == typeNumElements(Src1->getType()));
Variable *T = Func->makeVariable(DestTy); Variable *T = Func->makeVariable(DestTy);
Context.insert<InstFakeDef>(T); Context.insert<InstFakeDef>(T);
for (SizeT I = 0; I < NumElements; ++I) { for (SizeT I = 0; I < NumElements; ++I) {
Constant *Index = Ctx->getConstantInt32(I); auto *Index = Ctx->getConstantInt32(I);
// Extract the next two inputs. auto makeExtractThunk = [this, Index, NumElements](Operand *Src) {
Variable *Op0 = Func->makeVariable(Src0ElementTy); return [this, Index, NumElements, Src]() {
Context.insert<InstExtractElement>(Op0, Src0, Index); assert(typeNumElements(Src->getType()) == NumElements);
Variable *Op1 = Func->makeVariable(Src1ElementTy);
Context.insert<InstExtractElement>(Op1, Src1, Index); const auto ElementTy = typeElementType(Src->getType());
auto *Op = Func->makeVariable(ElementTy);
Context.insert<InstExtractElement>(Op, Src, Index);
return Op;
};
};
// Perform the operation as a scalar operation. // Perform the operation as a scalar operation.
Variable *Res = Func->makeVariable(DestElementTy); auto *Res = Func->makeVariable(DestElementTy);
auto Arith = MakeInstruction(Res, Op0, Op1); auto *Arith = applyToThunkedArgs(insertScalarInstruction, Res,
// We might have created an operation that needed a helper call. makeExtractThunk(Srcs)...);
genTargetHelperCallFor(Arith); genTargetHelperCallFor(Arith);
// Insert the result into position.
Variable *DestT = Func->makeVariable(DestTy); Variable *DestT = Func->makeVariable(DestTy);
Context.insert<InstInsertElement>(DestT, T, Res, Index); Context.insert<InstInsertElement>(DestT, T, Res, Index);
T = DestT; T = DestT;
...@@ -509,38 +516,38 @@ protected: ...@@ -509,38 +516,38 @@ protected:
Context.insert<InstAssign>(Dest, T); Context.insert<InstAssign>(Dest, T);
} }
template <typename F> // applyToThunkedArgs is used by scalarizeInstruction. Ideally, we would just
void scalarizeUnaryInstruction(Variable *Dest, Operand *Src0, // call insertScalarInstruction(Res, Srcs...), but C++ does not specify
F &&MakeInstruction) { // evaluation order which means this leads to an unpredictable final
const Type DestTy = Dest->getType(); // output. Instead, we wrap each of the Srcs in a thunk and these
assert(isVectorType(DestTy)); // applyToThunkedArgs functions apply the thunks in a well defined order so we
const Type DestElementTy = typeElementType(DestTy); // still get well-defined output.
const SizeT NumElements = typeNumElements(DestTy); Inst *applyToThunkedArgs(
const Type Src0ElementTy = typeElementType(Src0->getType()); std::function<Inst *(Variable *, Variable *)> insertScalarInstruction,
Variable *Res, std::function<Variable *()> thunk0) {
assert(NumElements == typeNumElements(Src0->getType())); auto *Src0 = thunk0();
return insertScalarInstruction(Res, Src0);
Variable *T = Func->makeVariable(DestTy); }
Context.insert<InstFakeDef>(T);
for (SizeT I = 0; I < NumElements; ++I) {
Constant *Index = Ctx->getConstantInt32(I);
// Extract the next two inputs.
Variable *Op0 = Func->makeVariable(Src0ElementTy);
Context.insert<InstExtractElement>(Op0, Src0, Index);
// Perform the operation as a scalar operation. Inst *
Variable *Res = Func->makeVariable(DestElementTy); applyToThunkedArgs(std::function<Inst *(Variable *, Variable *, Variable *)>
auto Arith = MakeInstruction(Res, Op0); insertScalarInstruction,
// We might have created an operation that needed a helper call. Variable *Res, std::function<Variable *()> thunk0,
genTargetHelperCallFor(Arith); std::function<Variable *()> thunk1) {
auto *Src0 = thunk0();
auto *Src1 = thunk1();
return insertScalarInstruction(Res, Src0, Src1);
}
// Insert the result into position. Inst *applyToThunkedArgs(
Variable *DestT = Func->makeVariable(DestTy); std::function<Inst *(Variable *, Variable *, Variable *, Variable *)>
Context.insert<InstInsertElement>(DestT, T, Res, Index); insertScalarInstruction,
T = DestT; Variable *Res, std::function<Variable *()> thunk0,
} std::function<Variable *()> thunk1, std::function<Variable *()> thunk2) {
Context.insert<InstAssign>(Dest, T); auto *Src0 = thunk0();
auto *Src1 = thunk1();
auto *Src2 = thunk2();
return insertScalarInstruction(Res, Src0, Src1, Src2);
} }
/// SandboxType enumerates all possible sandboxing strategies that /// SandboxType enumerates all possible sandboxing strategies that
......
...@@ -592,10 +592,10 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) { ...@@ -592,10 +592,10 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
const InstCast::OpKind CastKind = CastInstr->getCastKind(); const InstCast::OpKind CastKind = CastInstr->getCastKind();
if (isVectorType(DestTy)) { if (isVectorType(DestTy)) {
scalarizeUnaryInstruction( scalarizeInstruction(
Dest, Src0, [this, CastKind](Variable *Dest, Variable *Src) { Dest, [this, CastKind](Variable *Dest, Variable *Src) {
return Context.insert<InstCast>(CastKind, Dest, Src); return Context.insert<InstCast>(CastKind, Dest, Src);
}); }, Src0);
CastInstr->setDeleted(); CastInstr->setDeleted();
return; return;
} }
...@@ -753,10 +753,11 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) { ...@@ -753,10 +753,11 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
auto *CmpInstr = llvm::cast<InstIcmp>(Instr); auto *CmpInstr = llvm::cast<InstIcmp>(Instr);
const auto Condition = CmpInstr->getCondition(); const auto Condition = CmpInstr->getCondition();
scalarizeInstruction( scalarizeInstruction(
Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1), Dest,
[this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) { [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
return Context.insert<InstIcmp>(Condition, Dest, Src0, Src1); return Context.insert<InstIcmp>(Condition, Dest, Src0, Src1);
}); },
CmpInstr->getSrc(0), CmpInstr->getSrc(1));
CmpInstr->setDeleted(); CmpInstr->setDeleted();
} }
return; return;
...@@ -768,14 +769,33 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) { ...@@ -768,14 +769,33 @@ void TargetARM32::genTargetHelperCallFor(Inst *Instr) {
auto *CmpInstr = llvm::cast<InstFcmp>(Instr); auto *CmpInstr = llvm::cast<InstFcmp>(Instr);
const auto Condition = CmpInstr->getCondition(); const auto Condition = CmpInstr->getCondition();
scalarizeInstruction( scalarizeInstruction(
Dest, CmpInstr->getSrc(0), CmpInstr->getSrc(1), Dest,
[this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) { [this, Condition](Variable *Dest, Variable *Src0, Variable *Src1) {
return Context.insert<InstFcmp>(Condition, Dest, Src0, Src1); return Context.insert<InstFcmp>(Condition, Dest, Src0, Src1);
}); },
CmpInstr->getSrc(0), CmpInstr->getSrc(1));
CmpInstr->setDeleted(); CmpInstr->setDeleted();
} }
return; return;
} }
case Inst::Select: {
Variable *Dest = Instr->getDest();
const auto DestTy = Dest->getType();
if (isVectorType(DestTy)) {
auto *SelectInstr = llvm::cast<InstSelect>(Instr);
scalarizeInstruction(Dest,
[this](Variable *Dest, Variable *Src0,
Variable *Src1, Variable *Src2) {
return Context.insert<InstSelect>(Dest, Src0, Src1,
Src2);
},
llvm::cast<Variable>(SelectInstr->getSrc(0)),
llvm::cast<Variable>(SelectInstr->getSrc(1)),
llvm::cast<Variable>(SelectInstr->getSrc(2)));
SelectInstr->setDeleted();
}
return;
}
} }
} }
......
...@@ -87,21 +87,7 @@ template <> struct PoolTypeConverter<uint8_t> { ...@@ -87,21 +87,7 @@ template <> struct PoolTypeConverter<uint8_t> {
namespace X86NAMESPACE { namespace X86NAMESPACE {
/// A helper class to ease the settings of RandomizationPoolingPause to disable using Utils::BoolFlagSaver;
/// constant blinding or pooling for some translation phases.
class BoolFlagSaver {
BoolFlagSaver() = delete;
BoolFlagSaver(const BoolFlagSaver &) = delete;
BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
public:
BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
~BoolFlagSaver() { Flag = OldValue; }
private:
const bool OldValue;
bool &Flag;
};
template <typename Traits> class BoolFoldingEntry { template <typename Traits> class BoolFoldingEntry {
BoolFoldingEntry(const BoolFoldingEntry &) = delete; BoolFoldingEntry(const BoolFoldingEntry &) = delete;
......
...@@ -123,6 +123,25 @@ template <typename T> static bool isPositiveZero(T Val) { ...@@ -123,6 +123,25 @@ template <typename T> static bool isPositiveZero(T Val) {
return Val == 0 && !std::signbit(Val); return Val == 0 && !std::signbit(Val);
} }
/// An RAII class to ensure that a boolean flag is restored to its previous
/// value upon function exit.
///
/// Used in places like RandomizationPoolingPause and generating target helper
/// calls.
class BoolFlagSaver {
BoolFlagSaver() = delete;
BoolFlagSaver(const BoolFlagSaver &) = delete;
BoolFlagSaver &operator=(const BoolFlagSaver &) = delete;
public:
BoolFlagSaver(bool &F, bool NewValue) : OldValue(F), Flag(F) { F = NewValue; }
~BoolFlagSaver() { Flag = OldValue; }
private:
const bool OldValue;
bool &Flag;
};
} // end of namespace Utils } // end of namespace Utils
} // end of namespace Ice } // end of namespace Ice
......
; Test that we handle select on vectors.
; TODO(eholk): This test will need to be updated once comparison is no longer
; scalarized.
; REQUIRES: allow_dump
; Compile using standalone assembler.
; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -O2 \
; RUN: | FileCheck %s --check-prefix=ASM
; Scalarized lowering of a <4 x float> select. For each lane, the lowering
; extracts the i1 condition lane into a core register (vmov.s8), tests its
; low bit (tst #1), and uses a predicated vmovne.f32 to choose between the
; corresponding lanes of %a and %b before inserting into the result vector.
define internal <4 x float> @select4float(<4 x i1> %s, <4 x float> %a,
<4 x float> %b) {
; ASM-LABEL:select4float:
; DIS-LABEL:00000000 <select4float>:
entry:
%res = select <4 x i1> %s, <4 x float> %a, <4 x float> %b
; Lane 0: condition from d0[0], operands s4/s8.
; ASM: # q3 = def.pseudo
; ASM-NEXT: vmov.s8 r0, d0[0]
; ASM-NEXT: vmov.f32 s16, s4
; ASM-NEXT: vmov.f32 s17, s8
; ASM-NEXT: tst r0, #1
; ASM-NEXT: vmovne.f32 s17, s16
; ASM-NEXT: vmov.f32 s12, s17
; Lane 1.
; ASM-NEXT: vmov.s8 r0, d0[1]
; ASM-NEXT: vmov.f32 s16, s5
; ASM-NEXT: vmov.f32 s17, s9
; ASM-NEXT: tst r0, #1
; ASM-NEXT: vmovne.f32 s17, s16
; ASM-NEXT: vmov.f32 s13, s17
; Lane 2.
; ASM-NEXT: vmov.s8 r0, d1[0]
; ASM-NEXT: vmov.f32 s16, s6
; ASM-NEXT: vmov.f32 s17, s10
; ASM-NEXT: tst r0, #1
; ASM-NEXT: vmovne.f32 s17, s16
; ASM-NEXT: vmov.f32 s14, s17
; Lane 3 reuses s4/s8 as scratch instead of the callee-saved s16/s17.
; ASM-NEXT: vmov.s8 r0, d1[1]
; ASM-NEXT: vmov.f32 s4, s7
; ASM-NEXT: vmov.f32 s8, s11
; ASM-NEXT: tst r0, #1
; ASM-NEXT: vmovne.f32 s8, s4
; ASM-NEXT: vmov.f32 s15, s8
; ASM-NEXT: vmov.f32 q0, q3
; ASM-NEXT: vpop {s16, s17}
; ASM-NEXT: # s16 = def.pseudo
; ASM-NEXT: # s17 = def.pseudo
; ASM-NEXT: bx lr
ret <4 x float> %res
}
; Scalarized lowering of a <4 x i32> select. Integer lanes go through core
; registers, so the predicated pick is a plain movne (not vmovne.f32):
; extract condition lane (vmov.s8), extract both operand lanes (vmov.32),
; tst #1, movne, then insert the winner back into q3.
define internal <4 x i32> @select4i32(<4 x i1> %s, <4 x i32> %a, <4 x i32> %b) {
; ASM-LABEL:select4i32:
; DIS-LABEL:00000000 <select4i32>:
entry:
%res = select <4 x i1> %s, <4 x i32> %a, <4 x i32> %b
; Lanes d0[0], d0[1], d1[0], d1[1] in order; operands come from d2/d3 (%a)
; and d4/d5 (%b), result accumulates in d6/d7 (= q3).
; ASM: # q3 = def.pseudo
; ASM-NEXT: vmov.s8 r0, d0[0]
; ASM-NEXT: vmov.32 r1, d2[0]
; ASM-NEXT: vmov.32 r2, d4[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.32 d6[0], r2
; ASM-NEXT: vmov.s8 r0, d0[1]
; ASM-NEXT: vmov.32 r1, d2[1]
; ASM-NEXT: vmov.32 r2, d4[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.32 d6[1], r2
; ASM-NEXT: vmov.s8 r0, d1[0]
; ASM-NEXT: vmov.32 r1, d3[0]
; ASM-NEXT: vmov.32 r2, d5[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.32 d7[0], r2
; ASM-NEXT: vmov.s8 r0, d1[1]
; ASM-NEXT: vmov.32 r1, d3[1]
; ASM-NEXT: vmov.32 r2, d5[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.32 d7[1], r2
; ASM-NEXT: vmov.i32 q0, q3
; ASM-NEXT: bx lr
ret <4 x i32> %res
}
; Scalarized lowering of an <8 x i16> select: same extract/tst/movne/insert
; pattern as the i32 case, repeated for 8 lanes using vmov.s16 extracts and
; vmov.16 inserts (lanes d0[0..3] then d1[0..3]).
define internal <8 x i16> @select8i16(<8 x i1> %s, <8 x i16> %a, <8 x i16> %b) {
; ASM-LABEL:select8i16:
; DIS-LABEL:00000000 <select8i16>:
entry:
%res = select <8 x i1> %s, <8 x i16> %a, <8 x i16> %b
; ASM: # q3 = def.pseudo
; ASM-NEXT: vmov.s8 r0, d0[0]
; ASM-NEXT: vmov.s16 r1, d2[0]
; ASM-NEXT: vmov.s16 r2, d4[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d6[0], r2
; ASM-NEXT: vmov.s8 r0, d0[1]
; ASM-NEXT: vmov.s16 r1, d2[1]
; ASM-NEXT: vmov.s16 r2, d4[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d6[1], r2
; ASM-NEXT: vmov.s8 r0, d0[2]
; ASM-NEXT: vmov.s16 r1, d2[2]
; ASM-NEXT: vmov.s16 r2, d4[2]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d6[2], r2
; ASM-NEXT: vmov.s8 r0, d0[3]
; ASM-NEXT: vmov.s16 r1, d2[3]
; ASM-NEXT: vmov.s16 r2, d4[3]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d6[3], r2
; ASM-NEXT: vmov.s8 r0, d1[0]
; ASM-NEXT: vmov.s16 r1, d3[0]
; ASM-NEXT: vmov.s16 r2, d5[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d7[0], r2
; ASM-NEXT: vmov.s8 r0, d1[1]
; ASM-NEXT: vmov.s16 r1, d3[1]
; ASM-NEXT: vmov.s16 r2, d5[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d7[1], r2
; ASM-NEXT: vmov.s8 r0, d1[2]
; ASM-NEXT: vmov.s16 r1, d3[2]
; ASM-NEXT: vmov.s16 r2, d5[2]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d7[2], r2
; ASM-NEXT: vmov.s8 r0, d1[3]
; ASM-NEXT: vmov.s16 r1, d3[3]
; ASM-NEXT: vmov.s16 r2, d5[3]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.16 d7[3], r2
; ASM-NEXT: vmov.i16 q0, q3
; ASM-NEXT: bx lr
ret <8 x i16> %res
}
; Scalarized lowering of a <16 x i8> select: the extract/tst/movne/insert
; pattern repeated 16 times, all extracts via vmov.s8 and inserts via vmov.8
; (lanes d0[0..7] then d1[0..7]).
define internal <16 x i8> @select16i8(<16 x i1> %s, <16 x i8> %a,
<16 x i8> %b) {
; ASM-LABEL:select16i8:
; DIS-LABEL:00000000 <select16i8>:
entry:
%res = select <16 x i1> %s, <16 x i8> %a, <16 x i8> %b
; ASM: # q3 = def.pseudo
; ASM-NEXT: vmov.s8 r0, d0[0]
; ASM-NEXT: vmov.s8 r1, d2[0]
; ASM-NEXT: vmov.s8 r2, d4[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[0], r2
; ASM-NEXT: vmov.s8 r0, d0[1]
; ASM-NEXT: vmov.s8 r1, d2[1]
; ASM-NEXT: vmov.s8 r2, d4[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[1], r2
; ASM-NEXT: vmov.s8 r0, d0[2]
; ASM-NEXT: vmov.s8 r1, d2[2]
; ASM-NEXT: vmov.s8 r2, d4[2]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[2], r2
; ASM-NEXT: vmov.s8 r0, d0[3]
; ASM-NEXT: vmov.s8 r1, d2[3]
; ASM-NEXT: vmov.s8 r2, d4[3]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[3], r2
; ASM-NEXT: vmov.s8 r0, d0[4]
; ASM-NEXT: vmov.s8 r1, d2[4]
; ASM-NEXT: vmov.s8 r2, d4[4]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[4], r2
; ASM-NEXT: vmov.s8 r0, d0[5]
; ASM-NEXT: vmov.s8 r1, d2[5]
; ASM-NEXT: vmov.s8 r2, d4[5]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[5], r2
; ASM-NEXT: vmov.s8 r0, d0[6]
; ASM-NEXT: vmov.s8 r1, d2[6]
; ASM-NEXT: vmov.s8 r2, d4[6]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[6], r2
; ASM-NEXT: vmov.s8 r0, d0[7]
; ASM-NEXT: vmov.s8 r1, d2[7]
; ASM-NEXT: vmov.s8 r2, d4[7]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d6[7], r2
; ASM-NEXT: vmov.s8 r0, d1[0]
; ASM-NEXT: vmov.s8 r1, d3[0]
; ASM-NEXT: vmov.s8 r2, d5[0]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[0], r2
; ASM-NEXT: vmov.s8 r0, d1[1]
; ASM-NEXT: vmov.s8 r1, d3[1]
; ASM-NEXT: vmov.s8 r2, d5[1]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[1], r2
; ASM-NEXT: vmov.s8 r0, d1[2]
; ASM-NEXT: vmov.s8 r1, d3[2]
; ASM-NEXT: vmov.s8 r2, d5[2]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[2], r2
; ASM-NEXT: vmov.s8 r0, d1[3]
; ASM-NEXT: vmov.s8 r1, d3[3]
; ASM-NEXT: vmov.s8 r2, d5[3]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[3], r2
; ASM-NEXT: vmov.s8 r0, d1[4]
; ASM-NEXT: vmov.s8 r1, d3[4]
; ASM-NEXT: vmov.s8 r2, d5[4]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[4], r2
; ASM-NEXT: vmov.s8 r0, d1[5]
; ASM-NEXT: vmov.s8 r1, d3[5]
; ASM-NEXT: vmov.s8 r2, d5[5]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[5], r2
; ASM-NEXT: vmov.s8 r0, d1[6]
; ASM-NEXT: vmov.s8 r1, d3[6]
; ASM-NEXT: vmov.s8 r2, d5[6]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[6], r2
; ASM-NEXT: vmov.s8 r0, d1[7]
; ASM-NEXT: vmov.s8 r1, d3[7]
; ASM-NEXT: vmov.s8 r2, d5[7]
; ASM-NEXT: tst r0, #1
; ASM-NEXT: movne r2, r1
; ASM-NEXT: vmov.8 d7[7], r2
; ASM-NEXT: vmov.i8 q0, q3
; ASM-NEXT: bx lr
ret <16 x i8> %res
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment