Subzero: ARM32: lowering of vector insert and extract.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1655313002 .

Subzero: ARM32: lowering of vector insert and extract.
658bae20 · Eric Holk · 2d6c8267 · 658bae20 · 658bae20 · 658bae20
Commit 658bae20 authored Feb 08, 2016 by Eric Holk
6 changed files
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -20,7 +20,6 @@
 #include "IceCfgNode.h"
 #include "IceInst.h"
 #include "IceOperand.h"
-#include "IceRegistersARM32.h"
 #include "IceTargetLoweringARM32.h"

 namespace Ice {
@@ -28,6 +27,8 @@ namespace ARM32 {

 namespace {

+using Register = RegARM32::AllRegisters;
+
 // maximum number of registers allowed in vpush/vpop.
 static constexpr SizeT VpushVpopMaxConsecRegs = 16;

@@ -1043,6 +1044,132 @@ InstARM32Mov::InstARM32Mov(Cfg *Func, Variable *Dest, Operand *Src,
  }
 }

+// getDRegister and getDIndex locate one element of a Q register: the D register
+// (half of the Q register) that holds it, and its index within that D register.
+Register getDRegister(const Variable *Src, uint32_t Index) {
+  assert(Src->hasReg());
+  const auto SrcReg = static_cast<Register>(Src->getRegNum());
+
+  const RegARM32::RegTableType &SrcEntry = RegARM32::RegTable[SrcReg];
+  assert(SrcEntry.IsVec128);
+
+  const uint32_t NumElements = typeNumElements(Src->getType());
+
+  // This code assumes the Aliases list goes Q_n, D_2n, D_2n+1. The asserts in
+  // the next two branches help to check that this is still true.
+  if (Index < NumElements / 2) {
+    // Index is in the lower-indexed half of the Q register, so use the first D
+    // alias. The assert checks the second D alias is encoded right after it.
+    //
+    // TODO(jpp): find a way to do this that doesn't rely on ordering of the
+    // alias list.
+    assert(RegARM32::RegTable[SrcEntry.Aliases[1]].Encoding + 1 ==
+           RegARM32::RegTable[SrcEntry.Aliases[2]].Encoding);
+    return static_cast<Register>(SrcEntry.Aliases[1]);
+  } else {
+    // Index is in the upper-indexed half of the Q register, so use the second D
+    // alias. The assert checks the first D alias is encoded right before it.
+    //
+    // TODO(jpp): find a way to do this that doesn't rely on ordering of the
+    // alias list.
+    assert(RegARM32::RegTable[SrcEntry.Aliases[2]].Encoding - 1 ==
+           RegARM32::RegTable[SrcEntry.Aliases[1]].Encoding);
+    return static_cast<Register>(SrcEntry.Aliases[2]);
+  }
+}
+
+constexpr uint32_t getDIndex(uint32_t NumElements, uint32_t Index) {
+  return (Index < NumElements / 2) ? Index : Index - (NumElements / 2);
+}
+
+// For floating point values, we can insertelement or extractelement by moving
+// directly to or from an S register. This function finds the right one.
+Register getSRegister(const Variable *Src, uint32_t Index) {
+  assert(Src->hasReg());
+  const auto SrcReg = static_cast<Register>(Src->getRegNum());
+
+  // A v4f32 vector must be allocated to Q0 - Q7, since only then can each of
+  // its elements be accessed directly as one of the S registers.
+  assert(Src->getType() == IceType_v4f32);
+  assert(SrcReg < RegARM32::Reg_q8);
+
+  // Aliases[Index + 3] below assumes the list goes q0, d0, d1, s0, s1, s2, s3.
+  assert(Index < 4);
+
+  // TODO(jpp): find a way to do this that doesn't rely on ordering of the alias
+  // list.
+  return static_cast<Register>(RegARM32::RegTable[SrcReg].Aliases[Index + 3]);
+}
+
+void InstARM32Extract::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  const Type DestTy = getDest()->getType();
+
+  const auto *Src = llvm::cast<Variable>(getSrc(0));
+
+  if (isIntegerType(DestTy)) {
+    Str << "\t"
+        << "vmov" << getPredicate();
+    const uint32_t BitSize = typeWidthInBytes(DestTy) * CHAR_BIT;
+    if (BitSize < 32) {
+      Str << ".s" << BitSize;
+    } else {
+      Str << "." << BitSize;
+    }
+    Str << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+
+    const size_t VectorSize = typeNumElements(Src->getType());
+
+    const Register SrcReg = getDRegister(Src, Index);
+
+    Str << RegARM32::RegTable[SrcReg].Name;
+    Str << "[" << getDIndex(VectorSize, Index) << "]";
+  } else if (isFloatingType(DestTy)) {
+    const Register SrcReg = getSRegister(Src, Index);
+
+    Str << "\t"
+        << "vmov" << getPredicate() << ".f32"
+        << "\t";
+    getDest()->emit(Func);
+    Str << ", " << RegARM32::RegTable[SrcReg].Name;
+  } else {
+    assert(false && "Invalid extract type");
+  }
+}
+
+void InstARM32Insert::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  const Variable *Dest = getDest();
+  const Type DestTy = getDest()->getType();
+
+  const auto *Src = llvm::cast<Variable>(getSrc(0));
+
+  if (isIntegerType(DestTy)) {
+    Str << "\t"
+        << "vmov" << getPredicate();
+    const size_t BitSize = typeWidthInBytes(typeElementType(DestTy)) * CHAR_BIT;
+    Str << "." << BitSize << "\t";
+    // vmov writes one lane of a D register: map Index to (D register, lane).
+    const size_t VectorSize = typeNumElements(DestTy);
+    const Register DestReg = getDRegister(Dest, Index);
+    const uint32_t DIndex = getDIndex(VectorSize, Index);
+    Str << RegARM32::RegTable[DestReg].Name;
+    Str << "[" << DIndex << "], ";
+    Src->emit(Func);
+  } else if (isFloatingType(DestTy)) {
+    Str << "\t"
+        << "vmov" << getPredicate() << ".f32"
+        << "\t";
+    const Register DestReg = getSRegister(Dest, Index);
+    Str << RegARM32::RegTable[DestReg].Name << ", ";
+    Src->emit(Func);
+  } else {
+    assert(false && "Invalid insert type");
+  }
+}
+
 template <InstARM32::InstKindARM32 K>
 void InstARM32CmpLike<K>::emitIAS(const Cfg *Func) const {
  emitUsingTextFixup(Func);

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -23,6 +23,7 @@
 #include "IceInst.h"
 #include "IceInstARM32.def"
 #include "IceOperand.h"
+#include "IceRegistersARM32.h"

 namespace Ice {
 namespace ARM32 {
@@ -389,6 +390,8 @@ public:
    Cmp,
    Dmb,
    Eor,
+    Extract,
+    Insert,
    Label,
    Ldr,
    Ldrex,
@@ -1349,6 +1352,62 @@ private:
  Variable *DestHi = nullptr;
 };

+/// Extracts a vector element: emits vmov Rd, Dn[x] for integer elements, or
+/// vmov.f32 from the S register aliasing the element for floating point ones.
+class InstARM32Extract final : public InstARM32Pred {
+  InstARM32Extract() = delete;
+  InstARM32Extract(const InstARM32Extract &) = delete;
+  InstARM32Extract &operator=(const InstARM32Extract &) = delete;
+
+public:
+  static InstARM32Extract *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                  uint32_t Index, CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Extract>())
+        InstARM32Extract(Func, Dest, Src0, Index, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Extract); }
+
+private:
+  InstARM32Extract(Cfg *Func, Variable *Dest, Variable *Src0, uint32_t Index,
+                   CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, InstARM32::Extract, 1, Dest, Predicate),
+        Index(Index) {
+    assert(Index < typeNumElements(Src0->getType()));
+    addSource(Src0);
+  }
+
+  const uint32_t Index;
+};
+
+/// Inserts a vector element: emits vmov Dn[x], Rd for integer elements, or
+/// vmov.f32 into the S register aliasing the element for floating point ones.
+class InstARM32Insert final : public InstARM32Pred {
+  InstARM32Insert() = delete;
+  InstARM32Insert(const InstARM32Insert &) = delete;
+  InstARM32Insert &operator=(const InstARM32Insert &) = delete;
+
+public:
+  static InstARM32Insert *create(Cfg *Func, Variable *Dest, Variable *Src0,
+                                 uint32_t Index, CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Insert>())
+        InstARM32Insert(Func, Dest, Src0, Index, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Insert); }
+
+private:
+  InstARM32Insert(Cfg *Func, Variable *Dest, Variable *Src0, uint32_t Index,
+                  CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, InstARM32::Insert, 1, Dest, Predicate),
+        Index(Index) {
+    assert(Index < typeNumElements(Dest->getType()));
+    addSource(Src0);
+  }
+
+  const uint32_t Index;
+};
+
 class InstARM32Vcmp final : public InstARM32Pred {
  InstARM32Vcmp() = delete;
  InstARM32Vcmp(const InstARM32Vcmp &) = delete;

--- a/src/IceRegistersARM32.h
+++ b/src/IceRegistersARM32.h
@@ -219,8 +219,12 @@ static inline IceString getRegName(int32_t RegNum) {
  return RegTable[RegNum].Name;
 }

-// Extend enum RegClass with ARM32-specific register classes (if any).
-enum RegClassARM32 : uint8_t { RCARM32_NUM = RC_Target };
+// Extend enum RegClass with ARM32-specific register classes.
+enum RegClassARM32 : uint8_t {
+  RCARM32_QtoS = RC_Target, // Denotes Q registers that are aliased by S
+                            // registers.
+  RCARM32_NUM
+};

 } // end of namespace RegARM32
 } // end of namespace ARM32

--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -296,7 +296,9 @@ void TargetARM32::staticInit(GlobalContext *Ctx) {
  llvm::SmallBitVector Float32Registers(RegARM32::Reg_NUM);
  llvm::SmallBitVector Float64Registers(RegARM32::Reg_NUM);
  llvm::SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
+  llvm::SmallBitVector QtoSRegisters(RegARM32::Reg_NUM);
  llvm::SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
+  const unsigned EncodedReg_q8 = RegARM32::RegTable[RegARM32::Reg_q8].Encoding;
  for (int i = 0; i < RegARM32::Reg_NUM; ++i) {
    const auto &Entry = RegARM32::RegTable[i];
    IntegerRegisters[i] = Entry.IsInt;
@@ -305,6 +307,9 @@ void TargetARM32::staticInit(GlobalContext *Ctx) {
    Float64Registers[i] = Entry.IsFP64;
    VectorRegisters[i] = Entry.IsVec128;
    RegisterAliases[i].resize(RegARM32::Reg_NUM);
+    // TODO(eholk): It would be better to store a QtoS flag in the
+    // IceRegistersARM32 table than to compare their encodings here.
+    QtoSRegisters[i] = Entry.IsVec128 && Entry.Encoding < EncodedReg_q8;
    for (int j = 0; j < Entry.NumAliases; ++j) {
      assert(i == j || !RegisterAliases[i][Entry.Aliases[j]]);
      RegisterAliases[i].set(Entry.Aliases[j]);
@@ -340,6 +345,7 @@ void TargetARM32::staticInit(GlobalContext *Ctx) {
  TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
  TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
  TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
+  TypeToRegisterSet[RegARM32::RCARM32_QtoS] = QtoSRegisters;

  for (size_t i = 0; i < llvm::array_lengthof(TypeToRegisterSet); ++i)
    TypeToRegisterSetUnfiltered[i] = TypeToRegisterSet[i];
@@ -3834,7 +3840,28 @@ void TargetARM32::lowerCast(const InstCast *Instr) {
 }

 void TargetARM32::lowerExtractElement(const InstExtractElement *Instr) {
-  UnimplementedLoweringError(this, Instr);
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+
+  Variable *Src0 = legalizeToReg(Instr->getSrc(0));
+  Operand *Src1 = Instr->getSrc(1);
+
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src1)) {
+    const uint32_t Index = Imm->getValue();
+    Variable *T = makeReg(DestTy);
+    Variable *TSrc0 = makeReg(Src0->getType());
+
+    if (isFloatingType(DestTy)) {
+      // The source must be in Q0-Q7 so that its elements alias S registers.
+      TSrc0->setRegClass(RegARM32::RCARM32_QtoS);
+    }
+
+    _mov(TSrc0, Src0);
+    _extractelement(T, TSrc0, Index);
+    _mov(Dest, T);
+    return;
+  }
+  assert(false && "extractelement requires a constant index");
 }

 namespace {
@@ -4229,7 +4256,28 @@ void TargetARM32::lowerIcmp(const InstIcmp *Instr) {
 }

 void TargetARM32::lowerInsertElement(const InstInsertElement *Instr) {
-  UnimplementedLoweringError(this, Instr);
+  Variable *Dest = Instr->getDest();
+  Type DestTy = Dest->getType();
+
+  Variable *Src0 = legalizeToReg(Instr->getSrc(0));
+  Variable *Src1 = legalizeToReg(Instr->getSrc(1));
+  Operand *Src2 = Instr->getSrc(2);
+
+  if (const auto *Imm = llvm::dyn_cast<ConstantInteger32>(Src2)) {
+    const uint32_t Index = Imm->getValue();
+    Variable *T = makeReg(DestTy);
+
+    if (isFloatingType(DestTy)) {
+      T->setRegClass(RegARM32::RCARM32_QtoS);
+    }
+
+    _mov(T, Src0);
+    _insertelement(T, Src1, Index);
+    _set_dest_redefined();
+    _mov(Dest, T);
+    return;
+  }
+  assert(false && "insertelement requires a constant index");
 }

 namespace {

--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -85,13 +85,18 @@ public:
  const llvm::SmallBitVector &
  getRegistersForVariable(const Variable *Var) const override {
    RegClass RC = Var->getRegClass();
-    assert(RC < RC_Target);
-    return TypeToRegisterSet[RC];
+    switch (RC) {
+    default:
+      assert(RC < RC_Target);
+      return TypeToRegisterSet[RC];
+    case RegARM32::RCARM32_QtoS:
+      return TypeToRegisterSet[RC];
+    }
  }
  const llvm::SmallBitVector &
  getAllRegistersForVariable(const Variable *Var) const override {
    RegClass RC = Var->getRegClass();
-    assert(RC < RC_Target);
+    assert(static_cast<RegARM32::RegClassARM32>(RC) < RegARM32::RCARM32_NUM);
    return TypeToRegisterSetUnfiltered[RC];
  }
  const llvm::SmallBitVector &getAliasesForRegister(SizeT Reg) const override {
@@ -413,6 +418,20 @@ protected:
    }
  }

+  // Generates a vmov instruction to extract the given index from a vector
+  // register.
+  void _extractelement(Variable *Dest, Variable *Src0, uint32_t Index,
+                       CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert<InstARM32Extract>(Dest, Src0, Index, Pred);
+  }
+
+  // Generates a vmov instruction to insert a value into the given index of a
+  // vector register.
+  void _insertelement(Variable *Dest, Variable *Src0, uint32_t Index,
+                      CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert<InstARM32Insert>(Dest, Src0, Index, Pred);
+  }
+
  // --------------------------------------------------------------------------
  // Begin bool folding machinery.
  //

--- a/tests_lit/assembler/arm32/insert-extract.ll
+++ b/tests_lit/assembler/arm32/insert-extract.ll
+; Show that we know how to translate insertelement and extractelement.
+
+; REQUIRES: allow_dump
+
+; Compile using standalone assembler.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --args -Om1 \
+; RUN:   | FileCheck %s --check-prefix=ASM
+
+; Show bytes in assembled standalone code.
+; RUN: %p2i --filetype=asm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -Om1 \
+; RUN:   | FileCheck %s --check-prefix=DIS
+
+; Compile using integrated assembler.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --args -Om1 \
+; RUN:   | FileCheck %s --check-prefix=IASM
+
+; Show bytes in assembled integrated code.
+; RUN: %p2i --filetype=iasm -i %s --target=arm32 --assemble --disassemble \
+; RUN:   --args -Om1 \
+; RUN:   | FileCheck %s --check-prefix=DIS
+
+define internal i32 @extract1_v4i32(<4 x i32> %src) {
+; ASM-LABEL: extract1_v4i32:
+; DIS-LABEL: 00000000 <extract1_v4i32>:
+; IASM-LABEL: extract1_v4i32:
+
+  %1 = extractelement <4 x i32> %src, i32 1
+
+; ASM: vmov.32	r0, d0[1]
+; DIS:   10:       ee300b10
+
+  ret i32 %1
+}
+
+define internal i32 @extract2_v4i32(<4 x i32> %src) {
+; ASM-LABEL: extract2_v4i32:
+; DIS-LABEL: 00000030 <extract2_v4i32>:
+; IASM-LABEL: extract2_v4i32:
+
+  %1 = extractelement <4 x i32> %src, i32 2
+
+; ASM: vmov.32	r0, d1[0]
+; DIS:   40:       ee110b10
+
+  ret i32 %1
+}
+
+define internal i32 @extract3_v8i16(<8 x i16> %src) {
+; ASM-LABEL: extract3_v8i16:
+; DIS-LABEL: 00000060 <extract3_v8i16>:
+; IASM-LABEL: extract3_v8i16:
+
+  %1 = extractelement <8 x i16> %src, i32 3
+
+; ASM: vmov.s16	r0, d0[3]
+; DIS:   70:       ee300b70
+  %2 = sext i16 %1 to i32
+  ret i32 %2
+}
+
+define internal i32 @extract4_v8i16(<8 x i16> %src) {
+; ASM-LABEL: extract4_v8i16:
+; DIS-LABEL: 00000090 <extract4_v8i16>:
+; IASM-LABEL: extract4_v8i16:
+
+  %1 = extractelement <8 x i16> %src, i32 4
+
+; ASM: vmov.s16	r0, d1[0]
+; DIS:   a0:       ee110b30
+
+  %2 = sext i16 %1 to i32
+  ret i32 %2
+}
+
+define internal i32 @extract7_v16i8(<16 x i8> %src) {
+; ASM-LABEL: extract7_v16i8:
+; DIS-LABEL: 000000c0 <extract7_v16i8>:
+; IASM-LABEL: extract7_v16i8:
+
+  %1 = extractelement <16 x i8> %src, i32 7
+
+; ASM: vmov.s8	r0, d0[7]
+; DIS:   d0:       ee700b70
+
+  %2 = sext i8 %1 to i32
+  ret i32 %2
+}
+
+define internal i32 @extract8_v16i8(<16 x i8> %src) {
+; ASM-LABEL: extract8_v16i8:
+; DIS-LABEL: 000000f0 <extract8_v16i8>:
+; IASM-LABEL: extract8_v16i8:
+
+  %1 = extractelement <16 x i8> %src, i32 8
+
+; ASM: vmov.s8	r0, d1[0]
+; DIS:   100:       ee510b10
+
+  %2 = sext i8 %1 to i32
+  ret i32 %2
+}
+
+define internal float @extract1_v4float(<4 x float> %src) {
+; ASM-LABEL: extract1_v4float:
+; DIS-LABEL: 00000120 <extract1_v4float>:
+; IASM-LABEL: extract1_v4float:
+
+  %1 = extractelement <4 x float> %src, i32 1
+
+; ASM: vmov.f32	s0, s1
+; DIS:   130:       eeb00a60
+
+  ret float %1
+}
+
+define internal float @extract2_v4float(<4 x float> %src) {
+; ASM-LABEL: extract2_v4float:
+; DIS-LABEL: 00000150 <extract2_v4float>:
+; IASM-LABEL: extract2_v4float:
+
+  %1 = extractelement <4 x float> %src, i32 2
+
+; ASM: vmov.f32	s0, s2
+; DIS:   160:       eeb00a41
+
+  ret float %1
+}
+
+define internal <4 x i32> @insert1_v4i32(<4 x i32> %src, i32 %s) {
+; ASM-LABEL: insert1_v4i32:
+; DIS-LABEL: 00000180 <insert1_v4i32>:
+; IASM-LABEL: insert1_v4i32:
+
+  %1 = insertelement <4 x i32> %src, i32 %s, i32 1
+
+; ASM: vmov.32	d0[1], r0
+; DIS:   198:       ee200b10
+
+  ret <4 x i32> %1
+}
+
+define internal <4 x i32> @insert2_v4i32(<4 x i32> %src, i32 %s) {
+; ASM-LABEL: insert2_v4i32:
+; DIS-LABEL: 000001b0 <insert2_v4i32>:
+; IASM-LABEL: insert2_v4i32:
+
+  %1 = insertelement <4 x i32> %src, i32 %s, i32 2
+
+; ASM: vmov.32	d1[0], r0
+; DIS:   1c8:       ee010b10
+
+  ret <4 x i32> %1
+}
+
+define internal <8 x i16> @insert3_v8i16(<8 x i16> %src, i32 %s) {
+; ASM-LABEL: insert3_v8i16:
+; DIS-LABEL: 000001e0 <insert3_v8i16>:
+; IASM-LABEL: insert3_v8i16:
+
+  %s2 = trunc i32 %s to i16
+  %1 = insertelement <8 x i16> %src, i16 %s2, i32 3
+
+; ASM: vmov.16	d0[3], r0
+; DIS:   200:       ee200b70
+  ret <8 x i16> %1
+}
+
+define internal <8 x i16> @insert4_v8i16(<8 x i16> %src, i32 %s) {
+; ASM-LABEL: insert4_v8i16:
+; DIS-LABEL: 00000220 <insert4_v8i16>:
+; IASM-LABEL: insert4_v8i16:
+
+  %s2 = trunc i32 %s to i16
+  %1 = insertelement <8 x i16> %src, i16 %s2, i32 4
+
+; ASM: vmov.16	d1[0], r0
+; DIS:   240:       ee010b30
+  ret <8 x i16> %1
+}
+
+define internal <16 x i8> @insert7_v16i8(<16 x i8> %src, i32 %s) {
+; ASM-LABEL: insert7_v16i8:
+; DIS-LABEL: 00000260 <insert7_v16i8>:
+; IASM-LABEL: insert7_v16i8:
+
+  %s2 = trunc i32 %s to i8
+  %1 = insertelement <16 x i8> %src, i8 %s2, i32 7
+
+; ASM: vmov.8	d0[7], r0
+; DIS:   280:       ee600b70
+
+  ret <16 x i8> %1
+}
+
+define internal <16 x i8> @insert8_v16i8(<16 x i8> %src, i32 %s) {
+; ASM-LABEL: insert8_v16i8:
+; DIS-LABEL: 000002a0 <insert8_v16i8>:
+; IASM-LABEL: insert8_v16i8:
+
+  %s2 = trunc i32 %s to i8
+  %1 = insertelement <16 x i8> %src, i8 %s2, i32 8
+
+; ASM: vmov.8	d1[0], r0
+; DIS:   2c0:       ee410b10
+
+  ret <16 x i8> %1
+}
+
+define internal <4 x float> @insert1_v4float(<4 x float> %src, float %s) {
+; ASM-LABEL: insert1_v4float:
+; DIS-LABEL: 000002e0 <insert1_v4float>:
+; IASM-LABEL: insert1_v4float:
+
+  %1 = insertelement <4 x float> %src, float %s, i32 1
+
+; ASM: vmov.f32	s1, s4
+; DIS:   2f8:       eef00a42
+
+  ret <4 x float> %1
+}
+
+define internal <4 x float> @insert2_v4float(<4 x float> %src, float %s) {
+; ASM-LABEL: insert2_v4float:
+; DIS-LABEL: 00000310 <insert2_v4float>:
+; IASM-LABEL: insert2_v4float:
+
+  %1 = insertelement <4 x float> %src, float %s, i32 2
+
+; ASM: vmov.f32	s2, s4
+; DIS:   328:       eeb01a42
+
+  ret <4 x float> %1
+}