Add insert/extract element to the integrated ARM assembler.

BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4334
R=jpp@chromium.org
Review URL: https://codereview.chromium.org/1679023008 .

Add insert/extract element to the integrated ARM assembler.
6de32b21 · Karl Schimpf · cc69fa29 · 6de32b21 · 6de32b21 · 6de32b21
Commit 6de32b21 authored Feb 10, 2016 by Karl Schimpf
7 changed files
--- a/src/DartARM32/assembler_arm.cc
+++ b/src/DartARM32/assembler_arm.cc
@@ -683,7 +683,8 @@ void Assembler::vmovrrs(Register rt, Register rt2, SRegister sm,
  Emit(encoding);
 }

-
+#if 0
+// Moved to ARM32::AssemblerARM32::vmovqir().
 void Assembler::vmovdr(DRegister dn, int i, Register rt, Condition cond) {
  ASSERT(TargetCPUFeatures::vfp_supported());
  ASSERT((i == 0) || (i == 1));
@@ -701,7 +702,6 @@ void Assembler::vmovdr(DRegister dn, int i, Register rt, Condition cond) {
  Emit(encoding);
 }

-#if 0
 // Moved to ARM32::AssemblerARM32::vmovdrr().
 void Assembler::vmovdrr(DRegister dm, Register rt, Register rt2,
                        Condition cond) {

--- a/src/DartARM32/assembler_arm.h
+++ b/src/DartARM32/assembler_arm.h
@@ -630,9 +630,8 @@ class Assembler : public ValueObject {
  void vmovdrr(DRegister dm, Register rt, Register rt2, Condition cond = AL);
  // Moved to ARM32::AssemblerARM32::vmovrrd().
  void vmovrrd(Register rt, Register rt2, DRegister dm, Condition cond = AL);
-#endif
+  // Moved to ARM32::AssemblerARM32::vmovqir().
  void vmovdr(DRegister dd, int i, Register rt, Condition cond = AL);
-#if 0
  // Moved to ARM32::AssemblerARM32::vmovss().
  void vmovs(SRegister sd, SRegister sm, Condition cond = AL);
  // Moved to ARM32::AssemblerARM32::vmovdd().
@@ -1409,6 +1408,7 @@ class Assembler : public ValueObject {
  // ARM32::AssemblerARM32::veord()
  // ARM32::AssemblerARM32::vld1qr()
  // ARM32::AssemblerARM32::vst1qr()
+  // ARM32::AssemblerARM32::vmovrqi()
 #endif

  DISALLOW_ALLOCATION();

--- a/src/IceAssemblerARM32.cpp
+++ b/src/IceAssemblerARM32.cpp
@@ -158,7 +158,7 @@ IValueT encodeElmtType(Type ElmtTy) {
    return 3;
  default:
    llvm::report_fatal_error("SIMD op: Don't understand element type " +
-                             std::string(typeString(ElmtTy)));
+                             typeIceString(ElmtTy));
  }
 }

@@ -213,7 +213,17 @@ IValueT getEncodedQRegNum(const Variable *Var) {
  return RegARM32::getEncodedQReg(Var->getRegNum());
 }

-IValueT mapQRegToDReg(IValueT EncodedQReg) { return EncodedQReg << 1; }
+IValueT mapQRegToDReg(IValueT EncodedQReg) {
+  IValueT DReg = EncodedQReg << 1;
+  assert(DReg < RegARM32::getNumDRegs());
+  return DReg;
+}
+
+IValueT mapQRegToSReg(IValueT EncodedQReg) {
+  IValueT SReg = EncodedQReg << 2;
+  assert(SReg < RegARM32::getNumSRegs());
+  return SReg;
+}

 IValueT getYInRegXXXXY(IValueT RegXXXXY) { return RegXXXXY & 0x1; }

@@ -1010,6 +1020,60 @@ void AssemblerARM32::emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd,
  emitInst(Encoding);
 }

+void AssemblerARM32::emitInsertExtractInt(CondARM32::Cond Cond,
+                                          const Operand *OpQn, uint32_t Index,
+                                          const Operand *OpRt, bool IsExtract,
+                                          const char *InstName) {
+  const IValueT Rt = encodeGPRegister(OpRt, "Rt", InstName);
+  IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", InstName));
+  assert(Rt != RegARM32::Encoded_Reg_pc);
+  assert(Rt != RegARM32::Encoded_Reg_sp);
+  assert(CondARM32::isDefined(Cond));
+  const uint32_t BitSize = typeWidthInBytes(OpRt->getType()) * CHAR_BIT;
+  IValueT Opcode1 = 0;
+  IValueT Opcode2 = 0;
+  switch (BitSize) {
+  default:
+    llvm::report_fatal_error(std::string(InstName) +
+                             ": Unable to process type " +
+                             typeIceString(OpRt->getType()));
+  case 8:
+    assert(Index < 16);
+    Dn = Dn | mask(Index, 3, 1);
+    Opcode1 = B1 | mask(Index, 2, 1);
+    Opcode2 = mask(Index, 0, 2);
+    break;
+  case 16:
+    assert(Index < 8);
+    Dn = Dn | mask(Index, 2, 1);
+    Opcode1 = mask(Index, 1, 1);
+    Opcode2 = (mask(Index, 0, 1) << 1) | B0;
+    break;
+  case 32:
+    assert(Index < 4);
+    Dn = Dn | mask(Index, 1, 1);
+    Opcode1 = mask(Index, 0, 1);
+    break;
+  }
+  const IValueT Encoding = B27 | B26 | B25 | B11 | B9 | B8 | B4 |
+                           (encodeCondition(Cond) << kConditionShift) |
+                           (Opcode1 << 21) |
+                           (getXXXXInRegYXXXX(Dn) << kRnShift) | (Rt << 12) |
+                           (encodeBool(IsExtract) << 20) |
+                           (getYInRegYXXXX(Dn) << 7) | (Opcode2 << 5);
+  emitInst(Encoding);
+}
+
+void AssemblerARM32::emitMoveSS(CondARM32::Cond Cond, IValueT Sd, IValueT Sm) {
+  // VMOV (register) - ARM section A8.8.340, encoding A2:
+  //   vmov<c>.f32 <Sd>, <Sm>
+  //
+  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
+  constexpr IValueT VmovssOpcode = B23 | B21 | B20 | B6;
+  constexpr IValueT S0 = 0;
+  emitVFPsss(Cond, VmovssOpcode, Sd, S0, Sm);
+}
+
 void AssemblerARM32::emitMulOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd,
                               IValueT Rn, IValueT Rm, IValueT Rs,
                               bool SetFlags) {
@@ -2654,6 +2718,33 @@ void AssemblerARM32::vmovdrr(const Operand *OpDm, const Operand *OpRt,
  emitInst(Encoding);
 }

+void AssemblerARM32::vmovqir(const Operand *OpQn, uint32_t Index,
+                             const Operand *OpRt, CondARM32::Cond Cond) {
+  // VMOV (ARM core register to scalar) - ARM section A8.8.341, encoding A1:
+  //   vmov<c>.<size> <Dn[x]>, <Rt>
+  constexpr const char *Vmovdr = "vmovdr";
+  constexpr bool IsExtract = true;
+  emitInsertExtractInt(Cond, OpQn, Index, OpRt, !IsExtract, Vmovdr);
+}
+
+void AssemblerARM32::vmovqis(const Operand *OpQd, uint32_t Index,
+                             const Operand *OpSm, CondARM32::Cond Cond) {
+  constexpr const char *Vmovqis = "vmovqis";
+  assert(Index < 4);
+  IValueT Sd = mapQRegToSReg(encodeQRegister(OpQd, "Qd", Vmovqis)) + Index;
+  IValueT Sm = encodeSRegister(OpSm, "Sm", Vmovqis);
+  emitMoveSS(Cond, Sd, Sm);
+}
+
+void AssemblerARM32::vmovrqi(const Operand *OpRt, const Operand *OpQn,
+                             uint32_t Index, CondARM32::Cond Cond) {
+  // VMOV (scalar to ARM core register) - ARM section A8.8.342, encoding A1:
+  //   vmov<c>.<dt> <Rt>, <Dn[x]>
+  constexpr const char *Vmovrd = "vmovrd";
+  constexpr bool IsExtract = true;
+  emitInsertExtractInt(Cond, OpQn, Index, OpRt, IsExtract, Vmovrd);
+}
+
 void AssemblerARM32::vmovrrd(const Operand *OpRt, const Operand *OpRt2,
                             const Operand *OpDm, CondARM32::Cond Cond) {
  // VMOV (between two ARM core registers and a doubleword extension register).
@@ -2716,16 +2807,20 @@ void AssemblerARM32::vmovs(const Operand *OpSd,

 void AssemblerARM32::vmovss(const Operand *OpSd, const Variable *OpSm,
                            CondARM32::Cond Cond) {
-  // VMOV (register) - ARM section A8.8.340, encoding A2:
-  //   vmov<c>.f32 <Sd>, <Sm>
-  //
-  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
  constexpr const char *Vmovss = "Vmovss";
  IValueT Sd = encodeSRegister(OpSd, "Sd", Vmovss);
  IValueT Sm = encodeSRegister(OpSm, "Sm", Vmovss);
-  constexpr IValueT VmovssOpcode = B23 | B21 | B20 | B6;
-  constexpr IValueT S0 = 0;
-  emitVFPsss(Cond, VmovssOpcode, Sd, S0, Sm);
+  emitMoveSS(Cond, Sd, Sm);
+}
+
+void AssemblerARM32::vmovsqi(const Operand *OpSd, const Operand *OpQm,
+                             uint32_t Index, CondARM32::Cond Cond) {
+  constexpr const char *Vmovsqi = "vmovsqi";
+  const IValueT Sd = encodeSRegister(OpSd, "Sd", Vmovsqi);
+  assert(Index < 4);
+  const IValueT Sm =
+      mapQRegToSReg(encodeQRegister(OpQm, "Qm", Vmovsqi)) + Index;
+  emitMoveSS(Cond, Sd, Sm);
 }

 void AssemblerARM32::vmovsr(const Operand *OpSn, const Operand *OpRt,

--- a/src/IceAssemblerARM32.h
+++ b/src/IceAssemblerARM32.h
@@ -417,24 +417,48 @@ public:
    vld1qr(ElmtSize, OpQd, OpRn, TInfo);
  }

+  // Dn = FpImm
  void vmovd(const Operand *OpDn, const OperandARM32FlexFpImm *OpFpImm,
             CondARM32::Cond Cond);

+  // Dd = Dm
  void vmovdd(const Operand *OpDd, const Variable *OpDm, CondARM32::Cond Cond);

+  // Dm = Rt:Rt2
  void vmovdrr(const Operand *OpDm, const Operand *OpRt, const Operand *OpRt2,
               CondARM32::Cond Cond);

+  // Qd[Index] = Rt
+  void vmovqir(const Operand *OpQd, uint32_t Index, const Operand *OpRt,
+               CondARM32::Cond Cond);
+
+  // Qd[Index] = Sm
+  void vmovqis(const Operand *OpQd, uint32_t Indx, const Operand *OpSm,
+               CondARM32::Cond Cond);
+
+  // Rt = Qm[Index]
+  void vmovrqi(const Operand *OpRt, const Operand *OpQd, uint32_t Index,
+               CondARM32::Cond Cond);
+
+  // Rt:Rt2 = Dm
  void vmovrrd(const Operand *OpRt, const Operand *OpRt2, const Operand *OpDm,
               CondARM32::Cond Cond);

+  // Rt = Sn
  void vmovrs(const Operand *OpRt, const Operand *OpSn, CondARM32::Cond Cond);

+  // Sn = FpImm
  void vmovs(const Operand *OpSn, const OperandARM32FlexFpImm *OpFpImm,
             CondARM32::Cond Cond);

-  void vmovss(const Operand *OpDd, const Variable *OpDm, CondARM32::Cond Cond);
+  // Sd = Sm
+  void vmovss(const Operand *OpSd, const Variable *OpSm, CondARM32::Cond Cond);
+
+  // Sd = Qm[Index]
+  void vmovsqi(const Operand *OpSd, const Operand *OpQm, uint32_t Index,
+               CondARM32::Cond Cond);

+  // Sn = Rt
  void vmovsr(const Operand *OpSn, const Operand *OpRt, CondARM32::Cond Cond);

  void vmlad(const Operand *OpDd, const Operand *OpDn, const Operand *OpDm,
@@ -641,6 +665,17 @@ private:
  void emitDivOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,
                 IValueT Rm);

+  // cccc1110iiiennnntttt1011Njj10000 where cccc=Cond, tttt=Rt, Nnnnn=2*Qn=Dn,
+  // iii=Opcode1, jj=Opcode2, Opcode1Opcode2 encodes Index and the
+  // corresponding element size of the vector element, and e=IsExtract.
+  void emitInsertExtractInt(CondARM32::Cond Cond, const Operand *OpQn,
+                            uint32_t Index, const Operand *OpRt, bool IsExtract,
+                            const char *InstName);
+
+  // cccc11101D110000dddd101001M0mmmm where cccc=Cond, ddddD=Sd, and mmmmM=Sm.
+  // Assigns Sd the value of Sm.
+  void emitMoveSS(CondARM32::Cond Cond, IValueT Sd, IValueT Sm);
+
  // Pattern ccccxxxxxxxfnnnnddddssss1001mmmm where cccc=Cond, dddd=Rd, nnnn=Rn,
  // mmmm=Rm, ssss=Rs, f=SetFlags and xxxxxxx=Opcode.
  void emitMulOp(CondARM32::Cond Cond, IValueT Opcode, IValueT Rd, IValueT Rn,

--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -1067,6 +1067,8 @@ InstARM32Mov::InstARM32Mov(Cfg *Func, Variable *Dest, Operand *Src,
  }
 }

+namespace {
+
 // These next two functions find the D register that maps to the half of the Q
 // register that this instruction is accessing.
 Register getDRegister(const Variable *Src, uint32_t Index) {
@@ -1124,6 +1126,8 @@ Register getSRegister(const Variable *Src, uint32_t Index) {
  return static_cast<Register>(RegARM32::RegTable[SrcReg].Aliases[Index + 3]);
 }

+} // end of anonymous namespace
+
 void InstARM32Extract::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  const Type DestTy = getDest()->getType();
@@ -1162,6 +1166,23 @@ void InstARM32Extract::emit(const Cfg *Func) const {
  }
 }

+void InstARM32Extract::emitIAS(const Cfg *Func) const {
+  const Operand *Dest = getDest();
+  const Type DestTy = Dest->getType();
+  const Operand *Src = getSrc(0);
+  assert(isVectorType(Src->getType()));
+  assert(DestTy == typeElementType(Src->getType()));
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  if (isIntegerType(DestTy)) {
+    Asm->vmovrqi(Dest, Src, Index, getPredicate());
+    assert(!Asm->needsTextFixup());
+    return;
+  }
+  assert(isFloatingType(DestTy));
+  Asm->vmovsqi(Dest, Src, Index, getPredicate());
+  assert(!Asm->needsTextFixup());
+}
+
 void InstARM32Insert::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  const Variable *Dest = getDest();
@@ -1193,6 +1214,24 @@ void InstARM32Insert::emit(const Cfg *Func) const {
  }
 }

+void InstARM32Insert::emitIAS(const Cfg *Func) const {
+  const Variable *Dest = getDest();
+  const Operand *Src = getSrc(0);
+  const Type SrcTy = Src->getType();
+  assert(isVectorType(Dest->getType()));
+  assert(typeElementType(Dest->getType()) == SrcTy);
+  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
+  if (isIntegerType(SrcTy)) {
+    const Operand *Src = getSrc(0);
+    Asm->vmovqir(Dest, Index, Src, getPredicate());
+    assert(!Asm->needsTextFixup());
+    return;
+  }
+  assert(isFloatingType(SrcTy));
+  Asm->vmovqis(Dest, Index, Src, getPredicate());
+  assert(!Asm->needsTextFixup());
+}
+
 template <InstARM32::InstKindARM32 K>
 void InstARM32CmpLike<K>::emitIAS(const Cfg *Func) const {
  emitUsingTextFixup(Func);

--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -1368,6 +1368,7 @@ public:
        InstARM32Extract(Func, Dest, Src0, Index, Predicate);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, Extract); }

 private:
@@ -1396,6 +1397,7 @@ public:
        InstARM32Insert(Func, Dest, Src0, Index, Predicate);
  }
  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
  static bool classof(const Inst *Inst) { return isClassof(Inst, Insert); }

 private:

--- a/tests_lit/assembler/arm32/insert-extract.ll
+++ b/tests_lit/assembler/arm32/insert-extract.ll
@@ -27,9 +27,9 @@ define internal i32 @extract1_v4i32(<4 x i32> %src) {

  %1 = extractelement <4 x i32> %src, i32 1

-; ASM: vmov.32	r0, d0[1]
+; ASM: vmov.32  r0, d0[1]
 ; DIS:   10:       ee300b10
-
+; IASM-NOT: vmov.32  r0, d0[1]
  ret i32 %1
 }

@@ -40,8 +40,9 @@ define internal i32 @extract2_v4i32(<4 x i32> %src) {

  %1 = extractelement <4 x i32> %src, i32 2

-; ASM: vmov.32	r0, d1[0]
+; ASM: vmov.32  r0, d1[0]
 ; DIS:   40:       ee110b10
+; IASM-NOT: vmov.32  r0, d1[0]

  ret i32 %1
 }
@@ -53,8 +54,10 @@ define internal i32 @extract3_v8i16(<8 x i16> %src) {

  %1 = extractelement <8 x i16> %src, i32 3

-; ASM: vmov.s16	r0, d0[3]
+; ASM: vmov.s16 r0, d0[3]
 ; DIS:   70:       ee300b70
+; IASM-NOT: vmov.s16 r0, d0[3]
+
  %2 = sext i16 %1 to i32
  ret i32 %2
 }
@@ -66,8 +69,9 @@ define internal i32 @extract4_v8i16(<8 x i16> %src) {

  %1 = extractelement <8 x i16> %src, i32 4

-; ASM: vmov.s16	r0, d1[0]
+; ASM: vmov.s16 r0, d1[0]
 ; DIS:   a0:       ee110b30
+; IASM-NOT: vmov.s16 r0, d1[0]

  %2 = sext i16 %1 to i32
  ret i32 %2
@@ -80,8 +84,9 @@ define internal i32 @extract7_v4i8(<16 x i8> %src) {

  %1 = extractelement <16 x i8> %src, i32 7

-; ASM: vmov.s8	r0, d0[7]
+; ASM: vmov.s8  r0, d0[7]
 ; DIS:   d0:       ee700b70
+; IASM-NOT: vmov.s8  r0, d0[7]

  %2 = sext i8 %1 to i32
  ret i32 %2
@@ -94,8 +99,9 @@ define internal i32 @extract8_v16i8(<16 x i8> %src) {

  %1 = extractelement <16 x i8> %src, i32 8

-; ASM: vmov.s8	r0, d1[0]
+; ASM: vmov.s8  r0, d1[0]
 ; DIS:   100:       ee510b10
+; IASM-NOT: vmov.s8  r0, d1[0]

  %2 = sext i8 %1 to i32
  ret i32 %2
@@ -108,8 +114,9 @@ define internal float @extract1_v4float(<4 x float> %src) {

  %1 = extractelement <4 x float> %src, i32 1

-; ASM: vmov.f32	s0, s1
+; ASM: vmov.f32 s0, s1
 ; DIS:   130:       eeb00a60
+; IASM-NOT: vmov.f32 s0, s1

  ret float %1
 }
@@ -121,8 +128,9 @@ define internal float @extract2_v4float(<4 x float> %src) {

  %1 = extractelement <4 x float> %src, i32 2

-; ASM: vmov.f32	s0, s2
+; ASM: vmov.f32 s0, s2
 ; DIS:   160:       eeb00a41
+; IASM-NOT: vmov.f32 s0, s2

  ret float %1
 }
@@ -134,8 +142,9 @@ define internal <4 x i32> @insert1_v4i32(<4 x i32> %src, i32 %s) {

  %1 = insertelement <4 x i32> %src, i32 %s, i32 1

-; ASM: vmov.32	d0[1], r0
+; ASM: vmov.32  d0[1], r0
 ; DIS:   198:       ee200b10
+; IASM-NOT: vmov.32  d0[1], r0

  ret <4 x i32> %1
 }
@@ -147,8 +156,9 @@ define internal <4 x i32> @insert2_v4i32(<4 x i32> %src, i32 %s) {

  %1 = insertelement <4 x i32> %src, i32 %s, i32 2

-; ASM: vmov.32	d1[0], r0
+; ASM: vmov.32  d1[0], r0
 ; DIS:   1c8:       ee010b10
+; IASM-NOT: vmov.32  d1[0], r0

  ret <4 x i32> %1
 }
@@ -161,8 +171,10 @@ define internal <8 x i16> @insert3_v8i16(<8 x i16> %src, i32 %s) {
  %s2 = trunc i32 %s to i16
  %1 = insertelement <8 x i16> %src, i16 %s2, i32 3

-; ASM: vmov.16	d0[3], r0
+; ASM: vmov.16  d0[3], r0
 ; DIS:   200:       ee200b70
+; IASM-NOT: vmov.16  d0[3], r0
+
  ret <8 x i16> %1
 }

@@ -174,8 +186,10 @@ define internal <8 x i16> @insert4_v8i16(<8 x i16> %src, i32 %s) {
  %s2 = trunc i32 %s to i16
  %1 = insertelement <8 x i16> %src, i16 %s2, i32 4

-; ASM: vmov.16	d1[0], r0
+; ASM: vmov.16  d1[0], r0
 ; DIS:   240:       ee010b30
+; IASM-NOT: vmov.16  d1[0], r0
+
  ret <8 x i16> %1
 }

@@ -187,8 +201,9 @@ define internal <16 x i8> @insert7_v4i8(<16 x i8> %src, i32 %s) {
  %s2 = trunc i32 %s to i8
  %1 = insertelement <16 x i8> %src, i8 %s2, i32 7

-; ASM: vmov.8	d0[7], r0
+; ASM: vmov.8   d0[7], r0
 ; DIS:   280:       ee600b70
+; IASM-NOT: vmov.8   d0[7], r0

  ret <16 x i8> %1
 }
@@ -201,8 +216,9 @@ define internal <16 x i8> @insert8_v16i8(<16 x i8> %src, i32 %s) {
  %s2 = trunc i32 %s to i8
  %1 = insertelement <16 x i8> %src, i8 %s2, i32 8

-; ASM: vmov.8	d1[0], r0
+; ASM: vmov.8   d1[0], r0
 ; DIS:   2c0:       ee410b10
+; IASM-NOT: vmov.8   d1[0], r0

  ret <16 x i8> %1
 }
@@ -214,8 +230,9 @@ define internal <4 x float> @insert1_v4float(<4 x float> %src, float %s) {

  %1 = insertelement <4 x float> %src, float %s, i32 1

-; ASM: vmov.f32	s1, s4
+; ASM: vmov.f32 s1, s4
 ; DIS:   2f8:       eef00a42
+; IASM-NOT: vmov.f32 s1, s4

  ret <4 x float> %1
 }
@@ -227,8 +244,9 @@ define internal <4 x float> @insert2_v4float(<4 x float> %src, float %s) {

  %1 = insertelement <4 x float> %src, float %s, i32 2

-; ASM: vmov.f32	s2, s4
+; ASM: vmov.f32 s2, s4
 ; DIS:   328:       eeb01a42
+; IASM-NOT: vmov.f32 s2, s4

  ret <4 x float> %1
 }