Use movss to implement insertelement when elements = 4 and index = 0.

This avoids using a pair of shufps instructions as the previous lowering was doing. Instead, we use movss to copy the element to be inserted into the lower 32 bits of the destination.

Define InstX8632Movss as a Binop, the class to which it properly belongs.

BUG=none
R=jvoung@chromium.org, stichnot@chromium.org

Review URL: https://codereview.chromium.org/412353005

Use movss to implement insertelement when elements = 4 and index = 0.
cfe5146f · Matt Wala · ce0ca8f8 · cfe5146f · cfe5146f · cfe5146f
Commit cfe5146f authored Jul 25, 2014 by Matt Wala
Showing with 65 additions and 25 deletions

IceInstX8632.cpp src/IceInstX8632.cpp +1 -1

IceInstX8632.h src/IceInstX8632.h +7 -1

IceTargetLoweringX8632.cpp src/IceTargetLoweringX8632.cpp +25 -17

vector-ops.ll tests_lit/llvm2ice_tests/vector-ops.ll +32 -6

No files found.
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -462,7 +462,6 @@ template <> const char *InstX8632Bsf::Opcode = "bsf";
 template <> const char *InstX8632Bsr::Opcode = "bsr";
 template <> const char *InstX8632Lea::Opcode = "lea";
 template <> const char *InstX8632Movd::Opcode = "movd";
-template <> const char *InstX8632Movss::Opcode = "movss";
 template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
 // Binary ops
 template <> const char *InstX8632Add::Opcode = "add";
@@ -499,6 +498,7 @@ template <> const char *InstX8632Sar::Opcode = "sar";
 template <> const char *InstX8632Psra::Opcode = "psra";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
+template <> const char *InstX8632Movss::Opcode = "movss";
 // Ternary ops
 template <> const char *InstX8632Shufps::Opcode = "shufps";
 template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -552,7 +552,6 @@ typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
 typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
 typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
 typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
-typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
 typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
 typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
@@ -586,6 +585,13 @@ typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
 typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
 typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
 typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
+// TODO: movss is only a binary operation when the source and dest
+// operands are both registers.  In other cases, it behaves like a copy
+// (mov-like) operation.  Eventually, InstX8632Movss should assert that
+// both its source and dest operands are registers, and the lowering
+// code should use _mov instead of _movss in cases where a copy
+// operation is intended.
+typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
 typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
 typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
 typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2165,24 +2165,26 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
    // require aligned memory operands until support for stack alignment
    // is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
-    Operand *T = NULL;
+    Variable *T = NULL;
    if (Index) {
      // The shuffle only needs to occur if the element to be extracted
      // is not at the lowest index.
      Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
      T = makeReg(Ty);
-      _pshufd(llvm::cast<Variable>(T), ALIGN_HACK(SourceVectOperand), Mask);
+      _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
    } else {
-      // TODO(wala): If SourceVectOperand is in memory, express it as
-      // mem32 so that the call to legalizeToVar() is made unnecessary.
-      // _movd and _movss only take mem32 memory operands.
      T = legalizeToVar(SourceVectOperand);
    }
    if (InVectorElementTy == IceType_i32) {
      _movd(ExtractedElement, T);
-    } else { // InVectorElementTy == IceType_f32
+    } else { // InVectorElementTy == IceType_f32
-      // TODO: _mov should be able to be used here.
+      // TODO(wala): _movss is only used here because _mov does not
+      // allow a vector source and a scalar destination.  _mov should be
+      // able to be used here.
+      // _movss is a binary instruction, so the FakeDef is needed to
+      // keep the live range analysis consistent.
+      Context.insert(InstFakeDef::create(Func, ExtractedElement));
      _movss(ExtractedElement, T);
    }
 #undef ALIGN_HACK
@@ -2521,6 +2523,7 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
  // Only constant indices are allowed in PNaCl IR.
  assert(ElementIndex);
  unsigned Index = ElementIndex->getValue();
+  assert(Index < typeNumElements(SourceVectOperand->getType()));
  Type Ty = SourceVectOperand->getType();
  Type ElementTy = typeElementType(Ty);
@@ -2538,7 +2541,8 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
  }
  if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
-    // Lower insertelement with 32-bit wide elements using shufps.
+    // Lower insertelement with 32-bit wide elements using shufps or
+    // movss.
    // TODO(wala): SSE4.1 has pinsrd and insertps.
    Variable *Element = NULL;
    if (InVectorElementTy == IceType_f32) {
@@ -2551,6 +2555,14 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
      _movd(Element, T);
    }
+    if (Index == 0) {
+      Variable *T = makeReg(Ty);
+      _movp(T, SourceVectOperand);
+      _movss(T, Element);
+      _movp(Inst->getDest(), T);
+      return;
+    }
    // shufps treats the source and destination operands as vectors of
    // four doublewords.  The destination's two high doublewords are
    // selected from the source operand and the two low doublewords are
@@ -2560,10 +2572,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
    // Element[0] is being inserted into SourceVectOperand.  Indices are
    // ordered from left to right.
    //
-    // insertelement into index 0 (result is stored in Element):
-    //   Element := Element[0, 0] SourceVectOperand[0, 1]
-    //   Element := Element[0, 3] SourceVectOperand[2, 3]
-    //
    // insertelement into index 1 (result is stored in Element):
    //   Element := Element[0, 0] SourceVectOperand[0, 0]
    //   Element := Element[3, 0] SourceVectOperand[2, 3]
@@ -2577,17 +2585,17 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
    //   T := SourceVectOperand
    //   Element := Element[0, 0] T[0, 2]
    //   T := T[0, 1] Element[3, 0]
-    const unsigned char Mask1[4] = {64, 0, 192, 128};
+    const unsigned char Mask1[3] = {0, 192, 128};
-    const unsigned char Mask2[4] = {236, 227, 196, 52};
+    const unsigned char Mask2[3] = {227, 196, 52};
-    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index]);
+    Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
-    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index]);
+    Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
    // ALIGNHACK: Force vector operands to registers in instructions that
    // require aligned memory operands until support for stack alignment
    // is implemented.
 #define ALIGN_HACK(Vect) legalizeToVar((Vect))
-    if (Index < 2) {
+    if (Index == 1) {
      SourceVectOperand = ALIGN_HACK(SourceVectOperand);
      _shufps(Element, SourceVectOperand, Mask1Constant);
      _shufps(Element, SourceVectOperand, Mask2Constant);

--- a/tests_lit/llvm2ice_tests/vector-ops.ll
+++ b/tests_lit/llvm2ice_tests/vector-ops.ll
@@ -12,20 +12,37 @@
 ; insertelement operations
-define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) {
+define <4 x float> @insertelement_v4f32_0(<4 x float> %vec, float %elt) {
+entry:
+  %res = insertelement <4 x float> %vec, float %elt, i32 0
+  ret <4 x float> %res
+; CHECK-LABEL: insertelement_v4f32_0:
+; CHECK: movss
+}
+define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
+entry:
+  %res = insertelement <4 x i32> %vec, i32 %elt, i32 0
+  ret <4 x i32> %res
+; CHECK-LABEL: insertelement_v4i32_0:
+; CHECK: movss
+}
+define <4 x float> @insertelement_v4f32_1(<4 x float> %vec, float %elt) {
 entry:
  %res = insertelement <4 x float> %vec, float %elt, i32 1
  ret <4 x float> %res
-; CHECK-LABEL: insertelement_v4f32:
+; CHECK-LABEL: insertelement_v4f32_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }
-define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) {
+define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
 entry:
  %res = insertelement <4 x i32> %vec, i32 %elt, i32 1
  ret <4 x i32> %res
-; CHECK-LABEL: insertelement_v4i32:
+; CHECK-LABEL: insertelement_v4i32_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }
@@ -50,12 +67,21 @@ entry:
 ; CHECK: mov
 }
-define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) {
+define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
+entry:
+  %elt = trunc i32 %elt.arg to i1
+  %res = insertelement <4 x i1> %vec, i1 %elt, i32 0
+  ret <4 x i1> %res
+; CHECK-LABEL: insertelement_v4i1_0:
+; CHECK: movss
+}
+define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
 entry:
  %elt = trunc i32 %elt.arg to i1
  %res = insertelement <4 x i1> %vec, i1 %elt, i32 1
  ret <4 x i1> %res
-; CHECK-LABEL: insertelement_v4i1:
+; CHECK-LABEL: insertelement_v4i1_1:
 ; CHECK: shufps
 ; CHECK: shufps
 }