Commit cfe5146f by Matt Wala

Use movss to implement insertelement when elements = 4 and index = 0.

This avoids the pair of shufps instructions that the previous lowering used. Instead, movss copies the element to be inserted into the lower 32 bits of the destination. InstX8632Movss is now defined as a Binop, the class to which it properly belongs. BUG=none R=jvoung@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/412353005
parent ce0ca8f8
...@@ -462,7 +462,6 @@ template <> const char *InstX8632Bsf::Opcode = "bsf"; ...@@ -462,7 +462,6 @@ template <> const char *InstX8632Bsf::Opcode = "bsf";
template <> const char *InstX8632Bsr::Opcode = "bsr"; template <> const char *InstX8632Bsr::Opcode = "bsr";
template <> const char *InstX8632Lea::Opcode = "lea"; template <> const char *InstX8632Lea::Opcode = "lea";
template <> const char *InstX8632Movd::Opcode = "movd"; template <> const char *InstX8632Movd::Opcode = "movd";
template <> const char *InstX8632Movss::Opcode = "movss";
template <> const char *InstX8632Sqrtss::Opcode = "sqrtss"; template <> const char *InstX8632Sqrtss::Opcode = "sqrtss";
// Binary ops // Binary ops
template <> const char *InstX8632Add::Opcode = "add"; template <> const char *InstX8632Add::Opcode = "add";
...@@ -499,6 +498,7 @@ template <> const char *InstX8632Sar::Opcode = "sar"; ...@@ -499,6 +498,7 @@ template <> const char *InstX8632Sar::Opcode = "sar";
template <> const char *InstX8632Psra::Opcode = "psra"; template <> const char *InstX8632Psra::Opcode = "psra";
template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq"; template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt"; template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
template <> const char *InstX8632Movss::Opcode = "movss";
// Ternary ops // Ternary ops
template <> const char *InstX8632Shufps::Opcode = "shufps"; template <> const char *InstX8632Shufps::Opcode = "shufps";
template <> const char *InstX8632Pinsrw::Opcode = "pinsrw"; template <> const char *InstX8632Pinsrw::Opcode = "pinsrw";
......
...@@ -552,7 +552,6 @@ typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf; ...@@ -552,7 +552,6 @@ typedef InstX8632Unaryop<InstX8632::Bsf> InstX8632Bsf;
typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr; typedef InstX8632Unaryop<InstX8632::Bsr> InstX8632Bsr;
typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea; typedef InstX8632Unaryop<InstX8632::Lea> InstX8632Lea;
typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd; typedef InstX8632Unaryop<InstX8632::Movd> InstX8632Movd;
typedef InstX8632Unaryop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss; typedef InstX8632Unaryop<InstX8632::Sqrtss> InstX8632Sqrtss;
typedef InstX8632Binop<InstX8632::Add> InstX8632Add; typedef InstX8632Binop<InstX8632::Add> InstX8632Add;
typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps; typedef InstX8632Binop<InstX8632::Addps> InstX8632Addps;
...@@ -586,6 +585,13 @@ typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar; ...@@ -586,6 +585,13 @@ typedef InstX8632Binop<InstX8632::Sar, true> InstX8632Sar;
typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra; typedef InstX8632Binop<InstX8632::Psra> InstX8632Psra;
typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq; typedef InstX8632Binop<InstX8632::Pcmpeq> InstX8632Pcmpeq;
typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt; typedef InstX8632Binop<InstX8632::Pcmpgt> InstX8632Pcmpgt;
// TODO: movss is only a binary operation when the source and dest
// operands are both registers. In other cases, it behaves like a copy
// (mov-like) operation. Eventually, InstX8632Movss should assert that
// both its source and dest operands are registers, and the lowering
// code should use _mov instead of _movss in cases where a copy
// operation is intended.
typedef InstX8632Binop<InstX8632::Movss> InstX8632Movss;
typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv; typedef InstX8632Ternop<InstX8632::Idiv> InstX8632Idiv;
typedef InstX8632Ternop<InstX8632::Div> InstX8632Div; typedef InstX8632Ternop<InstX8632::Div> InstX8632Div;
typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw; typedef InstX8632Ternop<InstX8632::Pinsrw> InstX8632Pinsrw;
......
...@@ -2165,24 +2165,26 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) { ...@@ -2165,24 +2165,26 @@ void TargetX8632::lowerExtractElement(const InstExtractElement *Inst) {
// require aligned memory operands until support for stack alignment // require aligned memory operands until support for stack alignment
// is implemented. // is implemented.
#define ALIGN_HACK(Vect) legalizeToVar((Vect)) #define ALIGN_HACK(Vect) legalizeToVar((Vect))
Operand *T = NULL; Variable *T = NULL;
if (Index) { if (Index) {
// The shuffle only needs to occur if the element to be extracted // The shuffle only needs to occur if the element to be extracted
// is not at the lowest index. // is not at the lowest index.
Constant *Mask = Ctx->getConstantInt(IceType_i8, Index); Constant *Mask = Ctx->getConstantInt(IceType_i8, Index);
T = makeReg(Ty); T = makeReg(Ty);
_pshufd(llvm::cast<Variable>(T), ALIGN_HACK(SourceVectOperand), Mask); _pshufd(T, ALIGN_HACK(SourceVectOperand), Mask);
} else { } else {
// TODO(wala): If SourceVectOperand is in memory, express it as
// mem32 so that the call to legalizeToVar() is made unnecessary.
// _movd and _movss only take mem32 memory operands.
T = legalizeToVar(SourceVectOperand); T = legalizeToVar(SourceVectOperand);
} }
if (InVectorElementTy == IceType_i32) { if (InVectorElementTy == IceType_i32) {
_movd(ExtractedElement, T); _movd(ExtractedElement, T);
} else { // InVectorElementTy == IceType_f32 } else { // InVectorElementTy == IceType_f32
// TODO: _mov should be able to be used here. // TODO(wala): _movss is only used here because _mov does not
// allow a vector source and a scalar destination. _mov should be
// able to be used here.
// _movss is a binary instruction, so the FakeDef is needed to
// keep the live range analysis consistent.
Context.insert(InstFakeDef::create(Func, ExtractedElement));
_movss(ExtractedElement, T); _movss(ExtractedElement, T);
} }
#undef ALIGN_HACK #undef ALIGN_HACK
...@@ -2521,6 +2523,7 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2521,6 +2523,7 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
// Only constant indices are allowed in PNaCl IR. // Only constant indices are allowed in PNaCl IR.
assert(ElementIndex); assert(ElementIndex);
unsigned Index = ElementIndex->getValue(); unsigned Index = ElementIndex->getValue();
assert(Index < typeNumElements(SourceVectOperand->getType()));
Type Ty = SourceVectOperand->getType(); Type Ty = SourceVectOperand->getType();
Type ElementTy = typeElementType(Ty); Type ElementTy = typeElementType(Ty);
...@@ -2538,7 +2541,8 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2538,7 +2541,8 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
} }
if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) { if (Ty == IceType_v4i32 || Ty == IceType_v4f32 || Ty == IceType_v4i1) {
// Lower insertelement with 32-bit wide elements using shufps. // Lower insertelement with 32-bit wide elements using shufps or
// movss.
// TODO(wala): SSE4.1 has pinsrd and insertps. // TODO(wala): SSE4.1 has pinsrd and insertps.
Variable *Element = NULL; Variable *Element = NULL;
if (InVectorElementTy == IceType_f32) { if (InVectorElementTy == IceType_f32) {
...@@ -2551,6 +2555,14 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2551,6 +2555,14 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
_movd(Element, T); _movd(Element, T);
} }
if (Index == 0) {
Variable *T = makeReg(Ty);
_movp(T, SourceVectOperand);
_movss(T, Element);
_movp(Inst->getDest(), T);
return;
}
// shufps treats the source and destination operands as vectors of // shufps treats the source and destination operands as vectors of
// four doublewords. The destination's two high doublewords are // four doublewords. The destination's two high doublewords are
// selected from the source operand and the two low doublewords are // selected from the source operand and the two low doublewords are
...@@ -2560,10 +2572,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2560,10 +2572,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
// Element[0] is being inserted into SourceVectOperand. Indices are // Element[0] is being inserted into SourceVectOperand. Indices are
// ordered from left to right. // ordered from left to right.
// //
// insertelement into index 0 (result is stored in Element):
// Element := Element[0, 0] SourceVectOperand[0, 1]
// Element := Element[0, 3] SourceVectOperand[2, 3]
//
// insertelement into index 1 (result is stored in Element): // insertelement into index 1 (result is stored in Element):
// Element := Element[0, 0] SourceVectOperand[0, 0] // Element := Element[0, 0] SourceVectOperand[0, 0]
// Element := Element[3, 0] SourceVectOperand[2, 3] // Element := Element[3, 0] SourceVectOperand[2, 3]
...@@ -2577,17 +2585,17 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2577,17 +2585,17 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
// T := SourceVectOperand // T := SourceVectOperand
// Element := Element[0, 0] T[0, 2] // Element := Element[0, 0] T[0, 2]
// T := T[0, 1] Element[3, 0] // T := T[0, 1] Element[3, 0]
const unsigned char Mask1[4] = {64, 0, 192, 128}; const unsigned char Mask1[3] = {0, 192, 128};
const unsigned char Mask2[4] = {236, 227, 196, 52}; const unsigned char Mask2[3] = {227, 196, 52};
Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index]); Constant *Mask1Constant = Ctx->getConstantInt(IceType_i8, Mask1[Index - 1]);
Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index]); Constant *Mask2Constant = Ctx->getConstantInt(IceType_i8, Mask2[Index - 1]);
// ALIGNHACK: Force vector operands to registers in instructions that // ALIGNHACK: Force vector operands to registers in instructions that
// require aligned memory operands until support for stack alignment // require aligned memory operands until support for stack alignment
// is implemented. // is implemented.
#define ALIGN_HACK(Vect) legalizeToVar((Vect)) #define ALIGN_HACK(Vect) legalizeToVar((Vect))
if (Index < 2) { if (Index == 1) {
SourceVectOperand = ALIGN_HACK(SourceVectOperand); SourceVectOperand = ALIGN_HACK(SourceVectOperand);
_shufps(Element, SourceVectOperand, Mask1Constant); _shufps(Element, SourceVectOperand, Mask1Constant);
_shufps(Element, SourceVectOperand, Mask2Constant); _shufps(Element, SourceVectOperand, Mask2Constant);
......
...@@ -12,20 +12,37 @@ ...@@ -12,20 +12,37 @@
; insertelement operations ; insertelement operations
define <4 x float> @insertelement_v4f32(<4 x float> %vec, float %elt) { define <4 x float> @insertelement_v4f32_0(<4 x float> %vec, float %elt) {
entry:
%res = insertelement <4 x float> %vec, float %elt, i32 0
ret <4 x float> %res
; CHECK-LABEL: insertelement_v4f32_0:
; CHECK: movss
}
define <4 x i32> @insertelement_v4i32_0(<4 x i32> %vec, i32 %elt) {
entry:
%res = insertelement <4 x i32> %vec, i32 %elt, i32 0
ret <4 x i32> %res
; CHECK-LABEL: insertelement_v4i32_0:
; CHECK: movss
}
define <4 x float> @insertelement_v4f32_1(<4 x float> %vec, float %elt) {
entry: entry:
%res = insertelement <4 x float> %vec, float %elt, i32 1 %res = insertelement <4 x float> %vec, float %elt, i32 1
ret <4 x float> %res ret <4 x float> %res
; CHECK-LABEL: insertelement_v4f32: ; CHECK-LABEL: insertelement_v4f32_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
} }
define <4 x i32> @insertelement_v4i32(<4 x i32> %vec, i32 %elt) { define <4 x i32> @insertelement_v4i32_1(<4 x i32> %vec, i32 %elt) {
entry: entry:
%res = insertelement <4 x i32> %vec, i32 %elt, i32 1 %res = insertelement <4 x i32> %vec, i32 %elt, i32 1
ret <4 x i32> %res ret <4 x i32> %res
; CHECK-LABEL: insertelement_v4i32: ; CHECK-LABEL: insertelement_v4i32_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
} }
...@@ -50,12 +67,21 @@ entry: ...@@ -50,12 +67,21 @@ entry:
; CHECK: mov ; CHECK: mov
} }
define <4 x i1> @insertelement_v4i1(<4 x i1> %vec, i32 %elt.arg) { define <4 x i1> @insertelement_v4i1_0(<4 x i1> %vec, i32 %elt.arg) {
entry:
%elt = trunc i32 %elt.arg to i1
%res = insertelement <4 x i1> %vec, i1 %elt, i32 0
ret <4 x i1> %res
; CHECK-LABEL: insertelement_v4i1_0:
; CHECK: movss
}
define <4 x i1> @insertelement_v4i1_1(<4 x i1> %vec, i32 %elt.arg) {
entry: entry:
%elt = trunc i32 %elt.arg to i1 %elt = trunc i32 %elt.arg to i1
%res = insertelement <4 x i1> %vec, i1 %elt, i32 1 %res = insertelement <4 x i1> %vec, i1 %elt, i32 1
ret <4 x i1> %res ret <4 x i1> %res
; CHECK-LABEL: insertelement_v4i1: ; CHECK-LABEL: insertelement_v4i1_1:
; CHECK: shufps ; CHECK: shufps
; CHECK: shufps ; CHECK: shufps
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment