Commit 71c69373, authored and committed by Nicolas Capens

Optimize lowering of x86 byte and word vector unpack.

BUG=swiftshader:15
Change-Id: Id0d3bed46d00336fc31501c41a26ebe2d4ddd697
Reviewed-on: https://chromium-review.googlesource.com/392626
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Tested-by: Nicolas Capens <nicolascapens@google.com>
parent acfb3df0
...@@ -435,8 +435,8 @@ public: ...@@ -435,8 +435,8 @@ public:
void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask); void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void pshufd(Type Ty, XmmRegister dst, const Address &src, void pshufd(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask); const Immediate &mask);
void punpckldq(Type, XmmRegister Dst, XmmRegister Src); void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
void punpckldq(Type, XmmRegister Dst, const Address &Src); void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask); void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void shufps(Type Ty, XmmRegister dst, const Address &src, void shufps(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask); const Immediate &mask);
......
...@@ -1589,25 +1589,41 @@ void AssemblerX86Base<TraitsType>::pshufd(Type /* Ty */, XmmRegister dst, ...@@ -1589,25 +1589,41 @@ void AssemblerX86Base<TraitsType>::pshufd(Type /* Ty */, XmmRegister dst,
} }
template <typename TraitsType> template <typename TraitsType>
void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst, void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
XmmRegister Src) { XmmRegister Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer); AssemblerBuffer::EnsureCapacity ensured(&Buffer);
emitUint8(0x66); emitUint8(0x66);
emitRexRB(RexTypeIrrelevant, Dst, Src); emitRexRB(RexTypeIrrelevant, Dst, Src);
emitUint8(0x0F); emitUint8(0x0F);
emitUint8(0x62); if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
emitUint8(0x62);
} else if (Ty == IceType_v8i16) {
emitUint8(0x61);
} else if (Ty == IceType_v16i8) {
emitUint8(0x60);
} else {
assert(false && "Unexpected vector unpack operand type");
}
emitXmmRegisterOperand(Dst, Src); emitXmmRegisterOperand(Dst, Src);
} }
template <typename TraitsType> template <typename TraitsType>
void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst, void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
const Address &Src) { const Address &Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer); AssemblerBuffer::EnsureCapacity ensured(&Buffer);
emitUint8(0x66); emitUint8(0x66);
emitAddrSizeOverridePrefix(); emitAddrSizeOverridePrefix();
emitRex(RexTypeIrrelevant, Src, Dst); emitRex(RexTypeIrrelevant, Src, Dst);
emitUint8(0x0F); emitUint8(0x0F);
emitUint8(0x62); if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
emitUint8(0x62);
} else if (Ty == IceType_v8i16) {
emitUint8(0x61);
} else if (Ty == IceType_v16i8) {
emitUint8(0x60);
} else {
assert(false && "Unexpected vector unpack operand type");
}
emitOperand(gprEncoding(Dst), Src); emitOperand(gprEncoding(Dst), Src);
} }
......
...@@ -3658,8 +3658,8 @@ template <typename TraitsType> struct Insts { ...@@ -3658,8 +3658,8 @@ template <typename TraitsType> struct Insts {
template <> \ template <> \
const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \ const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \
InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = { \ InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = { \
&InstImpl<TraitsType>::Assembler::punpckldq, \ &InstImpl<TraitsType>::Assembler::punpckl, \
&InstImpl<TraitsType>::Assembler::punpckldq}; \ &InstImpl<TraitsType>::Assembler::punpckl}; \
} \ } \
} }
......
...@@ -5978,10 +5978,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -5978,10 +5978,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
llvm::report_fatal_error("Unexpected vector type."); llvm::report_fatal_error("Unexpected vector type.");
case IceType_v16i1: case IceType_v16i1:
case IceType_v16i8: { case IceType_v16i8: {
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
static constexpr SizeT ExpectedNumElements = 16; static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes()); assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
...@@ -6001,6 +5997,25 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6001,6 +5997,25 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
const SizeT Index13 = Instr->getIndex(13)->getValue(); const SizeT Index13 = Instr->getIndex(13)->getValue();
const SizeT Index14 = Instr->getIndex(14)->getValue(); const SizeT Index14 = Instr->getIndex(14)->getValue();
const SizeT Index15 = Instr->getIndex(15)->getValue(); const SizeT Index15 = Instr->getIndex(15)->getValue();
if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
auto *T = makeReg(DestTy);
auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
_movp(T, Src0RM);
_punpckl(T, Src1RM);
_movp(Dest, T);
return;
}
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7, Index3, Index4, Index5, Index6, Index7,
Index8, Index9, Index10, Index11, Index12, Index8, Index9, Index10, Index11, Index12,
...@@ -6009,10 +6024,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6009,10 +6024,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
} }
case IceType_v8i1: case IceType_v8i1:
case IceType_v8i16: { case IceType_v8i16: {
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
static constexpr SizeT ExpectedNumElements = 8; static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes()); assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
...@@ -6024,6 +6035,23 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6024,6 +6035,23 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
const SizeT Index5 = Instr->getIndex(5)->getValue(); const SizeT Index5 = Instr->getIndex(5)->getValue();
const SizeT Index6 = Instr->getIndex(6)->getValue(); const SizeT Index6 = Instr->getIndex(6)->getValue();
const SizeT Index7 = Instr->getIndex(7)->getValue(); const SizeT Index7 = Instr->getIndex(7)->getValue();
if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
auto *T = makeReg(DestTy);
auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
_movp(T, Src0RM);
_punpckl(T, Src1RM);
_movp(Dest, T);
return;
}
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
#define TO_BYTE_INDEX(I) ((I) << 1) #define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb( lowerShuffleVector_UsingPshufb(
Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1, Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
......
...@@ -1017,57 +1017,76 @@ TEST_F(AssemblerX8632Test, Shufp) { ...@@ -1017,57 +1017,76 @@ TEST_F(AssemblerX8632Test, Shufp) {
#undef TestImplSingleXmmXmm #undef TestImplSingleXmmXmm
} }
TEST_F(AssemblerX8632Test, Punpckldq) { TEST_F(AssemblerX8632Test, Punpckl) {
const Dqword V0(uint64_t(0x1111111122222222ull), const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull)); uint64_t(0x5555555577777777ull));
const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull), const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull)); uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull), uint64_t(0xAAAAAAAA11111111ull));
uint64_t(0xAAAAAAAA11111111ull));
const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
#define TestImplXmmXmm(Dst, Src, Inst) \ uint64_t(0x5555666677778888ull));
const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
uint64_t(0xEEEEFFFF00009999ull));
const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
uint64_t(0xAAAA1111BBBB2222ull));
const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
uint64_t(0x99AABBCCDDEEFF00ull));
const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
uint64_t(0xBAADF00DFEEDFACEull));
const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
uint64_t(0xFF11EE22DD33CC44ull));
#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \ do { \
static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \ static constexpr char TestString[] = \
"(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \ const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \ const uint32_t T1 = allocateDqword(); \
\ \
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \ __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \ __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \ XmmRegister::Encoded_Reg_##Src); \
\ \
AssembledTest test = assemble(); \ AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \ test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1); \ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \ test.run(); \
\ \
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \ reset(); \
} while (0) } while (0)
#define TestImplXmmAddr(Dst, Inst) \ #define TestImplXmmAddr(Dst, Inst, Ty) \
do { \ do { \
static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \ static constexpr char TestString[] = \
"(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \ const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \ const uint32_t T1 = allocateDqword(); \
\ \
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \ __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\ \
AssembledTest test = assemble(); \ AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \ test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1); \ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \ test.run(); \
\ \
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \ reset(); \
} while (0) } while (0)
#define TestImpl(Dst, Src) \ #define TestImpl(Dst, Src) \
do { \ do { \
TestImplXmmXmm(Dst, Src, punpckldq); \ TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
TestImplXmmAddr(Dst, punpckldq); \ TestImplXmmAddr(Dst, punpckl, v4i32); \
TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
TestImplXmmAddr(Dst, punpckl, v8i16); \
TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0) } while (0)
TestImpl(xmm0, xmm1); TestImpl(xmm0, xmm1);
......
...@@ -1083,57 +1083,76 @@ TEST_F(AssemblerX8664Test, Shufp) { ...@@ -1083,57 +1083,76 @@ TEST_F(AssemblerX8664Test, Shufp) {
#undef TestImplSingleXmmXmm #undef TestImplSingleXmmXmm
} }
TEST_F(AssemblerX8664Test, Punpckldq) { TEST_F(AssemblerX8664Test, Punpckl) {
const Dqword V0(uint64_t(0x1111111122222222ull), const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull)); uint64_t(0x5555555577777777ull));
const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull), const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull)); uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull), uint64_t(0xAAAAAAAA11111111ull));
uint64_t(0xAAAAAAAA11111111ull));
const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
#define TestImplXmmXmm(Dst, Src, Inst) \ uint64_t(0x5555666677778888ull));
const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
uint64_t(0xEEEEFFFF00009999ull));
const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
uint64_t(0xAAAA1111BBBB2222ull));
const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
uint64_t(0x99AABBCCDDEEFF00ull));
const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
uint64_t(0xBAADF00DFEEDFACEull));
const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
uint64_t(0xFF11EE22DD33CC44ull));
#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \ do { \
static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \ static constexpr char TestString[] = \
"(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \ const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \ const uint32_t T1 = allocateDqword(); \
\ \
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \ __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \ __ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \ XmmRegister::Encoded_Reg_##Src); \
\ \
AssembledTest test = assemble(); \ AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \ test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1); \ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \ test.run(); \
\ \
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \ reset(); \
} while (0) } while (0)
#define TestImplXmmAddr(Dst, Inst) \ #define TestImplXmmAddr(Dst, Inst, Ty) \
do { \ do { \
static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \ static constexpr char TestString[] = \
"(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \ const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \ const uint32_t T1 = allocateDqword(); \
\ \
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \ __ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \ __ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\ \
AssembledTest test = assemble(); \ AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \ test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1); \ test.setDqwordTo(T1, V1_##Ty); \
test.run(); \ test.run(); \
\ \
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \ ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \ reset(); \
} while (0) } while (0)
#define TestImpl(Dst, Src) \ #define TestImpl(Dst, Src) \
do { \ do { \
TestImplXmmXmm(Dst, Src, punpckldq); \ TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
TestImplXmmAddr(Dst, punpckldq); \ TestImplXmmAddr(Dst, punpckl, v4i32); \
TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
TestImplXmmAddr(Dst, punpckl, v8i16); \
TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0) } while (0)
TestImpl(xmm0, xmm1); TestImpl(xmm0, xmm1);
...@@ -1143,15 +1162,7 @@ TEST_F(AssemblerX8664Test, Punpckldq) { ...@@ -1143,15 +1162,7 @@ TEST_F(AssemblerX8664Test, Punpckldq) {
TestImpl(xmm4, xmm5); TestImpl(xmm4, xmm5);
TestImpl(xmm5, xmm6); TestImpl(xmm5, xmm6);
TestImpl(xmm6, xmm7); TestImpl(xmm6, xmm7);
TestImpl(xmm7, xmm8); TestImpl(xmm7, xmm0);
TestImpl(xmm8, xmm9);
TestImpl(xmm9, xmm10);
TestImpl(xmm10, xmm11);
TestImpl(xmm11, xmm12);
TestImpl(xmm12, xmm13);
TestImpl(xmm13, xmm14);
TestImpl(xmm14, xmm15);
TestImpl(xmm15, xmm0);
#undef TestImpl #undef TestImpl
#undef TestImplXmmAddr #undef TestImplXmmAddr
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment