Commit 71c69373 by Nicolas Capens Committed by Nicolas Capens

Optimize lowering of x86 byte and word vector unpack.

BUG=swiftshader:15
Change-Id: Id0d3bed46d00336fc31501c41a26ebe2d4ddd697
Reviewed-on: https://chromium-review.googlesource.com/392626
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Tested-by: Nicolas Capens <nicolascapens@google.com>
parent acfb3df0
......@@ -435,8 +435,8 @@ public:
void pshufd(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void pshufd(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask);
void punpckldq(Type, XmmRegister Dst, XmmRegister Src);
void punpckldq(Type, XmmRegister Dst, const Address &Src);
// Low-half vector unpack: interleaves the low elements of Dst and Src into
// Dst. Ty selects the element width; the definitions accept IceType_v16i8,
// IceType_v8i16, IceType_v4i32 and IceType_v4f32 and assert on anything else.
void punpckl(Type Ty, XmmRegister Dst, XmmRegister Src);
void punpckl(Type Ty, XmmRegister Dst, const Address &Src);
void shufps(Type Ty, XmmRegister dst, XmmRegister src, const Immediate &mask);
void shufps(Type Ty, XmmRegister dst, const Address &src,
const Immediate &mask);
......
......@@ -1589,25 +1589,41 @@ void AssemblerX86Base<TraitsType>::pshufd(Type /* Ty */, XmmRegister dst,
}
// Emits the register/register form of the low-half vector unpack
// (interleave) of Dst with Src.
//
// Ty selects the second opcode byte that follows the 0x0F escape:
//   IceType_v4i32 / IceType_v4f32 -> 0x62 (PUNPCKLDQ, 32-bit elements)
//   IceType_v8i16                 -> 0x61 (PUNPCKLWD, 16-bit elements)
//   IceType_v16i8                 -> 0x60 (PUNPCKLBW, 8-bit elements)
// Any other type trips the assert; if asserts are compiled out, no opcode
// byte is emitted for it.
//
// NOTE(review): this listing is a diff rendered without +/- markers. The
// punpckldq signature and the unconditional emitUint8(0x62) below appear to
// be the removed pre-change lines, not part of the new function -- confirm
// against the actual file.
template <typename TraitsType>
void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
XmmRegister Src) {
void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
XmmRegister Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer);
// Byte order: 0x66 prefix, REX handling for Dst/Src, then the 0x0F escape.
emitUint8(0x66);
emitRexRB(RexTypeIrrelevant, Dst, Src);
emitUint8(0x0F);
emitUint8(0x62);
// Pick the opcode byte from the vector element type.
if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
emitUint8(0x62);
} else if (Ty == IceType_v8i16) {
emitUint8(0x61);
} else if (Ty == IceType_v16i8) {
emitUint8(0x60);
} else {
assert(false && "Unexpected vector unpack operand type");
}
// Operand encoding for the Dst/Src register pair.
emitXmmRegisterOperand(Dst, Src);
}
// Emits the register/memory form of the low-half vector unpack (interleave)
// of Dst with the operand at address Src.
//
// Ty selects the second opcode byte that follows the 0x0F escape:
//   IceType_v4i32 / IceType_v4f32 -> 0x62 (PUNPCKLDQ, 32-bit elements)
//   IceType_v8i16                 -> 0x61 (PUNPCKLWD, 16-bit elements)
//   IceType_v16i8                 -> 0x60 (PUNPCKLBW, 8-bit elements)
// Any other type trips the assert; if asserts are compiled out, no opcode
// byte is emitted for it.
//
// NOTE(review): this listing is a diff rendered without +/- markers. The
// punpckldq signature and the unconditional emitUint8(0x62) below appear to
// be the removed pre-change lines, not part of the new function -- confirm
// against the actual file.
template <typename TraitsType>
void AssemblerX86Base<TraitsType>::punpckldq(Type, XmmRegister Dst,
const Address &Src) {
void AssemblerX86Base<TraitsType>::punpckl(Type Ty, XmmRegister Dst,
const Address &Src) {
AssemblerBuffer::EnsureCapacity ensured(&Buffer);
// Byte order: 0x66 prefix, address-size override handling, REX handling for
// the address and Dst, then the 0x0F escape.
emitUint8(0x66);
emitAddrSizeOverridePrefix();
emitRex(RexTypeIrrelevant, Src, Dst);
emitUint8(0x0F);
emitUint8(0x62);
// Pick the opcode byte from the vector element type.
if (Ty == IceType_v4i32 || Ty == IceType_v4f32) {
emitUint8(0x62);
} else if (Ty == IceType_v8i16) {
emitUint8(0x61);
} else if (Ty == IceType_v16i8) {
emitUint8(0x60);
} else {
assert(false && "Unexpected vector unpack operand type");
}
// Operand encoding: Dst's register encoding paired with the Src address.
emitOperand(gprEncoding(Dst), Src);
}
......
......@@ -3658,8 +3658,8 @@ template <typename TraitsType> struct Insts {
template <> \
const InstImpl<TraitsType>::Assembler::XmmEmitterRegOp \
InstImpl<TraitsType>::InstX86Punpckl::Base::Emitter = { \
&InstImpl<TraitsType>::Assembler::punpckldq, \
&InstImpl<TraitsType>::Assembler::punpckldq}; \
&InstImpl<TraitsType>::Assembler::punpckl, \
&InstImpl<TraitsType>::Assembler::punpckl}; \
} \
}
......
......@@ -5978,10 +5978,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
llvm::report_fatal_error("Unexpected vector type.");
case IceType_v16i1:
case IceType_v16i8: {
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
......@@ -6001,6 +5997,25 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
const SizeT Index13 = Instr->getIndex(13)->getValue();
const SizeT Index14 = Instr->getIndex(14)->getValue();
const SizeT Index15 = Instr->getIndex(15)->getValue();
if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3 &&
Index8 == 4 && Index9 == 4 && Index10 == 5 && Index11 == 5 &&
Index12 == 6 && Index13 == 6 && Index14 == 7 && Index15 == 7) {
auto *T = makeReg(DestTy);
auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
_movp(T, Src0RM);
_punpckl(T, Src1RM);
_movp(Dest, T);
return;
}
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7,
Index8, Index9, Index10, Index11, Index12,
......@@ -6009,10 +6024,6 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
}
case IceType_v8i1:
case IceType_v8i16: {
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
......@@ -6024,6 +6035,23 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
const SizeT Index5 = Instr->getIndex(5)->getValue();
const SizeT Index6 = Instr->getIndex(6)->getValue();
const SizeT Index7 = Instr->getIndex(7)->getValue();
if (Index0 == 0 && Index1 == 0 && Index2 == 1 && Index3 == 1 &&
Index4 == 2 && Index5 == 2 && Index6 == 3 && Index7 == 3) {
auto *T = makeReg(DestTy);
auto *Src0RM = legalize(Src0, Legal_Reg | Legal_Mem);
auto *Src1RM = legalize(Src1, Legal_Reg | Legal_Mem);
_movp(T, Src0RM);
_punpckl(T, Src1RM);
_movp(Dest, T);
return;
}
if (InstructionSet < Traits::SSE4_1) {
// TODO(jpp): figure out how to lower with sse2.
break;
}
#define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb(
Dest, Src0, Src1, TO_BYTE_INDEX(Index0), TO_BYTE_INDEX(Index0) + 1,
......
......@@ -1017,57 +1017,76 @@ TEST_F(AssemblerX8632Test, Shufp) {
#undef TestImplSingleXmmXmm
}
TEST_F(AssemblerX8632Test, Punpckldq) {
const Dqword V0(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull));
const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
uint64_t(0xAAAAAAAA11111111ull));
#define TestImplXmmXmm(Dst, Src, Inst) \
TEST_F(AssemblerX8632Test, Punpckl) {
const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull));
const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
uint64_t(0xAAAAAAAA11111111ull));
const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
uint64_t(0x5555666677778888ull));
const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
uint64_t(0xEEEEFFFF00009999ull));
const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
uint64_t(0xAAAA1111BBBB2222ull));
const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
uint64_t(0x99AABBCCDDEEFF00ull));
const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
uint64_t(0xBAADF00DFEEDFACEull));
const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
uint64_t(0xFF11EE22DD33CC44ull));
#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \
static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \
static constexpr char TestString[] = \
"(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \
__ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \
\
AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \
test.setDqwordTo(T1, V1); \
test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImplXmmAddr(Dst, Inst) \
#define TestImplXmmAddr(Dst, Inst, Ty) \
do { \
static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \
static constexpr char TestString[] = \
"(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
__ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\
AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \
test.setDqwordTo(T1, V1); \
test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImpl(Dst, Src) \
do { \
TestImplXmmXmm(Dst, Src, punpckldq); \
TestImplXmmAddr(Dst, punpckldq); \
TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
TestImplXmmAddr(Dst, punpckl, v4i32); \
TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
TestImplXmmAddr(Dst, punpckl, v8i16); \
TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0)
TestImpl(xmm0, xmm1);
......
......@@ -1083,57 +1083,76 @@ TEST_F(AssemblerX8664Test, Shufp) {
#undef TestImplSingleXmmXmm
}
TEST_F(AssemblerX8664Test, Punpckldq) {
const Dqword V0(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull));
const Dqword V1(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected(uint64_t(0xBBBBBBBB22222222ull),
uint64_t(0xAAAAAAAA11111111ull));
#define TestImplXmmXmm(Dst, Src, Inst) \
TEST_F(AssemblerX8664Test, Punpckl) {
const Dqword V0_v4i32(uint64_t(0x1111111122222222ull),
uint64_t(0x5555555577777777ull));
const Dqword V1_v4i32(uint64_t(0xAAAAAAAABBBBBBBBull),
uint64_t(0xCCCCCCCCDDDDDDDDull));
const Dqword Expected_v4i32(uint64_t(0xBBBBBBBB22222222ull),
uint64_t(0xAAAAAAAA11111111ull));
const Dqword V0_v8i16(uint64_t(0x1111222233334444ull),
uint64_t(0x5555666677778888ull));
const Dqword V1_v8i16(uint64_t(0xAAAABBBBCCCCDDDDull),
uint64_t(0xEEEEFFFF00009999ull));
const Dqword Expected_v8i16(uint64_t(0xCCCC3333DDDD4444ull),
uint64_t(0xAAAA1111BBBB2222ull));
const Dqword V0_v16i8(uint64_t(0x1122334455667788ull),
uint64_t(0x99AABBCCDDEEFF00ull));
const Dqword V1_v16i8(uint64_t(0xFFEEDDCCBBAA9900ull),
uint64_t(0xBAADF00DFEEDFACEull));
const Dqword Expected_v16i8(uint64_t(0xBB55AA6699770088ull),
uint64_t(0xFF11EE22DD33CC44ull));
#define TestImplXmmXmm(Dst, Src, Inst, Ty) \
do { \
static constexpr char TestString[] = "(" #Dst ", " #Src ", " #Inst ")"; \
static constexpr char TestString[] = \
"(" #Dst ", " #Src ", " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ movups(XmmRegister::Encoded_Reg_##Src, dwordAddress(T1)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, \
__ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, \
XmmRegister::Encoded_Reg_##Src); \
\
AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \
test.setDqwordTo(T1, V1); \
test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImplXmmAddr(Dst, Inst) \
#define TestImplXmmAddr(Dst, Inst, Ty) \
do { \
static constexpr char TestString[] = "(" #Dst ", Addr, " #Inst ")"; \
static constexpr char TestString[] = \
"(" #Dst ", Addr, " #Inst ", " #Ty ")"; \
const uint32_t T0 = allocateDqword(); \
const uint32_t T1 = allocateDqword(); \
\
__ movups(XmmRegister::Encoded_Reg_##Dst, dwordAddress(T0)); \
__ Inst(IceType_void, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
__ Inst(IceType_##Ty, XmmRegister::Encoded_Reg_##Dst, dwordAddress(T1)); \
\
AssembledTest test = assemble(); \
test.setDqwordTo(T0, V0); \
test.setDqwordTo(T1, V1); \
test.setDqwordTo(T0, V0_##Ty); \
test.setDqwordTo(T1, V1_##Ty); \
test.run(); \
\
ASSERT_EQ(Expected, test.Dst<Dqword>()) << TestString; \
ASSERT_EQ(Expected_##Ty, test.Dst<Dqword>()) << TestString; \
reset(); \
} while (0)
#define TestImpl(Dst, Src) \
do { \
TestImplXmmXmm(Dst, Src, punpckldq); \
TestImplXmmAddr(Dst, punpckldq); \
TestImplXmmXmm(Dst, Src, punpckl, v4i32); \
TestImplXmmAddr(Dst, punpckl, v4i32); \
TestImplXmmXmm(Dst, Src, punpckl, v8i16); \
TestImplXmmAddr(Dst, punpckl, v8i16); \
TestImplXmmXmm(Dst, Src, punpckl, v16i8); \
TestImplXmmAddr(Dst, punpckl, v16i8); \
} while (0)
TestImpl(xmm0, xmm1);
......@@ -1143,15 +1162,7 @@ TEST_F(AssemblerX8664Test, Punpckldq) {
TestImpl(xmm4, xmm5);
TestImpl(xmm5, xmm6);
TestImpl(xmm6, xmm7);
TestImpl(xmm7, xmm8);
TestImpl(xmm8, xmm9);
TestImpl(xmm9, xmm10);
TestImpl(xmm10, xmm11);
TestImpl(xmm11, xmm12);
TestImpl(xmm12, xmm13);
TestImpl(xmm13, xmm14);
TestImpl(xmm14, xmm15);
TestImpl(xmm15, xmm0);
TestImpl(xmm7, xmm0);
#undef TestImpl
#undef TestImplXmmAddr
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment