Commit f6951fa3 by Nicolas Capens Committed by Nicolas Capens

Optimize common vector shuffle patterns for ARM32.

Use VDUP for replicating a single element. Use VZIP for interleaving vectors.
Use VMOV Dd, Dm for rearranging quadword vectors.

Bug: b/67106219
Change-Id: I0de1457454c1db6d467bf870288b7af7cb59ac09
Reviewed-on: https://chromium-review.googlesource.com/695004
Reviewed-by: Jim Stichnoth <stichnot@chromium.org>
Reviewed-on: https://swiftshader-review.googlesource.com/12968
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
parent 416cfb9f
...@@ -3418,6 +3418,97 @@ void AssemblerARM32::vmlap(Type ElmtTy, const Operand *OpQd, ...@@ -3418,6 +3418,97 @@ void AssemblerARM32::vmlap(Type ElmtTy, const Operand *OpQd,
emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy); emitSIMDBase(VpaddOpcode, Dd, Dd, Dd + 1, UseQRegs, IsFloatTy);
} }
void AssemblerARM32::vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
IValueT Idx) {
// Broadcast ("duplicate") scalar element <Idx> of the vector in OpQn into
// every lane of the destination vector OpQd.
//
// VDUP (scalar) - ARMv7-A/R section A8.6.302, encoding A1:
// VDUP<c>.<size> <Qd>, <Dm[x]>
//
// 111100111D11iiiidddd11000QM0mmmm where Ddddd=<Qd>, Mmmmm=<Dm>, and
// iiii=imm4 encodes <size> and [x].
constexpr const char *Vdup = "vdup";
const IValueT VdupOpcode = B25 | B24 | B23 | B21 | B20 | B11 | B10;
// Each Q register maps onto a pair of consecutive D registers; Dd/Dn are
// the first (even-numbered) D register of each pair.
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vdup));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vdup));
constexpr bool UseQRegs = true;
// Float lanes are replicated as raw 32-bit data (shared i32/f32 case below).
constexpr bool IsFloatTy = false;
// imm4 encodes both the element size and the element index within a single
// D register:
//   i8 : imm4 = xxx1, index in bits [3:1] (8 elements per D register)
//   i16: imm4 = xx10, index in bits [3:2] (4 elements per D register)
//   i32: imm4 = x100, index in bit  [3]   (2 elements per D register)
IValueT Imm4 = 0;
// Lower selects which D register of the source Q pair contains element Idx.
bool Lower = true;
switch (ElmtTy) {
case IceType_i8:
assert(Idx < 16);
Lower = Idx < 8;
Imm4 = 1 | ((Idx & 0x7) << 1);
break;
case IceType_i16:
assert(Idx < 8);
Lower = Idx < 4;
Imm4 = 2 | ((Idx & 0x3) << 2);
break;
case IceType_i32:
case IceType_f32:
assert(Idx < 4);
Lower = Idx < 2;
Imm4 = 4 | ((Idx & 0x1) << 3);
break;
default:
assert(false && "vdup only supports 8, 16, and 32-bit elements");
break;
}
// Imm4 is passed in the Vn operand position (bits 19:16 of the encoding);
// the source D register is Dn for the lower half, Dn + 1 for the upper.
emitSIMDBase(VdupOpcode, Dd, Imm4, Dn + (Lower ? 0 : 1), UseQRegs, IsFloatTy);
}
void AssemblerARM32::vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) {
// Pseudo-instruction which interleaves ("zips") the elements of the lower
// halves of two quadword registers into the destination:
//   Qd = { Qn[0], Qm[0], Qn[1], Qm[1], ... }
// Vzip - ARMv7-A/R section A8.6.410, encoding A1:
// VZIP<c>.<size> <Dd>, <Dm>
//
// 111100111D11ss10dddd00011QM0mmmm where Ddddd=<Dd>, Mmmmm=<Dm>, and
// ss=<size>
assert(ElmtTy != IceType_i64 && "vzip on i64 vector not allowed");
constexpr const char *Vzip = "vzip";
// Dd/Dn/Dm are the first (even) D register of each Q register pair.
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vzip));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vzip));
const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vzip));
constexpr bool UseQRegs = false;
constexpr bool IsFloatTy = false;
// VMOV Dd, Dm, encoded as VORR Dd, Dm, Dm:
// 111100100D10mmmmdddd0001MQM1mmmm
constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
// Copy lower half of second source to upper half of destination.
// (Safe even if Qd aliases Qm: the low half read here is consumed first.)
emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloatTy);
// Copy lower half of first source to lower half of destination; elide the
// move when they are already the same register.
if (Dd != Dn)
emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloatTy);
// The element size field <ss> occupies bits 19:18 of the encoding.
constexpr IValueT ElmtShift = 18;
const IValueT ElmtSize = encodeElmtType(ElmtTy);
assert(Utils::IsUint(2, ElmtSize));
if (ElmtTy != IceType_i32 && ElmtTy != IceType_f32) {
constexpr IValueT VzipOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B8 | B7;
// Zip the lower and upper half of destination.
emitSIMDBase(VzipOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
IsFloatTy);
} else {
// For 32-bit elements a doubleword VZIP is the same operation as VTRN
// (the ARM ARM treats VZIP.32 <Dd>, <Dm> as VTRN.32), so emit VTRN here.
constexpr IValueT VtrnOpcode = B25 | B24 | B23 | B21 | B20 | B17 | B7;
emitSIMDBase(VtrnOpcode | (ElmtSize << ElmtShift), Dd, 0, Dd + 1, UseQRegs,
IsFloatTy);
}
}
void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn, void AssemblerARM32::vmulqf(const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) { const Operand *OpQm) {
// VMUL (floating-point) - ARM section A8.8.351, encoding A1: // VMUL (floating-point) - ARM section A8.8.351, encoding A1:
...@@ -3448,6 +3539,110 @@ void AssemblerARM32::vmvnq(const Operand *OpQd, const Operand *OpQm) { ...@@ -3448,6 +3539,110 @@ void AssemblerARM32::vmvnq(const Operand *OpQd, const Operand *OpQm) {
mapQRegToDReg(Qm), UseQRegs, IsFloat); mapQRegToDReg(Qm), UseQRegs, IsFloat);
} }
void AssemblerARM32::vmovlq(const Operand *OpQd, const Operand *OpQn,
                            const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the lower
  // half of the second operand into the lower half of the destination:
  //   Qd.lo = Qm.lo, Qd.hi = Qn.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- lower half of second source; a move into
  // the same register is elided.
  if (Dd != Dm)
    emitSIMDBase(VmovOpcode, Dd, Dm, Dm, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of first source.
  // (Dd + 1 != Dn + 1 simplifies to Dd != Dn.)
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovhq(const Operand *OpQd, const Operand *OpQn,
                            const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the high
  // half of the second operand into the high half of the destination:
  //   Qd.lo = Qn.lo, Qd.hi = Qm.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- lower half of first source; a move into
  // the same register is elided.
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of second source.
  // (Dd + 1 != Dm + 1 simplifies to Dd != Dm.)
  if (Dd != Dm)
    emitSIMDBase(VmovOpcode, Dd + 1, Dm + 1, Dm + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovhlq(const Operand *OpQd, const Operand *OpQn,
                             const Operand *OpQm) {
  // Pseudo-instruction to copy the first source operand and insert the high
  // half of the second operand into the lower half of the destination:
  //   Qd.lo = Qm.hi, Qd.hi = Qn.hi
  //
  // Each doubleword move is VMOV (register), i.e. VORR Dd, Dm, Dm -
  // ARMv7-A/R section A8.6.327, encoding A1:
  // VMOV<c> <Dd>, <Dm>
  //
  // 111100100D10mmmmdddd0001MQM1mmmm where Ddddd=<Dd> and Mmmmm=<Dm>.
  constexpr const char *Vmov = "vmov";
  // First (even) D register of each Q register pair.
  const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
  const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
  const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
  constexpr bool UseQRegs = false;
  constexpr bool IsFloat = false;
  constexpr IValueT VmovOpcode = B25 | B21 | B8 | B4;
  // Lower half of destination <- upper half of second source; a move into
  // the same register is elided.  (Dd is even and Dm + 1 odd, so this write
  // cannot clobber the Dn + 1 read below.)
  if (Dd != Dm + 1)
    emitSIMDBase(VmovOpcode, Dd, Dm + 1, Dm + 1, UseQRegs, IsFloat);
  // Upper half of destination <- upper half of first source.
  // (Dd + 1 != Dn + 1 simplifies to Dd != Dn.)
  if (Dd != Dn)
    emitSIMDBase(VmovOpcode, Dd + 1, Dn + 1, Dn + 1, UseQRegs, IsFloat);
}
void AssemblerARM32::vmovlhq(const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm) {
// Pseudo-instruction to copy the first source operand and insert the lower
// half of the second operand into the high half of the destination.
// VMOV (register) - ARMv7-A/R section A8.6.327, encoding A1:
// VMOV<c> <Dd>, <Dm>
//
// 111100111D110000ddd001011QM0mmm0 where Dddd=Qd, Mmmm=Qm, and Q=0.
constexpr const char *Vmov = "vmov";
const IValueT Dd = mapQRegToDReg(encodeQRegister(OpQd, "Qd", Vmov));
const IValueT Dn = mapQRegToDReg(encodeQRegister(OpQn, "Qn", Vmov));
const IValueT Dm = mapQRegToDReg(encodeQRegister(OpQm, "Qm", Vmov));
constexpr bool UseQRegs = false;
constexpr bool IsFloat = false;
const IValueT VmovOpcode = B25 | B21 | B8 | B4;
if (Dd + 1 != Dm)
emitSIMDBase(VmovOpcode, Dd + 1, Dm, Dm, UseQRegs, IsFloat);
if (Dd != Dn)
emitSIMDBase(VmovOpcode, Dd, Dn, Dn, UseQRegs, IsFloat);
}
void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd, void AssemblerARM32::vnegqs(Type ElmtTy, const Operand *OpQd,
const Operand *OpQm) { const Operand *OpQm) {
// VNEG - ARM section A8.8.355, encoding A1: // VNEG - ARM section A8.8.355, encoding A1:
......
...@@ -546,6 +546,13 @@ public: ...@@ -546,6 +546,13 @@ public:
void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, void vmlap(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm); const Operand *OpQm);
// Vector element replication.
void vdup(Type ElmtTy, const Operand *OpQd, const Operand *OpQn, IValueT Idx);
// Vector interleave lower halves.
void vzip(Type ElmtTy, const Operand *OpQd, const Operand *OpQn,
const Operand *OpQm);
// Float vector multiply. // Float vector multiply.
void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm); void vmulqf(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
...@@ -554,6 +561,11 @@ public: ...@@ -554,6 +561,11 @@ public:
void vmvnq(const Operand *OpQd, const Operand *OpQm); void vmvnq(const Operand *OpQd, const Operand *OpQm);
void vmovlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovhlq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vmovlhq(const Operand *OpQd, const Operand *OpQn, const Operand *OpQm);
void vnegqs(const Operand *OpQd, const Operand *OpQm); void vnegqs(const Operand *OpQd, const Operand *OpQm);
void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm); void vnegqs(Type ElmtTy, const Operand *OpQd, const Operand *OpQm);
......
...@@ -997,35 +997,45 @@ public: ...@@ -997,35 +997,45 @@ public:
return Indexes[Pos]; return Indexes[Pos];
} }
inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t getIndexValue(SizeT Pos) const { return getIndex(Pos)->getValue(); }
int32_t i4, int32_t i5, int32_t i6, int32_t i7) const {
bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3) const {
  // Returns true iff this 4-element shuffle has exactly the indexes i0..i3.
  static constexpr SizeT ExpectedNumElements = 4;
  assert(ExpectedNumElements == getNumIndexes());
  const int32_t Expected[ExpectedNumElements] = {i0, i1, i2, i3};
  for (SizeT Pos = 0; Pos < ExpectedNumElements; ++Pos) {
    if (getIndexValue(Pos) != Expected[Pos])
      return false;
  }
  return true;
}
bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
int32_t i5, int32_t i6, int32_t i7) const {
static constexpr SizeT ExpectedNumElements = 8; static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == getNumIndexes()); assert(ExpectedNumElements == getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7; getIndexValue(6) == i6 && getIndexValue(7) == i7;
} }
inline bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, bool indexesAre(int32_t i0, int32_t i1, int32_t i2, int32_t i3, int32_t i4,
int32_t i4, int32_t i5, int32_t i6, int32_t i7, int32_t i5, int32_t i6, int32_t i7, int32_t i8, int32_t i9,
int32_t i8, int32_t i9, int32_t i10, int32_t i11, int32_t i10, int32_t i11, int32_t i12, int32_t i13,
int32_t i12, int32_t i13, int32_t i14, int32_t i14, int32_t i15) const {
int32_t i15) const {
static constexpr SizeT ExpectedNumElements = 16; static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == getNumIndexes()); assert(ExpectedNumElements == getNumIndexes());
(void)ExpectedNumElements; (void)ExpectedNumElements;
return getIndex(0)->getValue() == i0 && getIndex(1)->getValue() == i1 && return getIndexValue(0) == i0 && getIndexValue(1) == i1 &&
getIndex(2)->getValue() == i2 && getIndex(3)->getValue() == i3 && getIndexValue(2) == i2 && getIndexValue(3) == i3 &&
getIndex(4)->getValue() == i4 && getIndex(5)->getValue() == i5 && getIndexValue(4) == i4 && getIndexValue(5) == i5 &&
getIndex(6)->getValue() == i6 && getIndex(7)->getValue() == i7 && getIndexValue(6) == i6 && getIndexValue(7) == i7 &&
getIndex(8)->getValue() == i8 && getIndex(9)->getValue() == i9 && getIndexValue(8) == i8 && getIndexValue(9) == i9 &&
getIndex(10)->getValue() == i10 && getIndex(11)->getValue() == i11 && getIndexValue(10) == i10 && getIndexValue(11) == i11 &&
getIndex(12)->getValue() == i12 && getIndex(13)->getValue() == i13 && getIndexValue(12) == i12 && getIndexValue(13) == i13 &&
getIndex(14)->getValue() == i14 && getIndex(15)->getValue() == i15; getIndexValue(14) == i14 && getIndexValue(15) == i15;
} }
bool isMemoryWrite() const override { return false; } bool isMemoryWrite() const override { return false; }
......
...@@ -903,6 +903,82 @@ template <> void InstARM32Vmvn::emitIAS(const Cfg *Func) const { ...@@ -903,6 +903,82 @@ template <> void InstARM32Vmvn::emitIAS(const Cfg *Func) const {
} }
} }
template <> void InstARM32Vmovl::emitIAS(const Cfg *Func) const {
  // Encode the vmovl pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovlq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovlq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovh::emitIAS(const Cfg *Func) const {
  // Encode the vmovh pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovhq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovhq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovhl::emitIAS(const Cfg *Func) const {
  // Encode the vmovhl pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovhlq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovhlq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vmovlh::emitIAS(const Cfg *Func) const {
  // Encode the vmovlh pseudo-instruction for every supported vector type;
  // abort on anything else.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = getDest()->getType();
  switch (DestTy) {
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    Asm->vmovlhq(getDest(), getSrc(0), getSrc(1));
    break;
  default:
    llvm::report_fatal_error("Vmovlhq not defined on type " +
                             typeStdString(DestTy));
  }
}
template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const { template <> void InstARM32Vneg::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest(); const Variable *Dest = getDest();
...@@ -1168,6 +1244,15 @@ template <> void InstARM32Vmlap::emitIAS(const Cfg *Func) const { ...@@ -1168,6 +1244,15 @@ template <> void InstARM32Vmlap::emitIAS(const Cfg *Func) const {
assert(!Asm->needsTextFixup()); assert(!Asm->needsTextFixup());
} }
template <> void InstARM32Vzip::emitIAS(const Cfg *Func) const {
  // Interleave the lower halves of the two sources into Dest.
  auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
  const Type DestTy = Dest->getType();
  Asm->vzip(typeElementType(DestTy), Dest, getSrc(0), getSrc(1));
  assert(!Asm->needsTextFixup());
}
template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const { template <> void InstARM32Vmul::emitIAS(const Cfg *Func) const {
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>(); auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Variable *Dest = getDest(); const Variable *Dest = getDest();
...@@ -1425,6 +1510,12 @@ InstARM32Vstr1::InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem, ...@@ -1425,6 +1510,12 @@ InstARM32Vstr1::InstARM32Vstr1(Cfg *Func, Variable *Value, OperandARM32Mem *Mem,
this->Size = Size; this->Size = Size;
} }
InstARM32Vdup::InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src,
IValueT Idx)
// One-source instruction: Dest receives element Idx of Src broadcast to
// every lane; always unconditional (CondARM32::AL).
: InstARM32Pred(Func, InstARM32::Vdup, 1, Dest, CondARM32::AL), Idx(Idx) {
addSource(Src);
}
InstARM32Trap::InstARM32Trap(Cfg *Func) InstARM32Trap::InstARM32Trap(Cfg *Func)
: InstARM32(Func, InstARM32::Trap, 0, nullptr) {} : InstARM32(Func, InstARM32::Trap, 0, nullptr) {}
...@@ -1775,6 +1866,10 @@ template <> const char *InstARM32Vmla::Opcode = "vmla"; ...@@ -1775,6 +1866,10 @@ template <> const char *InstARM32Vmla::Opcode = "vmla";
template <> const char *InstARM32Vmls::Opcode = "vmls"; template <> const char *InstARM32Vmls::Opcode = "vmls";
template <> const char *InstARM32Vmul::Opcode = "vmul"; template <> const char *InstARM32Vmul::Opcode = "vmul";
template <> const char *InstARM32Vmvn::Opcode = "vmvn"; template <> const char *InstARM32Vmvn::Opcode = "vmvn";
template <> const char *InstARM32Vmovl::Opcode = "vmovl";
template <> const char *InstARM32Vmovh::Opcode = "vmovh";
template <> const char *InstARM32Vmovhl::Opcode = "vmovhl";
template <> const char *InstARM32Vmovlh::Opcode = "vmovlh";
template <> const char *InstARM32Vorr::Opcode = "vorr"; template <> const char *InstARM32Vorr::Opcode = "vorr";
template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg"; template <> const char *InstARM32UnaryopFP<InstARM32::Vneg>::Opcode = "vneg";
template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl"; template <> const char *InstARM32ThreeAddrFP<InstARM32::Vshl>::Opcode = "vshl";
...@@ -1790,6 +1885,7 @@ template <> ...@@ -1790,6 +1885,7 @@ template <>
const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh"; const char *InstARM32ThreeAddrFP<InstARM32::Vmulh>::Opcode = "vmulh";
template <> template <>
const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap"; const char *InstARM32ThreeAddrFP<InstARM32::Vmlap>::Opcode = "vmlap";
template <> const char *InstARM32ThreeAddrFP<InstARM32::Vzip>::Opcode = "vzip";
// Four-addr ops // Four-addr ops
template <> const char *InstARM32Mla::Opcode = "mla"; template <> const char *InstARM32Mla::Opcode = "mla";
template <> const char *InstARM32Mls::Opcode = "mls"; template <> const char *InstARM32Mls::Opcode = "mls";
...@@ -2805,6 +2901,43 @@ void InstARM32Vstr1::dump(const Cfg *Func) const { ...@@ -2805,6 +2901,43 @@ void InstARM32Vstr1::dump(const Cfg *Func) const {
getSrc(0)->dump(Func); getSrc(0)->dump(Func);
} }
void InstARM32Vdup::emit(const Cfg *Func) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrEmit();
assert(getSrcSize() == 2);
Type Ty = getSrc(0)->getType();
const char *Opcode = "vdup";
Str << "\t" << Opcode;
Str << getPredicate() << "." << getWidthString(Ty) << getVecElmtBitsize(Ty);
Str << "\t";
getSrc(0)->emit(Func);
Str << ", ";
getSrc(1)->emit(Func);
Str << ", " << Idx;
}
void InstARM32Vdup::emitIAS(const Cfg *Func) const {
assert(getSrcSize() == 1);
auto *Asm = Func->getAssembler<ARM32::AssemblerARM32>();
const Operand *Dest = getDest();
const Operand *Src = getSrc(0);
Type DestTy = Dest->getType();
Asm->vdup(typeElementType(DestTy), Dest, Src, Idx);
}
void InstARM32Vdup::dump(const Cfg *Func) const {
// Debug dump: "<dest> = vdup.<ty> <sources>, <idx>".
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrDump();
dumpDest(Func);
Str << " = ";
dumpOpcodePred(Str, "vdup", getDest()->getType());
Str << " ";
dumpSources(Func);
// The replicated element index is carried on the instruction, not as a
// source operand, so it is printed separately.
Str << ", " << Idx;
}
void InstARM32Trap::emit(const Cfg *Func) const { void InstARM32Trap::emit(const Cfg *Func) const {
if (!BuildDefs::dump()) if (!BuildDefs::dump())
return; return;
...@@ -3386,6 +3519,7 @@ template class InstARM32LoadBase<InstARM32::Ldr>; ...@@ -3386,6 +3519,7 @@ template class InstARM32LoadBase<InstARM32::Ldr>;
template class InstARM32LoadBase<InstARM32::Ldrex>; template class InstARM32LoadBase<InstARM32::Ldrex>;
template class InstARM32LoadBase<InstARM32::Vldr1d>; template class InstARM32LoadBase<InstARM32::Vldr1d>;
template class InstARM32LoadBase<InstARM32::Vldr1q>; template class InstARM32LoadBase<InstARM32::Vldr1q>;
template class InstARM32ThreeAddrFP<InstARM32::Vzip>;
template class InstARM32TwoAddrGPR<InstARM32::Movt>; template class InstARM32TwoAddrGPR<InstARM32::Movt>;
template class InstARM32UnaryopGPR<InstARM32::Movw, false>; template class InstARM32UnaryopGPR<InstARM32::Movw, false>;
......
...@@ -434,12 +434,17 @@ public: ...@@ -434,12 +434,17 @@ public:
Vcmp, Vcmp,
Vcvt, Vcvt,
Vdiv, Vdiv,
Vdup,
Veor, Veor,
Vldr1d, Vldr1d,
Vldr1q, Vldr1q,
Vmla, Vmla,
Vmlap, Vmlap,
Vmls, Vmls,
Vmovl,
Vmovh,
Vmovhl,
Vmovlh,
Vmrs, Vmrs,
Vmul, Vmul,
Vmulh, Vmulh,
...@@ -453,7 +458,8 @@ public: ...@@ -453,7 +458,8 @@ public:
Vshr, Vshr,
Vsqrt, Vsqrt,
Vstr1, Vstr1,
Vsub Vsub,
Vzip
}; };
static constexpr size_t InstSize = sizeof(uint32_t); static constexpr size_t InstSize = sizeof(uint32_t);
...@@ -1020,6 +1026,10 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>; ...@@ -1020,6 +1026,10 @@ using InstARM32Vdiv = InstARM32ThreeAddrFP<InstARM32::Vdiv>;
using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>; using InstARM32Veor = InstARM32ThreeAddrFP<InstARM32::Veor>;
using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>; using InstARM32Vmla = InstARM32FourAddrFP<InstARM32::Vmla>;
using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>; using InstARM32Vmls = InstARM32FourAddrFP<InstARM32::Vmls>;
using InstARM32Vmovl = InstARM32ThreeAddrFP<InstARM32::Vmovl>;
using InstARM32Vmovh = InstARM32ThreeAddrFP<InstARM32::Vmovh>;
using InstARM32Vmovhl = InstARM32ThreeAddrFP<InstARM32::Vmovhl>;
using InstARM32Vmovlh = InstARM32ThreeAddrFP<InstARM32::Vmovlh>;
using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>; using InstARM32Vmul = InstARM32ThreeAddrFP<InstARM32::Vmul>;
using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>; using InstARM32Vmvn = InstARM32UnaryopFP<InstARM32::Vmvn>;
using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>; using InstARM32Vneg = InstARM32UnaryopSignAwareFP<InstARM32::Vneg>;
...@@ -1036,6 +1046,7 @@ using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>; ...@@ -1036,6 +1046,7 @@ using InstARM32Ldr = InstARM32LoadBase<InstARM32::Ldr>;
using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>; using InstARM32Ldrex = InstARM32LoadBase<InstARM32::Ldrex>;
using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>; using InstARM32Vldr1d = InstARM32LoadBase<InstARM32::Vldr1d>;
using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>; using InstARM32Vldr1q = InstARM32LoadBase<InstARM32::Vldr1q>;
using InstARM32Vzip = InstARM32ThreeAddrFP<InstARM32::Vzip>;
/// MovT leaves the bottom bits alone so dest is also a source. This helps /// MovT leaves the bottom bits alone so dest is also a source. This helps
/// indicate that a previous MovW setting dest is not dead code. /// indicate that a previous MovW setting dest is not dead code.
using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>; using InstARM32Movt = InstARM32TwoAddrGPR<InstARM32::Movt>;
...@@ -1374,6 +1385,30 @@ private: ...@@ -1374,6 +1385,30 @@ private:
SizeT Size; SizeT Size;
}; };
/// Vector element duplication/replication instruction: broadcasts one element
/// of the source vector register into every lane of the destination.
class InstARM32Vdup final : public InstARM32Pred {
InstARM32Vdup() = delete;
InstARM32Vdup(const InstARM32Vdup &) = delete;
InstARM32Vdup &operator=(const InstARM32Vdup &) = delete;
public:
/// Value must be a register.
/// \p Idx is the lane of \p Src to replicate into \p Dest.
static InstARM32Vdup *create(Cfg *Func, Variable *Dest, Variable *Src,
IValueT Idx) {
return new (Func->allocate<InstARM32Vdup>())
InstARM32Vdup(Func, Dest, Src, Idx);
}
void emit(const Cfg *Func) const override;
void emitIAS(const Cfg *Func) const override;
void dump(const Cfg *Func) const override;
static bool classof(const Inst *Instr) { return isClassof(Instr, Vdup); }
private:
InstARM32Vdup(Cfg *Func, Variable *Dest, Variable *Src, IValueT Idx);
// Lane index within the source vector that gets broadcast.
const IValueT Idx;
};
class InstARM32Trap : public InstARM32 { class InstARM32Trap : public InstARM32 {
InstARM32Trap() = delete; InstARM32Trap() = delete;
InstARM32Trap(const InstARM32Trap &) = delete; InstARM32Trap(const InstARM32Trap &) = delete;
......
...@@ -5357,7 +5357,7 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { ...@@ -5357,7 +5357,7 @@ void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Func->setError("Unexpected size for LoadSubVector"); Func->setError("Unexpected size for LoadSubVector");
return; return;
} }
_mov(Dest, T); // FIXME: necessary? _mov(Dest, T);
return; return;
} }
case Intrinsics::StoreSubVector: { case Intrinsics::StoreSubVector: {
...@@ -5975,8 +5975,121 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) { ...@@ -5975,8 +5975,121 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
const Type DestTy = Dest->getType(); const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy); auto *T = makeReg(DestTy);
auto *Src0 = Instr->getSrc(0);
auto *Src1 = Instr->getSrc(1);
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
bool Replicate = true;
for (SizeT I = 1; Replicate && I < Instr->getNumIndexes(); ++I) {
if (Instr->getIndexValue(I) != Instr->getIndexValue(0)) {
Replicate = false;
}
}
if (Replicate) {
Variable *Src0Var = legalizeToReg(Src0);
_vdup(T, Src0Var, Instr->getIndexValue(0));
_mov(Dest, T);
return;
}
switch (DestTy) { switch (DestTy) {
case IceType_v8i1:
case IceType_v8i16: {
static constexpr SizeT ExpectedNumElements = 8;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 8, 1, 9, 2, 10, 3, 11)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 2, 4, 6, 0, 2, 4, 6)) {
Variable *Src0R = legalizeToReg(Src0);
_vqmovn2(T, Src0R, Src0R, false, false);
_mov(Dest, T);
return;
}
} break;
case IceType_v16i1:
case IceType_v16i8: {
static constexpr SizeT ExpectedNumElements = 16;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7,
23)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
} break;
case IceType_v4i1:
case IceType_v4i32:
case IceType_v4f32: {
static constexpr SizeT ExpectedNumElements = 4;
assert(ExpectedNumElements == Instr->getNumIndexes());
(void)ExpectedNumElements;
if (Instr->indexesAre(0, 0, 1, 1)) {
Variable *Src0R = legalizeToReg(Src0);
_vzip(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 4, 1, 5)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vzip(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(0, 1, 4, 5)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vmovlh(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(2, 3, 2, 3)) {
Variable *Src0R = legalizeToReg(Src0);
_vmovhl(T, Src0R, Src0R);
_mov(Dest, T);
return;
}
if (Instr->indexesAre(2, 3, 6, 7)) {
Variable *Src0R = legalizeToReg(Src0);
Variable *Src1R = legalizeToReg(Src1);
_vmovhl(T, Src1R, Src0R);
_mov(Dest, T);
return;
}
} break;
default: default:
break; break;
// TODO(jpp): figure out how to properly lower this without scalarization. // TODO(jpp): figure out how to properly lower this without scalarization.
...@@ -5984,10 +6097,6 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) { ...@@ -5984,10 +6097,6 @@ void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
// Unoptimized shuffle. Perform a series of inserts and extracts. // Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T); Context.insert<InstFakeDef>(T);
auto *Src0 = Instr->getSrc(0);
auto *Src1 = Instr->getSrc(1);
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) { for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I); auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue(); const SizeT Elem = Index->getValue();
......
...@@ -885,6 +885,9 @@ protected: ...@@ -885,6 +885,9 @@ protected:
CondARM32::Cond Pred = CondARM32::AL) { CondARM32::Cond Pred = CondARM32::AL) {
Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred); Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred);
} }
void _vdup(Variable *Dest, Variable *Src, int Idx) {
// Broadcast element Idx of Src into every lane of Dest (VDUP).
Context.insert<InstARM32Vdup>(Dest, Src, Idx);
}
void _veor(Variable *Dest, Variable *Src0, Variable *Src1) { void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Veor>(Dest, Src0, Src1); Context.insert<InstARM32Veor>(Dest, Src0, Src1);
} }
...@@ -908,6 +911,18 @@ protected: ...@@ -908,6 +911,18 @@ protected:
void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) { void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmls>(Dest, Src0, Src1); Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
} }
void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src1.lo, Dest.hi = Src0.hi (see AssemblerARM32::vmovlq).
Context.insert<InstARM32Vmovl>(Dest, Src0, Src1);
}
void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src0.lo, Dest.hi = Src1.hi (see AssemblerARM32::vmovhq).
Context.insert<InstARM32Vmovh>(Dest, Src0, Src1);
}
void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src1.hi, Dest.hi = Src0.hi (see AssemblerARM32::vmovhlq).
Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1);
}
void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) {
// Dest.lo = Src0.lo, Dest.hi = Src1.lo (see AssemblerARM32::vmovlhq).
Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1);
}
void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) { void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vmul>(Dest, Src0, Src1); Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
} }
...@@ -966,6 +981,9 @@ protected: ...@@ -966,6 +981,9 @@ protected:
void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) { void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
Context.insert<InstARM32Vsub>(Dest, Src0, Src1); Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
} }
void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) {
// Interleave the elements of the lower halves of Src0 and Src1 into Dest:
// Dest = { Src0[0], Src1[0], Src0[1], Src1[1], ... }.
Context.insert<InstARM32Vzip>(Dest, Src0, Src1);
}
// Iterates over the CFG and determines the maximum outgoing stack arguments // Iterates over the CFG and determines the maximum outgoing stack arguments
// bytes. This information is later used during addProlog() to pre-allocate // bytes. This information is later used during addProlog() to pre-allocate
......
...@@ -6304,22 +6304,22 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6304,22 +6304,22 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
break; break;
} }
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
const SizeT Index4 = Instr->getIndex(4)->getValue(); const SizeT Index4 = Instr->getIndexValue(4);
const SizeT Index5 = Instr->getIndex(5)->getValue(); const SizeT Index5 = Instr->getIndexValue(5);
const SizeT Index6 = Instr->getIndex(6)->getValue(); const SizeT Index6 = Instr->getIndexValue(6);
const SizeT Index7 = Instr->getIndex(7)->getValue(); const SizeT Index7 = Instr->getIndexValue(7);
const SizeT Index8 = Instr->getIndex(8)->getValue(); const SizeT Index8 = Instr->getIndexValue(8);
const SizeT Index9 = Instr->getIndex(9)->getValue(); const SizeT Index9 = Instr->getIndexValue(9);
const SizeT Index10 = Instr->getIndex(10)->getValue(); const SizeT Index10 = Instr->getIndexValue(10);
const SizeT Index11 = Instr->getIndex(11)->getValue(); const SizeT Index11 = Instr->getIndexValue(11);
const SizeT Index12 = Instr->getIndex(12)->getValue(); const SizeT Index12 = Instr->getIndexValue(12);
const SizeT Index13 = Instr->getIndex(13)->getValue(); const SizeT Index13 = Instr->getIndexValue(13);
const SizeT Index14 = Instr->getIndex(14)->getValue(); const SizeT Index14 = Instr->getIndexValue(14);
const SizeT Index15 = Instr->getIndex(15)->getValue(); const SizeT Index15 = Instr->getIndexValue(15);
lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2, lowerShuffleVector_UsingPshufb(Dest, Src0, Src1, Index0, Index1, Index2,
Index3, Index4, Index5, Index6, Index7, Index3, Index4, Index5, Index6, Index7,
...@@ -6376,14 +6376,14 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6376,14 +6376,14 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
break; break;
} }
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
const SizeT Index4 = Instr->getIndex(4)->getValue(); const SizeT Index4 = Instr->getIndexValue(4);
const SizeT Index5 = Instr->getIndex(5)->getValue(); const SizeT Index5 = Instr->getIndexValue(5);
const SizeT Index6 = Instr->getIndex(6)->getValue(); const SizeT Index6 = Instr->getIndexValue(6);
const SizeT Index7 = Instr->getIndex(7)->getValue(); const SizeT Index7 = Instr->getIndexValue(7);
#define TO_BYTE_INDEX(I) ((I) << 1) #define TO_BYTE_INDEX(I) ((I) << 1)
lowerShuffleVector_UsingPshufb( lowerShuffleVector_UsingPshufb(
...@@ -6403,10 +6403,10 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6403,10 +6403,10 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
case IceType_v4f32: { case IceType_v4f32: {
static constexpr SizeT ExpectedNumElements = 4; static constexpr SizeT ExpectedNumElements = 4;
assert(ExpectedNumElements == Instr->getNumIndexes()); assert(ExpectedNumElements == Instr->getNumIndexes());
const SizeT Index0 = Instr->getIndex(0)->getValue(); const SizeT Index0 = Instr->getIndexValue(0);
const SizeT Index1 = Instr->getIndex(1)->getValue(); const SizeT Index1 = Instr->getIndexValue(1);
const SizeT Index2 = Instr->getIndex(2)->getValue(); const SizeT Index2 = Instr->getIndexValue(2);
const SizeT Index3 = Instr->getIndex(3)->getValue(); const SizeT Index3 = Instr->getIndexValue(3);
Variable *T = nullptr; Variable *T = nullptr;
switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) { switch (makeSrcSwitchMask(Index0, Index1, Index2, Index3)) {
#define CASE_SRCS_IN(S0, S1, S2, S3) \ #define CASE_SRCS_IN(S0, S1, S2, S3) \
...@@ -6611,8 +6611,7 @@ void TargetX86Base<TraitsType>::lowerShuffleVector( ...@@ -6611,8 +6611,7 @@ void TargetX86Base<TraitsType>::lowerShuffleVector(
InstExtractElement::create(Func, ExtElmt, Src0, Index)); InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else { } else {
lowerExtractElement(InstExtractElement::create( lowerExtractElement(InstExtractElement::create(
Func, ExtElmt, Src1, Func, ExtElmt, Src1, Ctx->getConstantInt32(Elem - NumElements)));
Ctx->getConstantInt32(Index->getValue() - NumElements)));
} }
auto *NewT = makeReg(DestTy); auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt, lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment