Commit a47c11c7 by John Porto

Subzero. Rematerializes shufflevector instructions.

This CL is a first step towards optimizing vector shuffles in Subzero. PNaCl bitcode does not support the shufflevector instruction, so pnacl-clang emits a series of extractelement/insertelement instructions instead. pnacl-llc is then responsible for pattern matching the output bitcode and rematerializing the shufflevector. With this CL, we enable shufflevector rematerialization in Subzero. To keep this CL simple, we introduce no efficient shufflevector lowering. Instead, we scalarize the rematerialized instructions. BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4136 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1897243002 .
parent 64045042
...@@ -802,6 +802,312 @@ void Cfg::doAddressOpt() { ...@@ -802,6 +802,312 @@ void Cfg::doAddressOpt() {
Node->doAddressOpt(); Node->doAddressOpt();
} }
namespace {
// ShuffleVectorUtils implements helper functions for rematerializing
// shufflevector instructions from a sequence of extractelement/insertelement
// instructions. It looks for the following pattern:
//
// %t0 = extractelement A, %n0
// %t1 = extractelement B, %n1
// %t2 = extractelement C, %n2
// ...
// %tN = extractelement N, %nN
// %d0 = insertelement undef, %t0, 0
// %d1 = insertelement %d0, %t1, 1
// %d2 = insertelement %d1, %t2, 2
// ...
// %dest = insertelement %d_N-1, %tN, N
//
// where N is num_element(typeof(%dest)) - 1 (element indexes are zero-based),
// and A, B, C, ... N are at most two distinct variables.
namespace ShuffleVectorUtils {
// findAllInserts is used when searching for all the insertelements that are
// used in a shufflevector operation. This function works recursively: when
// invoked with I = i, the function assumes Insts[i] is the last found
// insertelement in the chain. The next insertelement instruction is saved in
// Insts[i+1].
//
// Insts must already be sized to the number of elements of the vector being
// built, with Insts[0] holding the insertelement that writes the last element.
// Returns true only if the whole chain was found, starts from undef, and
// inserts elements 0, 1, 2, ... in order.
bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                    CfgVector<const Inst *> *Insts, SizeT I = 0) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  // Insts->at(I) below is only valid for I < size, so I == size must be
  // rejected too.
  if (I >= Insts->size()) {
    if (Verbose) {
      Ctx->getStrDump() << "\tToo many inserts.\n";
    }
    return false;
  }

  const auto *LastInsert = Insts->at(I);
  assert(llvm::isa<InstInsertElement>(LastInsert));

  if (I == Insts->size() - 1) {
    // Matching against undef is not really needed because the value in Src(0)
    // will be totally overwritten. We still enforce it anyways because the
    // PNaCl toolchain generates the bitcode with it.
    if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
      if (Verbose) {
        Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
                          << Insts->size();
        LastInsert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The following loop ensures that the insertelements are sorted. In theory,
    // we could relax this restriction and allow any order. As long as each
    // index appears exactly once, this chain is still a candidate for becoming
    // a shufflevector. The Insts vector is traversed backwards because the
    // instructions are "enqueued" in reverse order.
    int32_t ExpectedElement = 0;
    for (const auto *Insert : reverse_range(*Insts)) {
      // A non-constant position can never match the expected element.
      const auto *Position =
          llvm::dyn_cast<ConstantInteger32>(Insert->getSrc(2));
      if (Position == nullptr || Position->getValue() != ExpectedElement) {
        return false;
      }
      ++ExpectedElement;
    }
    return true;
  }

  // The chain is not complete yet, so Src(0) has to be the result of an
  // earlier insertelement. If it is a constant (e.g., undef) the chain ends
  // too early and there is nothing to follow.
  const auto *Src0V = llvm::dyn_cast<Variable>(LastInsert->getSrc(0));
  if (Src0V == nullptr) {
    if (Verbose) {
      Ctx->getStrDump() << "\tSrc0 is not a variable: ";
      LastInsert->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  const auto *Def = VM->getSingleDefinition(Src0V);
  // Only optimize if the first operand in
  //
  //   Dest = insertelement A, B, 10
  //
  // is singly-def'ed.
  if (Def == nullptr) {
    if (Verbose) {
      Ctx->getStrDump() << "\tmulti-def: ";
      (*Insts)[I]->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // We also require the (single) definition to come from an insertelement
  // instruction.
  if (!llvm::isa<InstInsertElement>(Def)) {
    if (Verbose) {
      Ctx->getStrDump() << "\tnot insert element: ";
      Def->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // Everything seems fine, so we save Def in Insts, and delegate the decision
  // to findAllInserts.
  (*Insts)[I + 1] = Def;
  return findAllInserts(Func, Ctx, VM, Insts, I + 1);
}
// insertsLastElement returns true if Insert is inserting an element in the last
// position of a vector.
bool insertsLastElement(const Inst &Insert) {
const Type DestTy = Insert.getDest()->getType();
assert(isVectorType(DestTy));
const SizeT Elem =
llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
return Elem == typeNumElements(DestTy) - 1;
}
// findAllExtracts goes over all the insertelement instructions that are
// candidates to be replaced by a shufflevector, and searches for all the
// definitions of the elements being inserted. If all of the elements are the
// result of an extractelement instruction, and all of the extractelements
// operate on at most two different sources, then the instructions can be
// replaced by a shufflevector.
//
// On success, *Src0 and *Src1 hold the (at most two) source vectors -- both
// point to the same variable when only one source was seen -- and
// (*Extracts)[i] holds the extractelement feeding Insts[i]. Extracts must be
// at least as large as Insts. On failure the outputs are unspecified.
bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                     const CfgVector<const Inst *> &Insts, Variable **Src0,
                     Variable **Src1, CfgVector<const Inst *> *Extracts) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  *Src0 = nullptr;
  *Src1 = nullptr;
  assert(Insts.size() > 0);
  for (SizeT I = 0; I < Insts.size(); ++I) {
    const auto *Insert = Insts.at(I);
    const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
    if (Src1V == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "src(1) is not a variable: ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    const auto *Def = VM->getSingleDefinition(Src1V);
    if (Def == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "multi-def src(1): ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    if (!llvm::isa<InstExtractElement>(Def)) {
      if (Verbose) {
        Ctx->getStrDump() << "not extractelement: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The extracted-from vector may be a constant (e.g., undef); a
    // shufflevector is only created for variable sources.
    auto *Src = llvm::dyn_cast<Variable>(Def->getSrc(0));
    if (Src == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "extractelement source is not a variable: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The shuffle indexes select between the two sources by comparing against
    // the destination's element count, which is only meaningful when the
    // sources have the destination's vector type (e.g., extracting from a
    // v16i1 while building a v4i1 must not match).
    if (Src->getType() != Insert->getDest()->getType()) {
      if (Verbose) {
        Ctx->getStrDump() << "extractelement source type mismatch: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    if (*Src0 == nullptr) {
      // No sources yet. Save Src to Src0.
      *Src0 = Src;
    } else if (*Src1 == nullptr) {
      // We already have a source, so we might save Src in Src1 -- but only if
      // Src0 is not Src.
      if (*Src0 != Src) {
        *Src1 = Src;
      }
    } else if (Src != *Src0 && Src != *Src1) {
      // More than two sources, so we can't rematerialize the shufflevector
      // instruction.
      if (Verbose) {
        Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
      }
      return false;
    }

    (*Extracts)[I] = Def;
  }

  // We should have seen at least one source operand.
  assert(*Src0 != nullptr);

  // If a second source was not seen, then we just make Src1 = Src0 to simplify
  // things down stream. This should not matter, as all of the indexes in the
  // shufflevector instruction will point to Src0.
  if (*Src1 == nullptr) {
    *Src1 = *Src0;
  }

  return true;
}
} // end of namespace ShuffleVectorUtils
} // end of anonymous namespace
void Cfg::materializeVectorShuffles() {
const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);
std::unique_ptr<OstreamLocker> L;
if (Verbose) {
L.reset(new OstreamLocker(getContext()));
getContext()->getStrDump() << "\nShuffle materialization:\n";
}
// MaxVectorElements is the maximum number of elements in the vector types
// handled by Subzero. We use it to create the Inserts and Extracts vectors
// with the appropriate size, thus avoiding resize() calls.
const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
CfgVector<const Inst *> Inserts(MaxVectorElements);
CfgVector<const Inst *> Extracts(MaxVectorElements);
TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
for (CfgNode *Node : Nodes) {
for (auto &Instr : Node->getInsts()) {
if (!llvm::isa<InstInsertElement>(Instr)) {
continue;
}
if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
// To avoid wasting time, we only start the pattern match at the last
// insertelement instruction -- and go backwards from there.
continue;
}
if (Verbose) {
getContext()->getStrDump() << "\tCandidate: ";
Instr.dump(this);
getContext()->getStrDump() << "\n";
}
Inserts.resize(typeNumElements(Instr.getDest()->getType()));
Inserts[0] = &Instr;
if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
VMetadata.get(), &Inserts)) {
// If we fail to find a sequence of insertelements, we stop the
// optimization.
if (Verbose) {
getContext()->getStrDump() << "\tFalse alarm.\n";
}
continue;
}
if (Verbose) {
getContext()->getStrDump() << "\tFound the following insertelement: \n";
for (auto *I : reverse_range(Inserts)) {
getContext()->getStrDump() << "\t\t";
I->dump(this);
getContext()->getStrDump() << "\n";
}
}
Extracts.resize(Inserts.size());
Variable *Src0;
Variable *Src1;
if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
VMetadata.get(), Inserts, &Src0,
&Src1, &Extracts)) {
// If we fail to match the definitions of the insertelements' sources
// with extractelement instructions -- or if those instructions operate
// on more than two different variables -- we stop the optimization.
if (Verbose) {
getContext()->getStrDump() << "\tFailed to match extractelements.\n";
}
continue;
}
if (Verbose) {
getContext()->getStrDump()
<< "\tFound the following insert/extract element pairs: \n";
for (SizeT I = 0; I < Inserts.size(); ++I) {
const SizeT Pos = Inserts.size() - I - 1;
getContext()->getStrDump() << "\t\tInsert : ";
Inserts[Pos]->dump(this);
getContext()->getStrDump() << "\n\t\tExtract: ";
Extracts[Pos]->dump(this);
getContext()->getStrDump() << "\n";
}
}
assert(Src0 != nullptr);
assert(Src1 != nullptr);
auto *ShuffleVector =
InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
assert(ShuffleVector->getSrc(0) == Src0);
assert(ShuffleVector->getSrc(1) == Src1);
for (SizeT I = 0; I < Extracts.size(); ++I) {
const SizeT Pos = Extracts.size() - I - 1;
auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
if (Src0 == Extracts[Pos]->getSrc(0)) {
ShuffleVector->addIndex(Index);
} else {
ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
}
}
if (Verbose) {
getContext()->getStrDump() << "Created: ";
ShuffleVector->dump(this);
getContext()->getStrDump() << "\n";
}
Instr.setDeleted();
auto &LoweringContext = getTarget()->getContext();
LoweringContext.setInsertPoint(Instr);
LoweringContext.insert(ShuffleVector);
}
}
}
void Cfg::doNopInsertion() { void Cfg::doNopInsertion() {
if (!getFlags().getShouldDoNopInsertion()) if (!getFlags().getShouldDoNopInsertion())
return; return;
......
...@@ -206,6 +206,9 @@ public: ...@@ -206,6 +206,9 @@ public:
/// entry block and emit stack or frame pointer-relative addressing. /// entry block and emit stack or frame pointer-relative addressing.
void processAllocas(bool SortAndCombine); void processAllocas(bool SortAndCombine);
void doAddressOpt(); void doAddressOpt();
/// Find clusters of insertelement/extractelement instructions that can be
/// replaced by a shufflevector instruction.
void materializeVectorShuffles();
void doArgLowering(); void doArgLowering();
void doNopInsertion(); void doNopInsertion();
void genCode(); void genCode();
......
...@@ -327,6 +327,8 @@ struct dev_list_flag {}; ...@@ -327,6 +327,8 @@ struct dev_list_flag {};
clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"), \ clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"), \
clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"), \ clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"), \
clEnumValN(Ice::IceV_Mem, "mem", "Memory usage details"), \ clEnumValN(Ice::IceV_Mem, "mem", "Memory usage details"), \
clEnumValN(Ice::IceV_ShufMat, "shufvec", \
"Shufflevector rematerialization"), \
clEnumValN(Ice::IceV_Status, "status", \ clEnumValN(Ice::IceV_Status, "status", \
"Print the name of the function being translated"), \ "Print the name of the function being translated"), \
clEnumValN(Ice::IceV_AvailableRegs, "registers", \ clEnumValN(Ice::IceV_AvailableRegs, "registers", \
......
...@@ -342,6 +342,7 @@ enum VerboseItem { ...@@ -342,6 +342,7 @@ enum VerboseItem {
IceV_GlobalInit = 1 << 22, IceV_GlobalInit = 1 << 22,
IceV_ConstPoolStats = 1 << 23, IceV_ConstPoolStats = 1 << 23,
IceV_Wasm = 1 << 24, IceV_Wasm = 1 << 24,
IceV_ShufMat = 1 << 25,
IceV_All = ~IceV_None, IceV_All = ~IceV_None,
IceV_Most = IceV_Most =
IceV_All & ~IceV_LinearScan & ~IceV_GlobalInit & ~IceV_ConstPoolStats IceV_All & ~IceV_LinearScan & ~IceV_GlobalInit & ~IceV_ConstPoolStats
......
...@@ -112,6 +112,7 @@ const char *Inst::getInstName() const { ...@@ -112,6 +112,7 @@ const char *Inst::getInstName() const {
X(FakeUse, "fakeuse"); X(FakeUse, "fakeuse");
X(FakeKill, "fakekill"); X(FakeKill, "fakekill");
X(JumpTable, "jumptable"); X(JumpTable, "jumptable");
X(ShuffleVector, "shufflevector");
#undef X #undef X
default: default:
assert(Kind >= Target); assert(Kind >= Target);
...@@ -574,6 +575,15 @@ InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight) ...@@ -574,6 +575,15 @@ InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight)
InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked) InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked)
: InstHighLevel(Func, Inst::FakeKill, 0, nullptr), Linked(Linked) {} : InstHighLevel(Func, Inst::FakeKill, 0, nullptr), Linked(Linked) {}
// Constructs a shufflevector instruction writing Dest from the two source
// vectors Src0 and Src1. NumIndexes is fixed to the number of elements of
// Dest's type; the index array is allocated here but its entries are not
// written by this constructor.
InstShuffleVector::InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0,
Variable *Src1)
: InstHighLevel(Func, Inst::ShuffleVector, 2, Dest),
NumIndexes(typeNumElements(Dest->getType())) {
// Source operand 0 is Src0 and source operand 1 is Src1.
addSource(Src0);
addSource(Src1);
// One ConstantInteger32 pointer slot per destination element, allocated
// through Func.
Indexes = Func->allocateArrayOf<ConstantInteger32 *>(NumIndexes);
}
namespace { namespace {
GlobalString makeName(Cfg *Func, const SizeT Id) { GlobalString makeName(Cfg *Func, const SizeT Id) {
const auto FuncName = Func->getFunctionName(); const auto FuncName = Func->getFunctionName();
...@@ -1032,6 +1042,21 @@ void InstFakeKill::dump(const Cfg *Func) const { ...@@ -1032,6 +1042,21 @@ void InstFakeKill::dump(const Cfg *Func) const {
Str << "kill.pseudo scratch_regs"; Str << "kill.pseudo scratch_regs";
} }
void InstShuffleVector::dump(const Cfg *Func) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrDump();
Str << "shufflevector ";
dumpDest(Func);
Str << " = ";
dumpSources(Func);
for (SizeT I = 0; I < NumIndexes; ++I) {
Str << ", ";
Indexes[I]->dump(Func);
}
Str << "\n";
}
void InstJumpTable::dump(const Cfg *Func) const { void InstJumpTable::dump(const Cfg *Func) const {
if (!BuildDefs::dump()) if (!BuildDefs::dump())
return; return;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "IceDefs.h" #include "IceDefs.h"
#include "IceInst.def" #include "IceInst.def"
#include "IceIntrinsics.h" #include "IceIntrinsics.h"
#include "IceOperand.h"
#include "IceSwitchLowering.h" #include "IceSwitchLowering.h"
#include "IceTypes.h" #include "IceTypes.h"
...@@ -61,14 +62,15 @@ public: ...@@ -61,14 +62,15 @@ public:
Select, Select,
Store, Store,
Switch, Switch,
Assign, // not part of LLVM/PNaCl bitcode Assign, // not part of LLVM/PNaCl bitcode
Breakpoint, // not part of LLVM/PNaCl bitcode Breakpoint, // not part of LLVM/PNaCl bitcode
BundleLock, // not part of LLVM/PNaCl bitcode BundleLock, // not part of LLVM/PNaCl bitcode
BundleUnlock, // not part of LLVM/PNaCl bitcode BundleUnlock, // not part of LLVM/PNaCl bitcode
FakeDef, // not part of LLVM/PNaCl bitcode FakeDef, // not part of LLVM/PNaCl bitcode
FakeUse, // not part of LLVM/PNaCl bitcode FakeUse, // not part of LLVM/PNaCl bitcode
FakeKill, // not part of LLVM/PNaCl bitcode FakeKill, // not part of LLVM/PNaCl bitcode
JumpTable, // not part of LLVM/PNaCl bitcode JumpTable, // not part of LLVM/PNaCl bitcode
ShuffleVector, // not part of LLVM/PNaCl bitcode
// Anything >= Target is an InstTarget subclass. Note that the value-spaces // Anything >= Target is an InstTarget subclass. Note that the value-spaces
// are shared across targets. To avoid confusion over the definition of // are shared across targets. To avoid confusion over the definition of
// shared values, an object specific to one target should never be passed // shared values, an object specific to one target should never be passed
...@@ -917,6 +919,52 @@ private: ...@@ -917,6 +919,52 @@ private:
const Inst *Linked; const Inst *Linked;
}; };
/// ShuffleVector instruction: a shuffle operation over two source vectors.
/// PNaCl bitcode has no such instruction; Subzero creates it itself after
/// matching the extractelement/insertelement pattern that pnacl-clang emits
/// when compiling to bitcode.
///
/// The instruction carries one constant index per destination element. The
/// indexes are appended one at a time with addIndex() after creation.
class InstShuffleVector : public InstHighLevel {
  InstShuffleVector() = delete;
  InstShuffleVector(const InstShuffleVector &) = delete;
  InstShuffleVector &operator=(const InstShuffleVector &) = delete;

public:
  /// Allocates the instruction through Func and constructs it in place.
  static InstShuffleVector *create(Cfg *Func, Variable *Dest, Variable *Src0,
                                   Variable *Src1) {
    auto *Storage = Func->allocate<InstShuffleVector>();
    return new (Storage) InstShuffleVector(Func, Dest, Src0, Src1);
  }

  /// Number of index slots (the element count of the destination type).
  SizeT getNumIndexes() const { return NumIndexes; }

  /// Appends Index in the next free slot; asserts that a free slot remains.
  void addIndex(ConstantInteger32 *Index) {
    assert(CurrentIndex < NumIndexes);
    Indexes[CurrentIndex] = Index;
    ++CurrentIndex;
  }

  /// Returns the index stored in slot Pos; asserts Pos is within range.
  ConstantInteger32 *getIndex(SizeT Pos) const {
    assert(Pos < NumIndexes);
    return Indexes[Pos];
  }

  void dump(const Cfg *Func) const override;

  static bool classof(const Inst *Instr) {
    return Instr->getKind() == ShuffleVector;
  }

private:
  InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1);

  /// Releases the index array before running the base class teardown.
  void destroy(Cfg *Func) override {
    Func->deallocateArrayOf<ConstantInteger32 *>(Indexes);
    Inst::destroy(Func);
  }

  /// Array of NumIndexes slots, allocated by the constructor.
  ConstantInteger32 **Indexes;
  /// Next slot addIndex() will write.
  SizeT CurrentIndex = 0;
  const SizeT NumIndexes;
};
/// JumpTable instruction. This represents a jump table that will be stored in /// JumpTable instruction. This represents a jump table that will be stored in
/// the .rodata section. This is used to track and repoint the target CfgNodes /// the .rodata section. This is used to track and repoint the target CfgNodes
/// which may change, for example due to splitting for phi lowering. /// which may change, for example due to splitting for phi lowering.
......
...@@ -440,6 +440,9 @@ void TargetLowering::lower() { ...@@ -440,6 +440,9 @@ void TargetLowering::lower() {
case Inst::Select: case Inst::Select:
lowerSelect(llvm::cast<InstSelect>(Instr)); lowerSelect(llvm::cast<InstSelect>(Instr));
break; break;
case Inst::ShuffleVector:
lowerShuffleVector(llvm::cast<InstShuffleVector>(Instr));
break;
case Inst::Store: case Inst::Store:
lowerStore(llvm::cast<InstStore>(Instr)); lowerStore(llvm::cast<InstStore>(Instr));
break; break;
......
...@@ -387,6 +387,7 @@ protected: ...@@ -387,6 +387,7 @@ protected:
virtual void lowerPhi(const InstPhi *Instr) = 0; virtual void lowerPhi(const InstPhi *Instr) = 0;
virtual void lowerRet(const InstRet *Instr) = 0; virtual void lowerRet(const InstRet *Instr) = 0;
virtual void lowerSelect(const InstSelect *Instr) = 0; virtual void lowerSelect(const InstSelect *Instr) = 0;
virtual void lowerShuffleVector(const InstShuffleVector *Instr) = 0;
virtual void lowerStore(const InstStore *Instr) = 0; virtual void lowerStore(const InstStore *Instr) = 0;
virtual void lowerSwitch(const InstSwitch *Instr) = 0; virtual void lowerSwitch(const InstSwitch *Instr) = 0;
virtual void lowerUnreachable(const InstUnreachable *Instr) = 0; virtual void lowerUnreachable(const InstUnreachable *Instr) = 0;
......
...@@ -1020,6 +1020,7 @@ void TargetARM32::translateO2() { ...@@ -1020,6 +1020,7 @@ void TargetARM32::translateO2() {
// Address mode optimization. // Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs); Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt(); Func->doAddressOpt();
Func->materializeVectorShuffles();
// Argument lowering // Argument lowering
Func->doArgLowering(); Func->doArgLowering();
...@@ -5812,6 +5813,44 @@ void TargetARM32::lowerRet(const InstRet *Instr) { ...@@ -5812,6 +5813,44 @@ void TargetARM32::lowerRet(const InstRet *Instr) {
Context.insert<InstFakeUse>(SP); Context.insert<InstFakeUse>(SP);
} }
void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
auto *Dest = Instr->getDest();
const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy);
switch (DestTy) {
default:
break;
// TODO(jpp): figure out how to properly lower this without scalarization.
}
// Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T);
auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue();
auto *ExtElmt = makeReg(ElementType);
if (Elem < NumElements) {
lowerExtractElement(
InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else {
lowerExtractElement(InstExtractElement::create(
Func, ExtElmt, Src1,
Ctx->getConstantInt32(Index->getValue() - NumElements)));
}
auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
Ctx->getConstantInt32(I)));
T = NewT;
}
_mov(Dest, T);
}
void TargetARM32::lowerSelect(const InstSelect *Instr) { void TargetARM32::lowerSelect(const InstSelect *Instr) {
Variable *Dest = Instr->getDest(); Variable *Dest = Instr->getDest();
Type DestTy = Dest->getType(); Type DestTy = Dest->getType();
......
...@@ -285,6 +285,7 @@ protected: ...@@ -285,6 +285,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -1145,6 +1145,10 @@ void TargetMIPS32::lowerSelect(const InstSelect *Instr) { ...@@ -1145,6 +1145,10 @@ void TargetMIPS32::lowerSelect(const InstSelect *Instr) {
UnimplementedLoweringError(this, Instr); UnimplementedLoweringError(this, Instr);
} }
// Shufflevector lowering is not implemented for MIPS32; the instruction is
// handed to UnimplementedLoweringError.
void TargetMIPS32::lowerShuffleVector(const InstShuffleVector *Instr) {
UnimplementedLoweringError(this, Instr);
}
void TargetMIPS32::lowerStore(const InstStore *Instr) { void TargetMIPS32::lowerStore(const InstStore *Instr) {
UnimplementedLoweringError(this, Instr); UnimplementedLoweringError(this, Instr);
} }
......
...@@ -299,6 +299,7 @@ protected: ...@@ -299,6 +299,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -260,6 +260,7 @@ protected: ...@@ -260,6 +260,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -428,6 +428,7 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { ...@@ -428,6 +428,7 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
// Address mode optimization. // Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs); Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt(); Func->doAddressOpt();
Func->materializeVectorShuffles();
// Find read-modify-write opportunities. Do this after address mode // Find read-modify-write opportunities. Do this after address mode
// optimization so that doAddressOpt() doesn't need to be applied to RMW // optimization so that doAddressOpt() doesn't need to be applied to RMW
...@@ -5570,6 +5571,46 @@ void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) { ...@@ -5570,6 +5571,46 @@ void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
} }
template <typename TraitsType> template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerShuffleVector(
const InstShuffleVector *Instr) {
auto *Dest = Instr->getDest();
const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy);
switch (DestTy) {
default:
break;
// TODO(jpp): figure out how to properly lower this without scalarization.
}
// Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T);
auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue();
auto *ExtElmt = makeReg(ElementType);
if (Elem < NumElements) {
lowerExtractElement(
InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else {
lowerExtractElement(InstExtractElement::create(
Func, ExtElmt, Src1,
Ctx->getConstantInt32(Index->getValue() - NumElements)));
}
auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
Ctx->getConstantInt32(I)));
T = NewT;
}
_movp(Dest, T);
}
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) { void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
Variable *Dest = Select->getDest(); Variable *Dest = Select->getDest();
......
...@@ -15,55 +15,56 @@ ...@@ -15,55 +15,56 @@
#ifndef SUBZERO_SRC_ICETIMERTREE_DEF #ifndef SUBZERO_SRC_ICETIMERTREE_DEF
#define SUBZERO_SRC_ICETIMERTREE_DEF #define SUBZERO_SRC_ICETIMERTREE_DEF
#define TIMERTREE_TABLE \ #define TIMERTREE_TABLE \
/* enum value */ \ /* enum value */ \
X(O2) \ X(O2) \
X(Om1) \ X(Om1) \
X(advancedPhiLowering) \ X(advancedPhiLowering) \
X(alloca) \ X(alloca) \
X(computeLoopNestDepth) \ X(computeLoopNestDepth) \
X(convertToIce) \ X(convertToIce) \
X(deletePhis) \ X(deletePhis) \
X(doAddressOpt) \ X(doAddressOpt) \
X(doArgLowering) \ X(doArgLowering) \
X(doBranchOpt) \ X(doBranchOpt) \
X(doNopInsertion) \ X(doNopInsertion) \
X(emitAsm) \ X(emitAsm) \
X(emitGlobalInitializers) \ X(emitGlobalInitializers) \
X(findRMW) \ X(findRMW) \
X(genCode) \ X(genCode) \
X(genFrame) \ X(genFrame) \
X(genHelpers) \ X(genHelpers) \
X(initUnhandled) \ X(initUnhandled) \
X(linearScan) \ X(linearScan) \
X(liveRange) \ X(liveRange) \
X(liveness) \ X(liveness) \
X(livenessLightweight) \ X(livenessLightweight) \
X(llvmConvert) \ X(llvmConvert) \
X(loadOpt) \ X(loadOpt) \
X(lowerPhiAssignments) \ X(lowerPhiAssignments) \
X(parse) \ X(materializeVectorShuffles) \
X(parseConstants) \ X(parse) \
X(parseFunctions) \ X(parseConstants) \
X(parseFunctionValuesymtabs) \ X(parseFunctions) \
X(parseGlobals) \ X(parseFunctionValuesymtabs) \
X(parseModule) \ X(parseGlobals) \
X(parseModuleValuesymtabs) \ X(parseModule) \
X(parseTypes) \ X(parseModuleValuesymtabs) \
X(phiValidation) \ X(parseTypes) \
X(placePhiLoads) \ X(phiValidation) \
X(placePhiStores) \ X(placePhiLoads) \
X(qEmitPop) \ X(placePhiStores) \
X(qEmitPush) \ X(qEmitPop) \
X(qTransPop) \ X(qEmitPush) \
X(qTransPush) \ X(qTransPop) \
X(regAlloc) \ X(qTransPush) \
X(renumberInstructions) \ X(regAlloc) \
X(szmain) \ X(renumberInstructions) \
X(translate) \ X(szmain) \
X(translateFunctions) \ X(translate) \
X(validateLiveness) \ X(translateFunctions) \
X(vmetadata) \ X(validateLiveness) \
X(vmetadata) \
X(writeELF) X(writeELF)
//#define X(tag) //#define X(tag)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment