Commit a47c11c7 by John Porto

Subzero. Rematerializes shufflevector instructions.

This CL is a first step towards optimizing vector shuffles in Subzero. PNaCl bitcode does not support the shufflevector instruction, so pnacl-clang emits a series of extractelement/insertelement instructions instead. pnacl-llc is then responsible for pattern matching the output bitcode and rematerializing the shufflevector. With this CL, we enable shufflevector rematerialization in Subzero. To keep this CL simple, we do not introduce an efficient shufflevector lowering; instead, we scalarize the rematerialized instructions. BUG= https://bugs.chromium.org/p/nativeclient/issues/detail?id=4136 R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1897243002 .
parent 64045042
...@@ -206,6 +206,9 @@ public: ...@@ -206,6 +206,9 @@ public:
/// entry block and emit stack or frame pointer-relative addressing. /// entry block and emit stack or frame pointer-relative addressing.
void processAllocas(bool SortAndCombine); void processAllocas(bool SortAndCombine);
void doAddressOpt(); void doAddressOpt();
/// Find clusters of insertelement/extractelement instructions that can be
/// replaced by a shufflevector instruction.
void materializeVectorShuffles();
void doArgLowering(); void doArgLowering();
void doNopInsertion(); void doNopInsertion();
void genCode(); void genCode();
......
...@@ -327,6 +327,8 @@ struct dev_list_flag {}; ...@@ -327,6 +327,8 @@ struct dev_list_flag {};
clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"), \ clEnumValN(Ice::IceV_RMW, "rmw", "ReadModifyWrite optimization"), \
clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"), \ clEnumValN(Ice::IceV_Loop, "loop", "Loop nest depth analysis"), \
clEnumValN(Ice::IceV_Mem, "mem", "Memory usage details"), \ clEnumValN(Ice::IceV_Mem, "mem", "Memory usage details"), \
clEnumValN(Ice::IceV_ShufMat, "shufvec", \
"Shufflevector rematerialization"), \
clEnumValN(Ice::IceV_Status, "status", \ clEnumValN(Ice::IceV_Status, "status", \
"Print the name of the function being translated"), \ "Print the name of the function being translated"), \
clEnumValN(Ice::IceV_AvailableRegs, "registers", \ clEnumValN(Ice::IceV_AvailableRegs, "registers", \
......
...@@ -342,6 +342,7 @@ enum VerboseItem { ...@@ -342,6 +342,7 @@ enum VerboseItem {
IceV_GlobalInit = 1 << 22, IceV_GlobalInit = 1 << 22,
IceV_ConstPoolStats = 1 << 23, IceV_ConstPoolStats = 1 << 23,
IceV_Wasm = 1 << 24, IceV_Wasm = 1 << 24,
IceV_ShufMat = 1 << 25,
IceV_All = ~IceV_None, IceV_All = ~IceV_None,
IceV_Most = IceV_Most =
IceV_All & ~IceV_LinearScan & ~IceV_GlobalInit & ~IceV_ConstPoolStats IceV_All & ~IceV_LinearScan & ~IceV_GlobalInit & ~IceV_ConstPoolStats
......
...@@ -112,6 +112,7 @@ const char *Inst::getInstName() const { ...@@ -112,6 +112,7 @@ const char *Inst::getInstName() const {
X(FakeUse, "fakeuse"); X(FakeUse, "fakeuse");
X(FakeKill, "fakekill"); X(FakeKill, "fakekill");
X(JumpTable, "jumptable"); X(JumpTable, "jumptable");
X(ShuffleVector, "shufflevector");
#undef X #undef X
default: default:
assert(Kind >= Target); assert(Kind >= Target);
...@@ -574,6 +575,15 @@ InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight) ...@@ -574,6 +575,15 @@ InstFakeUse::InstFakeUse(Cfg *Func, Variable *Src, uint32_t Weight)
InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked) InstFakeKill::InstFakeKill(Cfg *Func, const Inst *Linked)
: InstHighLevel(Func, Inst::FakeKill, 0, nullptr), Linked(Linked) {} : InstHighLevel(Func, Inst::FakeKill, 0, nullptr), Linked(Linked) {}
// Builds a shufflevector instruction writing to Dest with the two operands
// Src0 and Src1. One index slot is reserved per element of Dest's type; the
// slots themselves are left for the caller to fill in afterwards.
InstShuffleVector::InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0,
                                     Variable *Src1)
    : InstHighLevel(Func, Inst::ShuffleVector, 2, Dest),
      NumIndexes(typeNumElements(Dest->getType())) {
  // The index array is set up here, in the body, so that NumIndexes is
  // already initialized when it is read.
  Indexes = Func->allocateArrayOf<ConstantInteger32 *>(NumIndexes);

  // Register both vector operands, first Src0 and then Src1.
  addSource(Src0);
  addSource(Src1);
}
namespace { namespace {
GlobalString makeName(Cfg *Func, const SizeT Id) { GlobalString makeName(Cfg *Func, const SizeT Id) {
const auto FuncName = Func->getFunctionName(); const auto FuncName = Func->getFunctionName();
...@@ -1032,6 +1042,21 @@ void InstFakeKill::dump(const Cfg *Func) const { ...@@ -1032,6 +1042,21 @@ void InstFakeKill::dump(const Cfg *Func) const {
Str << "kill.pseudo scratch_regs"; Str << "kill.pseudo scratch_regs";
} }
void InstShuffleVector::dump(const Cfg *Func) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Func->getContext()->getStrDump();
Str << "shufflevector ";
dumpDest(Func);
Str << " = ";
dumpSources(Func);
for (SizeT I = 0; I < NumIndexes; ++I) {
Str << ", ";
Indexes[I]->dump(Func);
}
Str << "\n";
}
void InstJumpTable::dump(const Cfg *Func) const { void InstJumpTable::dump(const Cfg *Func) const {
if (!BuildDefs::dump()) if (!BuildDefs::dump())
return; return;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include "IceDefs.h" #include "IceDefs.h"
#include "IceInst.def" #include "IceInst.def"
#include "IceIntrinsics.h" #include "IceIntrinsics.h"
#include "IceOperand.h"
#include "IceSwitchLowering.h" #include "IceSwitchLowering.h"
#include "IceTypes.h" #include "IceTypes.h"
...@@ -61,14 +62,15 @@ public: ...@@ -61,14 +62,15 @@ public:
Select, Select,
Store, Store,
Switch, Switch,
Assign, // not part of LLVM/PNaCl bitcode Assign, // not part of LLVM/PNaCl bitcode
Breakpoint, // not part of LLVM/PNaCl bitcode Breakpoint, // not part of LLVM/PNaCl bitcode
BundleLock, // not part of LLVM/PNaCl bitcode BundleLock, // not part of LLVM/PNaCl bitcode
BundleUnlock, // not part of LLVM/PNaCl bitcode BundleUnlock, // not part of LLVM/PNaCl bitcode
FakeDef, // not part of LLVM/PNaCl bitcode FakeDef, // not part of LLVM/PNaCl bitcode
FakeUse, // not part of LLVM/PNaCl bitcode FakeUse, // not part of LLVM/PNaCl bitcode
FakeKill, // not part of LLVM/PNaCl bitcode FakeKill, // not part of LLVM/PNaCl bitcode
JumpTable, // not part of LLVM/PNaCl bitcode JumpTable, // not part of LLVM/PNaCl bitcode
ShuffleVector, // not part of LLVM/PNaCl bitcode
// Anything >= Target is an InstTarget subclass. Note that the value-spaces // Anything >= Target is an InstTarget subclass. Note that the value-spaces
// are shared across targets. To avoid confusion over the definition of // are shared across targets. To avoid confusion over the definition of
// shared values, an object specific to one target should never be passed // shared values, an object specific to one target should never be passed
...@@ -917,6 +919,52 @@ private: ...@@ -917,6 +919,52 @@ private:
const Inst *Linked; const Inst *Linked;
}; };
/// ShuffleVector instruction. This represents a shuffle operation on vector
/// types. This instruction is not part of the PNaCl bitcode: it is generated
/// by Subzero when it matches the pattern used by pnacl-clang when compiling
/// to bitcode.
///
/// The instruction carries two operands (Src0 and Src1) plus a list of
/// constant indexes that is filled in one entry at a time with addIndex().
/// The scalarizing lowerings that accompany this class treat an index below
/// the destination's element count as an element of Src0, and any other index
/// as an element of Src1 after subtracting that element count.
class InstShuffleVector : public InstHighLevel {
// Instances are only built through create(); default construction and
// copying are disabled.
InstShuffleVector() = delete;
InstShuffleVector(const InstShuffleVector &) = delete;
InstShuffleVector &operator=(const InstShuffleVector &) = delete;
public:
/// Constructs a new shufflevector in storage obtained from
/// Func->allocate<InstShuffleVector>() and returns it. The index list starts
/// out with no entries added; callers are expected to follow up with
/// addIndex().
static InstShuffleVector *create(Cfg *Func, Variable *Dest, Variable *Src0,
Variable *Src1) {
return new (Func->allocate<InstShuffleVector>())
InstShuffleVector(Func, Dest, Src0, Src1);
}
/// Returns the capacity of the index list (fixed at construction).
SizeT getNumIndexes() const { return NumIndexes; }
/// Stores Index in the next free slot of the index list. Asserts that the
/// list is not already full.
void addIndex(ConstantInteger32 *Index) {
assert(CurrentIndex < NumIndexes);
Indexes[CurrentIndex++] = Index;
}
/// Returns the index stored at slot Pos. Asserts that Pos is within the
/// list's capacity; it does not check that the slot was filled by
/// addIndex().
ConstantInteger32 *getIndex(SizeT Pos) const {
assert(Pos < NumIndexes);
return Indexes[Pos];
}
/// Prints the instruction; defined out of line.
void dump(const Cfg *Func) const override;
/// Kind test used for llvm-style casts (e.g. llvm::cast<InstShuffleVector>).
static bool classof(const Inst *Instr) {
return Instr->getKind() == ShuffleVector;
}
private:
InstShuffleVector(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1);
/// Hands the index array back to Func before running the base-class
/// destroy().
void destroy(Cfg *Func) override {
Func->deallocateArrayOf<ConstantInteger32 *>(Indexes);
Inst::destroy(Func);
}
// Array of NumIndexes slots obtained from Func in the constructor.
ConstantInteger32 **Indexes;
// Number of slots filled so far by addIndex().
SizeT CurrentIndex = 0;
// Capacity of Indexes. NOTE(review): the out-of-line constructor reads this
// member when allocating Indexes, so keep the declaration order and the
// in-body allocation in sync if either changes.
const SizeT NumIndexes;
};
/// JumpTable instruction. This represents a jump table that will be stored in /// JumpTable instruction. This represents a jump table that will be stored in
/// the .rodata section. This is used to track and repoint the target CfgNodes /// the .rodata section. This is used to track and repoint the target CfgNodes
/// which may change, for example due to splitting for phi lowering. /// which may change, for example due to splitting for phi lowering.
......
...@@ -440,6 +440,9 @@ void TargetLowering::lower() { ...@@ -440,6 +440,9 @@ void TargetLowering::lower() {
case Inst::Select: case Inst::Select:
lowerSelect(llvm::cast<InstSelect>(Instr)); lowerSelect(llvm::cast<InstSelect>(Instr));
break; break;
case Inst::ShuffleVector:
lowerShuffleVector(llvm::cast<InstShuffleVector>(Instr));
break;
case Inst::Store: case Inst::Store:
lowerStore(llvm::cast<InstStore>(Instr)); lowerStore(llvm::cast<InstStore>(Instr));
break; break;
......
...@@ -387,6 +387,7 @@ protected: ...@@ -387,6 +387,7 @@ protected:
virtual void lowerPhi(const InstPhi *Instr) = 0; virtual void lowerPhi(const InstPhi *Instr) = 0;
virtual void lowerRet(const InstRet *Instr) = 0; virtual void lowerRet(const InstRet *Instr) = 0;
virtual void lowerSelect(const InstSelect *Instr) = 0; virtual void lowerSelect(const InstSelect *Instr) = 0;
virtual void lowerShuffleVector(const InstShuffleVector *Instr) = 0;
virtual void lowerStore(const InstStore *Instr) = 0; virtual void lowerStore(const InstStore *Instr) = 0;
virtual void lowerSwitch(const InstSwitch *Instr) = 0; virtual void lowerSwitch(const InstSwitch *Instr) = 0;
virtual void lowerUnreachable(const InstUnreachable *Instr) = 0; virtual void lowerUnreachable(const InstUnreachable *Instr) = 0;
......
...@@ -1020,6 +1020,7 @@ void TargetARM32::translateO2() { ...@@ -1020,6 +1020,7 @@ void TargetARM32::translateO2() {
// Address mode optimization. // Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs); Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt(); Func->doAddressOpt();
Func->materializeVectorShuffles();
// Argument lowering // Argument lowering
Func->doArgLowering(); Func->doArgLowering();
...@@ -5812,6 +5813,44 @@ void TargetARM32::lowerRet(const InstRet *Instr) { ...@@ -5812,6 +5813,44 @@ void TargetARM32::lowerRet(const InstRet *Instr) {
Context.insert<InstFakeUse>(SP); Context.insert<InstFakeUse>(SP);
} }
void TargetARM32::lowerShuffleVector(const InstShuffleVector *Instr) {
auto *Dest = Instr->getDest();
const Type DestTy = Dest->getType();
auto *T = makeReg(DestTy);
switch (DestTy) {
default:
break;
// TODO(jpp): figure out how to properly lower this without scalarization.
}
// Unoptimized shuffle. Perform a series of inserts and extracts.
Context.insert<InstFakeDef>(T);
auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
const SizeT NumElements = typeNumElements(DestTy);
const Type ElementType = typeElementType(DestTy);
for (SizeT I = 0; I < Instr->getNumIndexes(); ++I) {
auto *Index = Instr->getIndex(I);
const SizeT Elem = Index->getValue();
auto *ExtElmt = makeReg(ElementType);
if (Elem < NumElements) {
lowerExtractElement(
InstExtractElement::create(Func, ExtElmt, Src0, Index));
} else {
lowerExtractElement(InstExtractElement::create(
Func, ExtElmt, Src1,
Ctx->getConstantInt32(Index->getValue() - NumElements)));
}
auto *NewT = makeReg(DestTy);
lowerInsertElement(InstInsertElement::create(Func, NewT, T, ExtElmt,
Ctx->getConstantInt32(I)));
T = NewT;
}
_mov(Dest, T);
}
void TargetARM32::lowerSelect(const InstSelect *Instr) { void TargetARM32::lowerSelect(const InstSelect *Instr) {
Variable *Dest = Instr->getDest(); Variable *Dest = Instr->getDest();
Type DestTy = Dest->getType(); Type DestTy = Dest->getType();
......
...@@ -285,6 +285,7 @@ protected: ...@@ -285,6 +285,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -1145,6 +1145,10 @@ void TargetMIPS32::lowerSelect(const InstSelect *Instr) { ...@@ -1145,6 +1145,10 @@ void TargetMIPS32::lowerSelect(const InstSelect *Instr) {
UnimplementedLoweringError(this, Instr); UnimplementedLoweringError(this, Instr);
} }
// Shufflevector lowering is not implemented for MIPS32: the instruction is
// simply handed to UnimplementedLoweringError.
void TargetMIPS32::lowerShuffleVector(const InstShuffleVector *Instr) {
UnimplementedLoweringError(this, Instr);
}
void TargetMIPS32::lowerStore(const InstStore *Instr) { void TargetMIPS32::lowerStore(const InstStore *Instr) {
UnimplementedLoweringError(this, Instr); UnimplementedLoweringError(this, Instr);
} }
......
...@@ -299,6 +299,7 @@ protected: ...@@ -299,6 +299,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -260,6 +260,7 @@ protected: ...@@ -260,6 +260,7 @@ protected:
void lowerPhi(const InstPhi *Instr) override; void lowerPhi(const InstPhi *Instr) override;
void lowerRet(const InstRet *Instr) override; void lowerRet(const InstRet *Instr) override;
void lowerSelect(const InstSelect *Instr) override; void lowerSelect(const InstSelect *Instr) override;
void lowerShuffleVector(const InstShuffleVector *Instr) override;
void lowerStore(const InstStore *Instr) override; void lowerStore(const InstStore *Instr) override;
void lowerSwitch(const InstSwitch *Instr) override; void lowerSwitch(const InstSwitch *Instr) override;
void lowerUnreachable(const InstUnreachable *Instr) override; void lowerUnreachable(const InstUnreachable *Instr) override;
......
...@@ -428,6 +428,7 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { ...@@ -428,6 +428,7 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
// Address mode optimization. // Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs); Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt(); Func->doAddressOpt();
Func->materializeVectorShuffles();
// Find read-modify-write opportunities. Do this after address mode // Find read-modify-write opportunities. Do this after address mode
// optimization so that doAddressOpt() doesn't need to be applied to RMW // optimization so that doAddressOpt() doesn't need to be applied to RMW
...@@ -5570,6 +5571,46 @@ void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) { ...@@ -5570,6 +5571,46 @@ void TargetX86Base<TraitsType>::lowerRet(const InstRet *Instr) {
} }
template <typename TraitsType> template <typename TraitsType>
// Lowers a shufflevector by scalarizing it. For every index of the
// instruction, one element is extracted from the selected source operand and
// inserted into a fresh vector temporary that builds on the previous one; the
// last temporary is finally moved into the destination.
void TargetX86Base<TraitsType>::lowerShuffleVector(
    const InstShuffleVector *Instr) {
  auto *Dest = Instr->getDest();
  const Type VecTy = Dest->getType();
  auto *Accum = makeReg(VecTy);

  switch (VecTy) {
  default:
    break;
    // TODO(jpp): figure out how to properly lower this without scalarization.
  }

  // Unoptimized shuffle. Perform a series of inserts and extracts.
  // The initial temporary has no real definition, so give it a fake one
  // before it is used as the base of the first insert.
  Context.insert<InstFakeDef>(Accum);

  auto *Src0 = llvm::cast<Variable>(Instr->getSrc(0));
  auto *Src1 = llvm::cast<Variable>(Instr->getSrc(1));
  const SizeT LaneCount = typeNumElements(VecTy);
  const Type LaneTy = typeElementType(VecTy);
  const SizeT IndexCount = Instr->getNumIndexes();

  for (SizeT Lane = 0; Lane < IndexCount; ++Lane) {
    auto *Selector = Instr->getIndex(Lane);
    const SizeT Picked = Selector->getValue();
    auto *Scalar = makeReg(LaneTy);

    // Selectors below LaneCount address Src0 directly; all others address
    // Src1 once rebased by LaneCount.
    Variable *From = Src0;
    Operand *At = Selector;
    if (Picked >= LaneCount) {
      From = Src1;
      At = Ctx->getConstantInt32(Picked - LaneCount);
    }
    lowerExtractElement(InstExtractElement::create(Func, Scalar, From, At));

    auto *Next = makeReg(VecTy);
    lowerInsertElement(InstInsertElement::create(
        Func, Next, Accum, Scalar, Ctx->getConstantInt32(Lane)));
    Accum = Next;
  }

  _movp(Dest, Accum);
}
template <typename TraitsType>
void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) { void TargetX86Base<TraitsType>::lowerSelect(const InstSelect *Select) {
Variable *Dest = Select->getDest(); Variable *Dest = Select->getDest();
......
...@@ -15,55 +15,56 @@ ...@@ -15,55 +15,56 @@
#ifndef SUBZERO_SRC_ICETIMERTREE_DEF #ifndef SUBZERO_SRC_ICETIMERTREE_DEF
#define SUBZERO_SRC_ICETIMERTREE_DEF #define SUBZERO_SRC_ICETIMERTREE_DEF
#define TIMERTREE_TABLE \ #define TIMERTREE_TABLE \
/* enum value */ \ /* enum value */ \
X(O2) \ X(O2) \
X(Om1) \ X(Om1) \
X(advancedPhiLowering) \ X(advancedPhiLowering) \
X(alloca) \ X(alloca) \
X(computeLoopNestDepth) \ X(computeLoopNestDepth) \
X(convertToIce) \ X(convertToIce) \
X(deletePhis) \ X(deletePhis) \
X(doAddressOpt) \ X(doAddressOpt) \
X(doArgLowering) \ X(doArgLowering) \
X(doBranchOpt) \ X(doBranchOpt) \
X(doNopInsertion) \ X(doNopInsertion) \
X(emitAsm) \ X(emitAsm) \
X(emitGlobalInitializers) \ X(emitGlobalInitializers) \
X(findRMW) \ X(findRMW) \
X(genCode) \ X(genCode) \
X(genFrame) \ X(genFrame) \
X(genHelpers) \ X(genHelpers) \
X(initUnhandled) \ X(initUnhandled) \
X(linearScan) \ X(linearScan) \
X(liveRange) \ X(liveRange) \
X(liveness) \ X(liveness) \
X(livenessLightweight) \ X(livenessLightweight) \
X(llvmConvert) \ X(llvmConvert) \
X(loadOpt) \ X(loadOpt) \
X(lowerPhiAssignments) \ X(lowerPhiAssignments) \
X(parse) \ X(materializeVectorShuffles) \
X(parseConstants) \ X(parse) \
X(parseFunctions) \ X(parseConstants) \
X(parseFunctionValuesymtabs) \ X(parseFunctions) \
X(parseGlobals) \ X(parseFunctionValuesymtabs) \
X(parseModule) \ X(parseGlobals) \
X(parseModuleValuesymtabs) \ X(parseModule) \
X(parseTypes) \ X(parseModuleValuesymtabs) \
X(phiValidation) \ X(parseTypes) \
X(placePhiLoads) \ X(phiValidation) \
X(placePhiStores) \ X(placePhiLoads) \
X(qEmitPop) \ X(placePhiStores) \
X(qEmitPush) \ X(qEmitPop) \
X(qTransPop) \ X(qEmitPush) \
X(qTransPush) \ X(qTransPop) \
X(regAlloc) \ X(qTransPush) \
X(renumberInstructions) \ X(regAlloc) \
X(szmain) \ X(renumberInstructions) \
X(translate) \ X(szmain) \
X(translateFunctions) \ X(translate) \
X(validateLiveness) \ X(translateFunctions) \
X(vmetadata) \ X(validateLiveness) \
X(vmetadata) \
X(writeELF) X(writeELF)
//#define X(tag) //#define X(tag)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment