Commit f47d520c by Manasij Mukherjee

Loop Invariant Code Motion

Implemented behind the new -licm flag. Hoists invariant arithmetic instructions from loop bodies to pre-headers. Does not trigger for loops where headers have two incoming edges from outside the loop. Also enables multi block address optimization, because most of the instructions hoisted are address calculations coming from gep. Does not touch memory operations. This algorithm does not seem to work well for load-hoisting. BUG=none R=jpp@chromium.org, stichnot@chromium.org Review URL: https://codereview.chromium.org/2138443002 .
parent c930d59b
...@@ -640,6 +640,103 @@ void Cfg::localCSE() { ...@@ -640,6 +640,103 @@ void Cfg::localCSE() {
} }
} }
void Cfg::loopInvariantCodeMotion() {
TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
// Does not introduce new nodes as of now.
for (auto &Pair : LoopInfo) {
CfgNode *Header = Nodes[Pair.first];
assert(Header);
if (Header->getLoopNestDepth() < 1)
continue;
CfgNode *PreHeader = nullptr;
for (auto *Pred : Header->getInEdges()) {
if (Pred->getLoopNestDepth() == Header->getLoopNestDepth() - 1) {
if (PreHeader == nullptr) {
PreHeader = Pred;
} else {
PreHeader = nullptr;
break;
// Do not consider cases with two incoming edges.
// Will require insertion of nodes.
}
}
}
if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
continue; // to next loop
}
auto &Insts = PreHeader->getInsts();
auto &LastInst = Insts.back();
Insts.pop_back();
for (auto *Inst : findLoopInvariantInstructions(Pair.first)) {
PreHeader->appendInst(Inst);
}
PreHeader->appendInst(&LastInst);
}
}
Ice::CfgVector<Inst *>
Cfg::findLoopInvariantInstructions(Ice::SizeT LoopHeaderIndex) {
CfgUnorderedSet<Inst *> InvariantInsts;
CfgUnorderedSet<Variable *> InvariantVars;
for (auto *Var : getArgs()) {
InvariantVars.insert(Var);
}
bool Changed = false;
do {
Changed = false;
for (auto NodeIndex : LoopInfo[LoopHeaderIndex]) {
auto *Node = Nodes[NodeIndex];
CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
Node->getInsts().end());
for (auto &InstRef : Insts) {
auto &Inst = InstRef.get();
if (Inst.isDeleted() ||
InvariantInsts.find(&Inst) != InvariantInsts.end())
continue;
switch (Inst.getKind()) {
case Inst::InstKind::Alloca:
case Inst::InstKind::Br:
case Inst::InstKind::Ret:
case Inst::InstKind::Phi:
case Inst::InstKind::Call:
case Inst::InstKind::IntrinsicCall:
case Inst::InstKind::Load:
case Inst::InstKind::Store:
case Inst::InstKind::Switch:
continue;
default:
break;
}
bool IsInvariant = true;
for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
if (InvariantVars.find(Var) == InvariantVars.end()) {
IsInvariant = false;
}
}
}
if (IsInvariant) {
Changed = true;
InvariantInsts.insert(&Inst);
Node->getInsts().remove(Inst);
if (Inst.getDest() != nullptr) {
InvariantVars.insert(Inst.getDest());
}
}
}
}
} while (Changed);
CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
std::sort(InstVector.begin(), InstVector.end(),
[](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
return InstVector;
}
void Cfg::shortCircuitJumps() { void Cfg::shortCircuitJumps() {
// Split Nodes whenever an early jump is possible. // Split Nodes whenever an early jump is possible.
// __N : // __N :
...@@ -1338,10 +1435,9 @@ void Cfg::genFrame() { ...@@ -1338,10 +1435,9 @@ void Cfg::genFrame() {
getTarget()->addEpilog(Node); getTarget()->addEpilog(Node);
} }
void Cfg::computeLoopNestDepth() { void Cfg::generateLoopInfo() {
TimerMarker T(TimerStack::TT_computeLoopNestDepth, this); TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
LoopAnalyzer LA(this); LoopInfo = LoopAnalyzer(this).getLoopInfo();
LA.computeLoopNestDepth();
} }
// This is a lightweight version of live-range-end calculation. Marks the last // This is a lightweight version of live-range-end calculation. Marks the last
......
...@@ -202,6 +202,7 @@ public: ...@@ -202,6 +202,7 @@ public:
void shuffleNodes(); void shuffleNodes();
void localCSE(); void localCSE();
void shortCircuitJumps(); void shortCircuitJumps();
void loopInvariantCodeMotion();
/// Scan allocas to determine whether we need to use a frame pointer. /// Scan allocas to determine whether we need to use a frame pointer.
/// If SortAndCombine == true, merge all the fixed-size allocas in the /// If SortAndCombine == true, merge all the fixed-size allocas in the
...@@ -215,7 +216,7 @@ public: ...@@ -215,7 +216,7 @@ public:
void doNopInsertion(); void doNopInsertion();
void genCode(); void genCode();
void genFrame(); void genFrame();
void computeLoopNestDepth(); void generateLoopInfo();
void livenessLightweight(); void livenessLightweight();
void liveness(LivenessMode Mode); void liveness(LivenessMode Mode);
bool validateLiveness() const; bool validateLiveness() const;
...@@ -300,6 +301,7 @@ private: ...@@ -300,6 +301,7 @@ private:
uint32_t CombinedAlignment, InstList &Insts, uint32_t CombinedAlignment, InstList &Insts,
AllocaBaseVariableType BaseVariableType); AllocaBaseVariableType BaseVariableType);
void findRematerializable(); void findRematerializable();
CfgVector<Inst *> findLoopInvariantInstructions(SizeT LoopHeaderIndex);
GlobalContext *Ctx; GlobalContext *Ctx;
uint32_t SequenceNumber; /// output order for emission uint32_t SequenceNumber; /// output order for emission
...@@ -330,7 +332,7 @@ private: ...@@ -330,7 +332,7 @@ private:
/// Globals required by this CFG. Mostly used for the profiler's globals. /// Globals required by this CFG. Mostly used for the profiler's globals.
std::unique_ptr<VariableDeclarationList> GlobalInits; std::unique_ptr<VariableDeclarationList> GlobalInits;
CfgVector<InstJumpTable *> JumpTables; CfgVector<InstJumpTable *> JumpTables;
CfgUnorderedMap<SizeT, CfgVector<SizeT>> LoopInfo;
/// CurrentNode is maintained during dumping/emitting just for validating /// CurrentNode is maintained during dumping/emitting just for validating
/// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but /// Variable::DefNode. Normally, a traversal over CfgNodes maintains this, but
/// before global operations like register allocation, resetCurrentNode() /// before global operations like register allocation, resetCurrentNode()
......
...@@ -186,6 +186,9 @@ struct dev_list_flag {}; ...@@ -186,6 +186,9 @@ struct dev_list_flag {};
X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \ X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \
cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \ cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \
\ \
X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm", \
cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false)) \
\
X(LogFilename, std::string, dev_opt_flag, "log", \ X(LogFilename, std::string, dev_opt_flag, "log", \
cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename")) \ cl::desc("Set log filename"), cl::init("-"), cl::value_desc("filename")) \
\ \
......
...@@ -54,6 +54,7 @@ LoopAnalyzer::LoopAnalyzer(Cfg *Fn) : Func(Fn) { ...@@ -54,6 +54,7 @@ LoopAnalyzer::LoopAnalyzer(Cfg *Fn) : Func(Fn) {
// Create the LoopNodes from the function's CFG // Create the LoopNodes from the function's CFG
for (CfgNode *Node : Nodes) for (CfgNode *Node : Nodes)
AllNodes.emplace_back(Node); AllNodes.emplace_back(Node);
computeLoopNestDepth();
} }
void LoopAnalyzer::computeLoopNestDepth() { void LoopAnalyzer::computeLoopNestDepth() {
...@@ -141,6 +142,12 @@ LoopAnalyzer::processNode(LoopAnalyzer::LoopNode &Node) { ...@@ -141,6 +142,12 @@ LoopAnalyzer::processNode(LoopAnalyzer::LoopNode &Node) {
if (*It == &Node) { if (*It == &Node) {
(*It)->setDeleted(); (*It)->setDeleted();
++NumDeletedNodes; ++NumDeletedNodes;
CfgVector<SizeT> LoopNodes;
for (auto LoopIter = It.base() - 1; LoopIter != LoopStack.end();
++LoopIter) {
LoopNodes.push_back((*LoopIter)->getNode()->getIndex());
}
Loops[(*It)->getNode()->getIndex()] = LoopNodes;
LoopStack.erase(It.base() - 1, LoopStack.end()); LoopStack.erase(It.base() - 1, LoopStack.end());
break; break;
} }
......
...@@ -39,9 +39,10 @@ public: ...@@ -39,9 +39,10 @@ public:
// is bounded linear. ncbray suggests another algorithm which is linear in // is bounded linear. ncbray suggests another algorithm which is linear in
// practice but not bounded linear. I think it also finds dominators. // practice but not bounded linear. I think it also finds dominators.
// http://lenx.100871.net/papers/loop-SAS.pdf // http://lenx.100871.net/papers/loop-SAS.pdf
void computeLoopNestDepth(); CfgUnorderedMap<SizeT, CfgVector<SizeT>> getLoopInfo() { return Loops; }
private: private:
void computeLoopNestDepth();
using IndexT = uint32_t; using IndexT = uint32_t;
static constexpr IndexT UndefinedIndex = 0; static constexpr IndexT UndefinedIndex = 0;
static constexpr IndexT FirstDefinedIndex = 1; static constexpr IndexT FirstDefinedIndex = 1;
...@@ -80,6 +81,8 @@ private: ...@@ -80,6 +81,8 @@ private:
void incrementLoopNestDepth(); void incrementLoopNestDepth();
bool hasSelfEdge() const; bool hasSelfEdge() const;
CfgNode *getNode() { return BB; }
private: private:
CfgNode *BB; CfgNode *BB;
NodeList::const_iterator Succ; NodeList::const_iterator Succ;
...@@ -110,6 +113,8 @@ private: ...@@ -110,6 +113,8 @@ private:
/// The number of nodes which have been marked deleted. This is used to track /// The number of nodes which have been marked deleted. This is used to track
/// when the iteration should end. /// when the iteration should end.
LoopNodePtrList::size_type NumDeletedNodes = 0; LoopNodePtrList::size_type NumDeletedNodes = 0;
/// Detailed loop information
CfgUnorderedMap<SizeT, CfgVector<SizeT>> Loops;
}; };
} // end of namespace Ice } // end of namespace Ice
......
...@@ -443,6 +443,18 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { ...@@ -443,6 +443,18 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
Func->processAllocas(SortAndCombineAllocas); Func->processAllocas(SortAndCombineAllocas);
Func->dump("After Alloca processing"); Func->dump("After Alloca processing");
// Run this early so it can be used to focus optimizations on potentially hot
// code.
// TODO(stichnot,ascull): currently only used for regalloc not
// expensive high level optimizations which could be focused on potentially
// hot code.
Func->generateLoopInfo();
Func->dump("After loop analysis");
if (getFlags().getLoopInvariantCodeMotion()) {
Func->loopInvariantCodeMotion();
Func->dump("After LICM");
}
if (getFlags().getEnableExperimental()) { if (getFlags().getEnableExperimental()) {
Func->localCSE(); Func->localCSE();
Func->dump("After Local CSE"); Func->dump("After Local CSE");
...@@ -466,14 +478,6 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { ...@@ -466,14 +478,6 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
Func->dump("After Phi lowering"); Func->dump("After Phi lowering");
} }
// Run this early so it can be used to focus optimizations on potentially hot
// code.
// TODO(stichnot,ascull): currently only used for regalloc not
// expensive high level optimizations which could be focused on potentially
// hot code.
Func->computeLoopNestDepth();
Func->dump("After loop nest depth analysis");
// Address mode optimization. // Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs); Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt(); Func->doAddressOpt();
...@@ -5415,10 +5419,13 @@ TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType, ...@@ -5415,10 +5419,13 @@ TargetX86Base<TypeTraits>::computeAddressOpt(const Inst *Instr, Type MemType,
// don't go further. Alternatively (?), never consider a transformation that // don't go further. Alternatively (?), never consider a transformation that
// would change a variable that is currently *not* live across basic block // would change a variable that is currently *not* live across basic block
// boundaries into one that *is*. // boundaries into one that *is*.
if (!getFlags().getLoopInvariantCodeMotion()) {
// Need multi block address opt when licm is enabled.
// Might make sense to restrict to current node and loop header.
if (Func->getVMetadata()->isMultiBlock( if (Func->getVMetadata()->isMultiBlock(
NewAddr.Base) /* || Base->getUseCount() > 1*/) NewAddr.Base) /* || Base->getUseCount() > 1*/)
return nullptr; return nullptr;
}
AddressOptimizer AddrOpt(Func); AddressOptimizer AddrOpt(Func);
const bool MockBounds = getFlags().getMockBoundsCheck(); const bool MockBounds = getFlags().getMockBoundsCheck();
const Inst *Reason = nullptr; const Inst *Reason = nullptr;
......
...@@ -42,6 +42,7 @@ ...@@ -42,6 +42,7 @@
X(llvmConvert) \ X(llvmConvert) \
X(loadOpt) \ X(loadOpt) \
X(localCse) \ X(localCse) \
X(loopInvariantCodeMotion) \
X(lowerPhiAssignments) \ X(lowerPhiAssignments) \
X(materializeVectorShuffles) \ X(materializeVectorShuffles) \
X(parse) \ X(parse) \
......
; Tests if the licm flag successfully hoists the add from loop0 to entry
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
; RUN: -O2 -licm | FileCheck --check-prefix ENABLE %s
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
; RUN: -O2 | FileCheck --check-prefix NOENABLE %s
define internal void @dummy() {
entry:
ret void
}
define internal i32 @test_licm(i32 %a32, i32 %b, i32 %c) {
entry:
%a = trunc i32 %a32 to i1
br label %loop0
loop0: ; <-+
call void @dummy() ; |
%add1 = add i32 %b, %c ; |
br label %loop1 ; |
loop1: ; |
br i1 %a, label %loop0, label %out ; --+
out:
ret i32 %add1
}
; CHECK-LABEL: test_licm
; ENABLE: add
; ENABLE: call
; NOENABLE: call
; NOENABLE-NEXT: mov
; NOENABLE-NEXT: add
; CHECK: ret
\ No newline at end of file
...@@ -20,7 +20,7 @@ out: ...@@ -20,7 +20,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0: ; CHECK-NEXT: loop0:
...@@ -48,7 +48,7 @@ out: ...@@ -48,7 +48,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0: ; CHECK-NEXT: loop0:
...@@ -78,7 +78,7 @@ out: ; <---+ ...@@ -78,7 +78,7 @@ out: ; <---+
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0: ; CHECK-NEXT: loop0:
...@@ -110,7 +110,7 @@ out: ...@@ -110,7 +110,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0: ; CHECK-NEXT: loop0_0:
...@@ -148,7 +148,7 @@ out: ...@@ -148,7 +148,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0: ; CHECK-NEXT: loop0_0:
...@@ -190,7 +190,7 @@ out: ...@@ -190,7 +190,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0: ; CHECK-NEXT: loop0_0:
...@@ -235,7 +235,7 @@ out: ...@@ -235,7 +235,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: loop0_0: ; CHECK-NEXT: loop0_0:
...@@ -269,7 +269,7 @@ out: ...@@ -269,7 +269,7 @@ out:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: left: ; CHECK-NEXT: left:
...@@ -296,7 +296,7 @@ exit: ...@@ -296,7 +296,7 @@ exit:
ret void ret void
} }
; CHECK-LABEL: After loop nest depth analysis ; CHECK-LABEL: After loop analysis
; CHECK-NEXT: entry: ; CHECK-NEXT: entry:
; CHECK-NEXT: LoopNestDepth = 0 ; CHECK-NEXT: LoopNestDepth = 0
; CHECK-NEXT: body: ; CHECK-NEXT: body:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment