Commit 53c8fbdf by Manasij Mukherjee

Enable Local CSE by default

Reduce the default number of iterations to 1 Put the optional code behind the -lcse-no-ssa flag, which is disabled by default. This brings down the overhead of enabling this to about 2%. BUG= R=stichnot@chromium.org Review URL: https://codereview.chromium.org/2185193002 .
parent b9a84728
...@@ -514,19 +514,23 @@ void Cfg::shuffleNodes() { ...@@ -514,19 +514,23 @@ void Cfg::shuffleNodes() {
dump("After basic block shuffling"); dump("After basic block shuffling");
} }
void Cfg::localCSE() { void Cfg::localCSE(bool AssumeSSA) {
// Performs basic-block local common-subexpression elimination // Performs basic-block local common-subexpression elimination
// If we have // If we have
// t1 = op b c // t1 = op b c
// t2 = op b c // t2 = op b c
// This pass will replace future references to t2 in a basic block by t1 // This pass will replace future references to t2 in a basic block by t1
// Points to note: // Points to note:
// 1. Does not assume SSA, but not tested on non-SSA input yet as it is run // 1. Assumes SSA by default. To change this, use -lcse=no-ssa
// at the beginning. // This is needed if this pass is moved to a point later in the pipeline.
// If variables have a single definition (in the node), CSE can work just
// on the basis of an equality compare on instructions (sans Dest). When
// variables can be updated (hence, non-SSA) the result of a previous
// instruction which used that variable as an operand can not be reused.
// 2. Leaves removal of instructions to DCE. // 2. Leaves removal of instructions to DCE.
// 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is expected
// to take care of cases not arising from GEP simplification. // to take care of cases not arising from GEP simplification.
// 4. By default, two passes are made over each basic block. Control this // 4. By default, a single pass is made over each basic block. Control this
// with -lcse-max-iters=N // with -lcse-max-iters=N
TimerMarker T(TimerStack::TT_localCse, this); TimerMarker T(TimerStack::TT_localCse, this);
...@@ -575,41 +579,45 @@ void Cfg::localCSE() { ...@@ -575,41 +579,45 @@ void Cfg::localCSE() {
for (CfgNode *Node : getNodes()) { for (CfgNode *Node : getNodes()) {
CfgUnorderedSet<Inst *, InstHash, InstEq> Seen; CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
// Stores currently available instructions.
CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements; CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
// Combining the above two into a single data structure might consume less // Combining the above two into a single data structure might consume less
// memory but will be slower i.e map of Instruction -> Set of Variables // memory but will be slower i.e map of Instruction -> Set of Variables
CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency; CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
// Not necessary for SSA, still keeping it in case this pass is not run at // Maps a variable to the Instructions that depend on it.
// the beginning. Remove to improve performace. // a = op1 b c
// x = op2 c d
// Will result in the map : b -> {a}, c -> {a, x}, d -> {x}
// Not necessary for SSA as dependencies will never be invalidated, and the
// container will use minimal memory when left unused.
int IterCount = getFlags().getLocalCseMaxIterations(); auto IterCount = getFlags().getLocalCseMaxIterations();
while (IterCount--) { for (uint32_t i = 0; i < IterCount; ++i) {
// TODO : Stats on IterCount -> performance // TODO(manasijm): Stats on IterCount -> performance
for (Inst &Instr : Node->getInsts()) { for (Inst &Instr : Node->getInsts()) {
if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr)) if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
continue; continue;
if (!AssumeSSA) {
// Invalidate replacements
auto Iter = Replacements.find(Instr.getDest());
if (Iter != Replacements.end()) {
Replacements.erase(Iter);
}
// Invalidate replacements // Invalidate 'seen' instructions whose operands were just updated.
auto Iter = Replacements.find(Instr.getDest()); auto DepIter = Dependency.find(Instr.getDest());
if (Iter != Replacements.end()) { if (DepIter != Dependency.end()) {
Replacements.erase(Iter); for (auto *DepInst : DepIter->second) {
} Seen.erase(DepInst);
}
// Invalidate 'seen' instructions whose operands were just updated.
auto DepIter = Dependency.find(Instr.getDest());
if (DepIter != Dependency.end()) {
for (auto DepInst : DepIter->second) {
Seen.erase(DepInst);
} }
} }
// The above two can be removed if SSA is assumed.
// Replace - doing this before checking for repetitions might enable // Replace - doing this before checking for repetitions might enable
// more // more optimizations
// optimizations
for (SizeT i = 0; i < Instr.getSrcSize(); ++i) { for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
auto *Opnd = Instr.getSrc(i); auto *Opnd = Instr.getSrc(i);
if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) { if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
...@@ -627,11 +635,13 @@ void Cfg::localCSE() { ...@@ -627,11 +635,13 @@ void Cfg::localCSE() {
} else { // new } else { // new
Seen.insert(&Instr); Seen.insert(&Instr);
// Update dependencies if (!AssumeSSA) {
for (SizeT i = 0; i < Instr.getSrcSize(); ++i) { // Update dependencies
auto *Opnd = Instr.getSrc(i); for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) { auto *Opnd = Instr.getSrc(i);
Dependency[Var].push_back(&Instr); if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
Dependency[Var].push_back(&Instr);
}
} }
} }
} }
......
...@@ -201,7 +201,7 @@ public: ...@@ -201,7 +201,7 @@ public:
void advancedPhiLowering(); void advancedPhiLowering();
void reorderNodes(); void reorderNodes();
void shuffleNodes(); void shuffleNodes();
void localCSE(); void localCSE(bool AssumeSSA);
void shortCircuitJumps(); void shortCircuitJumps();
void loopInvariantCodeMotion(); void loopInvariantCodeMotion();
......
...@@ -140,9 +140,14 @@ struct dev_list_flag {}; ...@@ -140,9 +140,14 @@ struct dev_list_flag {};
"information to stdout at the end of program execution."), \ "information to stdout at the end of program execution."), \
cl::init(false)) \ cl::init(false)) \
\ \
X(EnableExperimental, bool, dev_opt_flag, "enable-experimental", \ X(LocalCSE, Ice::LCSEOptions, dev_opt_flag, "lcse", \
cl::desc("Enable Optimizations not yet part of O2"), \ cl::desc("Local common subexpression elimination"), \
cl::init(false)) \ cl::init(Ice::LCSE_EnabledSSA), \
cl::values( \
clEnumValN(Ice::LCSE_Disabled, "0", "disabled"), \
clEnumValN(Ice::LCSE_EnabledSSA, "enabled", "assume-ssa"), \
clEnumValN(Ice::LCSE_EnabledNoSSA, "no-ssa", "no-assume-ssa"), \
clEnumValEnd)) \
\ \
X(EnablePhiEdgeSplit, bool, dev_opt_flag, "phi-edge-split", \ X(EnablePhiEdgeSplit, bool, dev_opt_flag, "phi-edge-split", \
cl::desc("Enable edge splitting for Phi lowering"), cl::init(true)) \ cl::desc("Enable edge splitting for Phi lowering"), cl::init(true)) \
...@@ -186,8 +191,8 @@ struct dev_list_flag {}; ...@@ -186,8 +191,8 @@ struct dev_list_flag {};
"building LLVM IR first"), \ "building LLVM IR first"), \
cl::init(false)) \ cl::init(false)) \
\ \
X(LocalCseMaxIterations, int, dev_opt_flag, "lcse-max-iters", \ X(LocalCseMaxIterations, uint32_t, dev_opt_flag, "lcse-max-iters", \
cl::desc("Number of times local-cse is run on a block"), cl::init(2)) \ cl::desc("Number of times local-cse is run on a block"), cl::init(1)) \
\ \
X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm", \ X(LoopInvariantCodeMotion, bool, dev_opt_flag, "licm", \
cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false)) \ cl::desc("Hoist loop invariant arithmetic operations"), cl::init(false)) \
......
...@@ -308,6 +308,12 @@ enum LivenessMode { ...@@ -308,6 +308,12 @@ enum LivenessMode {
Liveness_Intervals Liveness_Intervals
}; };
enum LCSEOptions {
LCSE_Disabled,
LCSE_EnabledSSA, // Default Mode, assumes SSA.
LCSE_EnabledNoSSA // Does not assume SSA, to be enabled if CSE is done later.
};
enum RegAllocKind { enum RegAllocKind {
RAK_Unknown, RAK_Unknown,
RAK_Global, /// full, global register allocation RAK_Global, /// full, global register allocation
......
...@@ -457,8 +457,8 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() { ...@@ -457,8 +457,8 @@ template <typename TraitsType> void TargetX86Base<TraitsType>::translateO2() {
Func->dump("After LICM"); Func->dump("After LICM");
} }
if (getFlags().getEnableExperimental()) { if (getFlags().getLocalCSE() != Ice::LCSE_Disabled) {
Func->localCSE(); Func->localCSE(getFlags().getLocalCSE() == Ice::LCSE_EnabledSSA);
Func->dump("After Local CSE"); Func->dump("After Local CSE");
} }
if (getFlags().getEnableShortCircuit()) { if (getFlags().getEnableShortCircuit()) {
......
; Tests local-cse on x8632 and x8664 ; Tests local-cse on x8632 and x8664
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \ ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8632 \ ; RUN: -O2 | FileCheck --check-prefix=X8632 \
; RUN: --check-prefix=X8632EXP %s ; RUN: --check-prefix=X8632EXP %s
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \ ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8632 --args \
; RUN: -O2 | FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s ; RUN: -O2 -lcse=0| FileCheck --check-prefix=X8632 --check-prefix X8632NOEXP %s
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \ ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
; RUN: -O2 -enable-experimental | FileCheck --check-prefix=X8664 \ ; RUN: -O2 | FileCheck --check-prefix=X8664 \
; RUN: --check-prefix=X8664EXP %s ; RUN: --check-prefix=X8664EXP %s
; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \ ; RUN: %p2i -i %s --filetype=obj --disassemble --target x8664 --args \
; RUN: -O2 | FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s ; RUN: -O2 -lcse=0| FileCheck --check-prefix=X8664 --check-prefix X8664NOEXP %s
define internal i32 @local_cse_test(i32 %a, i32 %b) { define internal i32 @local_cse_test(i32 %a, i32 %b) {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment