Commit ff9c7063 by Jim Stichnoth

Subzero: Add branch optimization.

1. An unconditional branch to the next basic block is removed. 2. For a conditional branch with a "false" edge to the next basic block, the unconditional branch to the fallthrough block is removed. 3. For a conditional branch with a "true" edge to the next basic block, the condition is inverted and the branch is then handled as in #2. This is enabled only for O2, particularly because inverting the branch condition is a marginally risky operation. This decreases the instruction count by about 5-6%. Also, --stats prints a final tally to make it easier to post-process the output. BUG= none R=jvoung@chromium.org Review URL: https://codereview.chromium.org/580903005
parent d1a971a1
......@@ -290,6 +290,14 @@ bool Cfg::validateLiveness() const {
return Valid;
}
void Cfg::doBranchOpt() {
for (NodeList::iterator I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
NodeList::iterator NextNode = I;
++NextNode;
(*I)->doBranchOpt(*NextNode);
}
}
// ======================== Dump routines ======================== //
void Cfg::emit() {
......
......@@ -94,6 +94,7 @@ public:
void livenessLightweight();
void liveness(LivenessMode Mode);
bool validateLiveness() const;
void doBranchOpt();
// Manage the CurrentNode field, which is used for validating the
// Variable::DefNode field during dumping/emitting.
......
......@@ -457,6 +457,19 @@ void CfgNode::livenessPostprocess(LivenessMode Mode, Liveness *Liveness) {
}
}
void CfgNode::doBranchOpt(const CfgNode *NextNode) {
TargetLowering *Target = Func->getTarget();
// Check every instruction for a branch optimization opportunity.
// It may be more efficient to iterate in reverse and stop after the
// first opportunity, unless there is some target lowering where we
// have the possibility of multiple such optimizations per block
// (currently not the case for x86 lowering).
for (InstList::const_iterator I = Insts.begin(), E = Insts.end(); I != E;
++I) {
Target->doBranchOpt(*I, NextNode);
}
}
// ======================== Dump routines ======================== //
void CfgNode::emit(Cfg *Func) const {
......
......@@ -65,6 +65,7 @@ public:
void livenessLightweight();
bool liveness(Liveness *Liveness);
void livenessPostprocess(LivenessMode Mode, Liveness *Liveness);
void doBranchOpt(const CfgNode *NextNode);
void emit(Cfg *Func) const;
void dump(Cfg *Func) const;
......
......@@ -384,10 +384,14 @@ ConstantList GlobalContext::getConstantPool(Type Ty) const {
llvm_unreachable("Unknown type");
}
void GlobalContext::dumpStats(const IceString &Name) {
void GlobalContext::dumpStats(const IceString &Name, bool Final) {
if (Flags.DumpStats) {
StatsFunction.dump(Name, getStrDump());
StatsCumulative.dump("_TOTAL_", getStrDump());
if (Final) {
StatsCumulative.dump(Name, getStrDump());
} else {
StatsFunction.dump(Name, getStrDump());
StatsCumulative.dump("_TOTAL_", getStrDump());
}
}
}
......
......@@ -132,7 +132,7 @@ public:
// Reset stats at the beginning of a function.
void resetStats() { StatsFunction.reset(); }
void dumpStats(const IceString &Name);
void dumpStats(const IceString &Name, bool Final = false);
void statsUpdateEmitted(uint32_t InstCount) {
StatsFunction.updateEmitted(InstCount);
StatsCumulative.updateEmitted(InstCount);
......
......@@ -24,11 +24,12 @@ namespace Ice {
namespace {
const struct InstX8632BrAttributes_ {
InstX8632::BrCond Opposite;
const char *DisplayString;
const char *EmitString;
} InstX8632BrAttributes[] = {
#define X(tag, dump, emit) \
{ dump, emit } \
#define X(tag, opp, dump, emit) \
{ InstX8632::opp, dump, emit } \
,
ICEINSTX8632BR_TABLE
#undef X
......@@ -128,11 +129,52 @@ IceString InstX8632Label::getName(const Cfg *Func) const {
return ".L" + Func->getFunctionName() + "$local$__" + buf;
}
InstX8632Br::InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
InstX8632Label *Label, InstX8632::BrCond Condition)
InstX8632Br::InstX8632Br(Cfg *Func, const CfgNode *TargetTrue,
const CfgNode *TargetFalse,
const InstX8632Label *Label,
InstX8632::BrCond Condition)
: InstX8632(Func, InstX8632::Br, 0, NULL), Condition(Condition),
TargetTrue(TargetTrue), TargetFalse(TargetFalse), Label(Label) {}
// Attempts to simplify this branch given that NextNode is the node
// that immediately follows the branch's node.  Returns true if the
// instruction was changed (or marked deleted), false otherwise.
bool InstX8632Br::optimizeBranch(const CfgNode *NextNode) {
  // Nothing can be done when:
  //  - there is no next block, hence no fallthrough to exploit;
  //  - this is an intra-block branch to a label;
  //  - there is no fallthrough target at all (e.g. a non-default case
  //    label of a switch instruction).
  if (NextNode == NULL || Label != NULL || getTargetFalse() == NULL)
    return false;

  if (getTargetFalse() == NextNode) {
    if (Condition == Br_None) {
      // An unconditional branch to the next node is redundant, so
      // drop the whole instruction.
      assert(getTargetTrue() == NULL);
      setDeleted();
    } else {
      // A conditional branch whose fallthrough is the next node only
      // needs its conditional part; a NULL TargetFalse records that
      // no explicit fallthrough branch is required.
      TargetFalse = NULL;
    }
    return true;
  }

  if (getTargetTrue() != NextNode)
    return false;

  // The taken target is the next node and the fallthrough target is
  // known to be non-NULL: flip the condition so that the old
  // fallthrough becomes the taken target, and clear the fallthrough.
  assert(Condition != Br_None);
  Condition = InstX8632BrAttributes[Condition].Opposite;
  TargetTrue = getTargetFalse();
  TargetFalse = NULL;
  return true;
}
InstX8632Call::InstX8632Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
: InstX8632(Func, InstX8632::Call, 1, Dest) {
HasSideEffects = true;
......
......@@ -51,20 +51,20 @@
//#define X(val, name)
#define ICEINSTX8632BR_TABLE \
/* enum value, dump, emit */ \
X(Br_a, "a", "ja") \
X(Br_ae, "ae", "jae") \
X(Br_b, "b", "jb") \
X(Br_be, "be", "jbe") \
X(Br_e, "e", "je") \
X(Br_g, "g", "jg") \
X(Br_ge, "ge", "jge") \
X(Br_l, "l", "jl") \
X(Br_le, "le", "jle") \
X(Br_ne, "ne", "jne") \
X(Br_np, "np", "jnp") \
X(Br_p, "p", "jp") \
//#define X(tag, dump, emit)
/* enum value, opposite, dump, emit */ \
X(Br_a, Br_be, "a", "ja") \
X(Br_ae, Br_b, "ae", "jae") \
X(Br_b, Br_ae, "b", "jb") \
X(Br_be, Br_a, "be", "jbe") \
X(Br_e, Br_ne, "e", "je") \
X(Br_g, Br_le, "g", "jg") \
X(Br_ge, Br_l, "ge", "jge") \
X(Br_l, Br_ge, "l", "jl") \
X(Br_le, Br_g, "le", "jle") \
X(Br_ne, Br_e, "ne", "jne") \
X(Br_np, Br_p, "np", "jnp") \
X(Br_p, Br_np, "p", "jp") \
//#define X(tag, opp, dump, emit)
#define ICEINSTX8632CMPPS_TABLE \
/* enum value, emit */ \
......
......@@ -225,7 +225,7 @@ public:
};
enum BrCond {
#define X(tag, dump, emit) tag,
#define X(tag, opp, dump, emit) tag,
ICEINSTX8632BR_TABLE
#undef X
Br_None
......@@ -309,53 +309,62 @@ public:
// Create a conditional branch to a node.
static InstX8632Br *create(Cfg *Func, CfgNode *TargetTrue,
CfgNode *TargetFalse, BrCond Condition) {
const InstX8632Label *NoLabel = NULL;
return new (Func->allocate<InstX8632Br>())
InstX8632Br(Func, TargetTrue, TargetFalse, NULL, Condition);
InstX8632Br(Func, TargetTrue, TargetFalse, NoLabel, Condition);
}
// Create an unconditional branch to a node.
static InstX8632Br *create(Cfg *Func, CfgNode *Target) {
const CfgNode *NoCondTarget = NULL;
const InstX8632Label *NoLabel = NULL;
return new (Func->allocate<InstX8632Br>())
InstX8632Br(Func, NULL, Target, NULL, Br_None);
InstX8632Br(Func, NoCondTarget, Target, NoLabel, Br_None);
}
// Create a non-terminator conditional branch to a node, with a
// fallthrough to the next instruction in the current node. This is
// used for switch lowering.
static InstX8632Br *create(Cfg *Func, CfgNode *Target, BrCond Condition) {
const CfgNode *NoUncondTarget = NULL;
const InstX8632Label *NoLabel = NULL;
return new (Func->allocate<InstX8632Br>())
InstX8632Br(Func, Target, NULL, NULL, Condition);
InstX8632Br(Func, Target, NoUncondTarget, NoLabel, Condition);
}
// Create a conditional intra-block branch (or unconditional, if
// Condition==Br_None) to a label in the current block.
static InstX8632Br *create(Cfg *Func, InstX8632Label *Label,
BrCond Condition) {
const CfgNode *NoCondTarget = NULL;
const CfgNode *NoUncondTarget = NULL;
return new (Func->allocate<InstX8632Br>())
InstX8632Br(Func, NULL, NULL, Label, Condition);
InstX8632Br(Func, NoCondTarget, NoUncondTarget, Label, Condition);
}
CfgNode *getTargetTrue() const { return TargetTrue; }
CfgNode *getTargetFalse() const { return TargetFalse; }
const CfgNode *getTargetTrue() const { return TargetTrue; }
const CfgNode *getTargetFalse() const { return TargetFalse; }
bool optimizeBranch(const CfgNode *NextNode);
virtual uint32_t getEmitInstCount() const {
uint32_t Sum = 0;
if (Label)
return 1;
if (Condition == Br_None)
return 1;
++Sum;
if (getTargetTrue())
++Sum;
if (getTargetFalse())
return 2;
return 1;
++Sum;
return Sum;
}
virtual void emit(const Cfg *Func) const;
virtual void dump(const Cfg *Func) const;
static bool classof(const Inst *Inst) { return isClassof(Inst, Br); }
private:
InstX8632Br(Cfg *Func, CfgNode *TargetTrue, CfgNode *TargetFalse,
InstX8632Label *Label, BrCond Condition);
InstX8632Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
const InstX8632Label *Label, BrCond Condition);
InstX8632Br(const InstX8632Br &) LLVM_DELETED_FUNCTION;
InstX8632Br &operator=(const InstX8632Br &) LLVM_DELETED_FUNCTION;
virtual ~InstX8632Br() {}
BrCond Condition;
CfgNode *TargetTrue;
CfgNode *TargetFalse;
InstX8632Label *Label; // Intra-block branch target
const CfgNode *TargetTrue;
const CfgNode *TargetFalse;
const InstX8632Label *Label; // Intra-block branch target
};
// AdjustStack instruction - subtracts esp by the given amount and
......
......@@ -125,6 +125,11 @@ public:
void doNopInsertion();
// Lowers a single instruction.
void lower();
// Tries to do branch optimization on a single instruction, given the
// CfgNode passed as NextNode.  Returns true if some optimization was
// done.  This default implementation ignores both arguments, performs
// no optimization, and always returns false; targets that support
// branch optimization override it.
virtual bool doBranchOpt(Inst * /*I*/, const CfgNode * /*NextNode*/) {
return false;
}
// Returns a variable pre-colored to the specified physical
// register. This is generally used to get very direct access to
......
......@@ -395,6 +395,14 @@ void TargetX8632::translateO2() {
T_genFrame.printElapsedUs(Context, "genFrame()");
Func->dump("After stack frame mapping");
// Branch optimization. This needs to be done just before code
// emission. In particular, no transformations that insert or
// reorder CfgNodes should be done after branch optimization. We go
// ahead and do it before nop insertion to reduce the amount of work
// needed for searching for opportunities.
Func->doBranchOpt();
Func->dump("After branch optimization");
// Nop insertion
if (shouldDoNopInsertion()) {
Func->doNopInsertion();
......@@ -444,6 +452,13 @@ void TargetX8632::translateOm1() {
}
}
// Applies branch optimization to I if it is an x86-32 branch
// instruction.  Any other kind of instruction is left alone and false
// is returned.
bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
  InstX8632Br *Branch = llvm::dyn_cast<InstX8632Br>(I);
  if (Branch == NULL)
    return false;
  return Branch->optimizeBranch(NextNode);
}
IceString TargetX8632::RegNames[] = {
#define X(val, init, name, name16, name8, scratch, preserved, stackptr, \
frameptr, isI8, isInt, isFP) \
......
......@@ -28,6 +28,7 @@ public:
virtual void translateOm1();
virtual void translateO2();
virtual bool doBranchOpt(Inst *I, const CfgNode *NextNode);
virtual Variable *getPhysicalRegister(SizeT RegNum);
virtual IceString getRegName(SizeT RegNum, Type Ty) const;
......
......@@ -2247,7 +2247,6 @@ void PNaClTranslator::translate(const std::string &IRFilename) {
<< "\n";
ErrorStatus = true;
}
return;
}
} // end of namespace Ice
......@@ -166,10 +166,11 @@ int main(int argc, char **argv) {
Ice::GlobalContext Ctx(Ls, Os, VMask, TargetArch, OptLevel, TestPrefix,
Flags);
int ErrorStatus = 0;
if (BuildOnRead) {
Ice::PNaClTranslator Translator(&Ctx, Flags);
Translator.translate(IRFilename);
return Translator.getErrorStatus();
ErrorStatus = Translator.getErrorStatus();
} else {
// Parse the input LLVM IR file into a module.
SMDiagnostic Err;
......@@ -189,6 +190,9 @@ int main(int argc, char **argv) {
Ice::Converter Converter(Mod, &Ctx, Flags);
Converter.convertToIce();
return Converter.getErrorStatus();
ErrorStatus = Converter.getErrorStatus();
}
const bool FinalStats = true;
Ctx.dumpStats("_FINAL_", FinalStats);
return ErrorStatus;
}
; Tests the branch optimizations under O2 (against a lack of
; optimizations under Om1).
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
; RUN: | llvm-objdump -d -symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck --check-prefix=O2 %s
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -triple=i686-none-nacl -x86-asm-syntax=intel -filetype=obj \
; RUN: | llvm-objdump -d -symbolize -x86-asm-syntax=intel - \
; RUN: | FileCheck --check-prefix=OM1 %s
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
; RUN: | FileCheck --check-prefix=DUMP %s
declare void @dummy()
; An unconditional branch to the next block should be removed.
; Here %next immediately follows the entry block, so with branch
; optimization the two calls are expected to be adjacent, whereas
; without it a jmp is expected to remain between them.
define void @testUncondToNextBlock() {
entry:
call void @dummy()
br label %next
next:
call void @dummy()
ret void
}
; O2-LABEL: testUncondToNextBlock
; O2: call
; O2-NEXT: call
; OM1-LABEL: testUncondToNextBlock
; OM1: call
; OM1-NEXT: jmp
; OM1-NEXT: call
; For a conditional branch with a fallthrough to the next block, the
; fallthrough branch should be removed.
; The true edge of the br goes to %target and the false edge goes to
; %fallthrough, which is the block immediately after entry, so the
; jge is expected to be followed directly by the call in %fallthrough.
define void @testCondFallthroughToNextBlock(i32 %arg) {
entry:
%cmp = icmp sge i32 %arg, 123
br i1 %cmp, label %target, label %fallthrough
fallthrough:
call void @dummy()
ret void
target:
call void @dummy()
ret void
}
; O2-LABEL: testCondFallthroughToNextBlock
; O2: cmp {{.*}}, 123
; O2-NEXT: jge
; O2-NEXT: call
; O2: ret
; O2: call
; O2: ret
; OM1-LABEL: testCondFallthroughToNextBlock
; OM1: cmp {{.*}}, 123
; OM1: jge
; OM1: cmp
; OM1: jne
; OM1: jmp
; OM1: call
; OM1: ret
; OM1: call
; OM1: ret
; For a conditional branch with the next block as the target and a
; different block as the fallthrough, the branch condition should be
; inverted, the fallthrough block changed to the target, and the
; branch to the next block removed.
; Note that the block names describe the roles after optimization:
; in the br below, the true edge goes to %fallthrough (the block
; immediately after entry) and the false edge goes to %target, so the
; sge compare is expected to produce an inverted jl instead of a jge.
define void @testCondTargetNextBlock(i32 %arg) {
entry:
%cmp = icmp sge i32 %arg, 123
br i1 %cmp, label %fallthrough, label %target
fallthrough:
call void @dummy()
ret void
target:
call void @dummy()
ret void
}
; O2-LABEL: testCondTargetNextBlock
; O2: cmp {{.*}}, 123
; O2-NEXT: jl
; O2-NEXT: call
; O2: ret
; O2: call
; O2: ret
; OM1-LABEL: testCondTargetNextBlock
; OM1: cmp {{.*}}, 123
; OM1: jge
; OM1: cmp
; OM1: jne
; OM1: jmp
; OM1: call
; OM1: ret
; OM1: call
; OM1: ret
; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
......@@ -5,7 +5,7 @@
; RUN: %llvm2ice -Om1 --target=x8632 --verbose none %s | FileCheck %s
declare i32 @memcpy_helper2(i32 %buf, i32 %buf2, i32 %n);
declare i32 @memcpy_helper2(i32 %buf, i32 %buf2, i32 %n)
define i32 @memcpy_helper(i32 %buf, i32 %n) {
entry:
......
......@@ -20,7 +20,7 @@ declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)
; reuse the flags set by the cmpxchg instruction itself.
; This is only expected to work w/ O2, based on lightweight liveness.
; (Or if we had other means to detect the only use).
declare void @use_value(i32);
declare void @use_value(i32)
define i32 @test_atomic_cmpxchg_loop(i32 %iptr, i32 %expected, i32 %desired) {
entry:
......@@ -46,7 +46,7 @@ done:
; Make sure the phi assignment for succeeded_first_try is still there.
; O2: mov {{.*}}, 2
; O2-NOT: cmp
; O2: je
; O2: jne
; Make sure the call isn't accidentally deleted.
; O2: call
;
......@@ -78,7 +78,7 @@ done:
; O2: lock
; O2-NEXT: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2-NOT: cmp
; O2: je
; O2: jne
; Still works if the compare operands are constants.
......@@ -102,7 +102,7 @@ done:
; Should be using NEXT: see issue 3929
; O2: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2-NOT: cmp
; O2: je
; O2: jne
; This is a case where the flags cannot be reused (compare is for some
; other condition).
......@@ -126,7 +126,7 @@ done:
; O2-NEXT: cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2: mov {{.*}}
; O2: cmp
; O2: jg
; O2: jle
; Another case where the flags cannot be reused (the comparison result
; is used somewhere else).
......
......@@ -213,8 +213,7 @@ branch2:
}
; CHECK-LABEL: could_have_hoisted_loads
; CHECK: dword ptr [0]
; CHECK: je {{.*}}
; CHECK: jmp {{.*}}
; CHECK: jne {{.*}}
; CHECK: mov {{.*}}, dword ptr
; CHECK: ret
; CHECK: mfence
......
......@@ -30,10 +30,8 @@ target:
; CHECK: mov {{.*}}, 0
; CHECK: mov [[PHI:.*]],
; CHECK: cmp {{.*}}, 0
; CHECK: jne
; CHECK: :
; CHECK: je
; CHECK: mov [[PHI]], 0
; CHECK: :
; CHECK: movzx {{.*}}, [[PHI]]
define internal i32 @testPhi2(i32 %arg) {
......@@ -50,10 +48,8 @@ target:
; CHECK-LABEL: testPhi2
; CHECK: mov {{.*}}, 12345
; CHECK: cmp {{.*}}, 0
; CHECK-NEXT: jg
; CHECK: :
; CHECK-NEXT: jle
; CHECK: mov [[PHI:.*]], 54321
; CHECK: :
; CHECK: mov {{.*}}, [[PHI]]
; ERRORS-NOT: ICE translation error
......@@ -100,11 +96,9 @@ exit:
; CHECK-LABEL: testPhi3
; CHECK: push [[EBX:.*]]
; CHECK: mov {{.*}}, dword ptr [esp
; CHECK: jmp
; CHECK: mov
; CHECK: mov {{.*}}[[ADDR:.*1000]]
; CHECK: cmp {{.*}}, 0
; CHECK: je
; CHECK: jmp
; CHECK: jne
; CHECK: mov {{.*}}[[ADDR]]
; CHECK: pop [[EBX]]
......@@ -38,10 +38,9 @@ for.end:
; CHECK-LABEL: simple_loop
; CHECK: mov ecx, dword ptr [esp{{.*}}+{{.*}}{{[0-9]+}}]
; CHECK: cmp ecx, 0
; CHECK-NEXT: jg {{[0-9]}}
; CHECK-NEXT: jle {{[0-9]}}
; NaCl bundle padding
; CHECK-NEXT: nop
; CHECK-NEXT: jmp {{[0-9]}}
; TODO: the mov from ebx to esi seems redundant here - so this may need to be
; modified later
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment