Commit a59ae6ff by Jim Stichnoth

Subzero: Fold icmp into br/select lowering.

Originally there was a peephole-style optimization in lowerIcmp() that looked ahead to see if the next instruction was a conditional branch with the right properties, and if so, folded the icmp and br into a single lowering sequence. However, sometimes extra instructions come between the icmp and br instructions, disabling the folding even though it would still be possible. One thought is to do the folding inside lowerBr() instead of lowerIcmp(), by looking backward for a suitable icmp instruction. The problem here is that the icmp lowering code may leave lowered instructions that can't easily be dead-code eliminated, e.g. instructions lacking a dest variable. Instead, before lowering a basic block, we do a prepass on the block to identify folding candidates. For the icmp/br example, the prepass would tentatively delete the icmp instruction and then the br lowering would fold in the icmp. This folding can also be extended to several producers: icmp (i32 operands), icmp (i64 operands), fcmp, trunc ... to i1; and to several consumers: br, select, sext, zext. This CL starts with 2 combinations: icmp32 paired with br & select. Other combinations will be added in later CLs. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4162 BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jvoung@chromium.org Review URL: https://codereview.chromium.org/1141213004
parent 33492e76
...@@ -516,6 +516,7 @@ void CfgNode::genCode() { ...@@ -516,6 +516,7 @@ void CfgNode::genCode() {
LoweringContext &Context = Target->getContext(); LoweringContext &Context = Target->getContext();
// Lower the regular instructions. // Lower the regular instructions.
Context.init(this); Context.init(this);
Target->initNodeForLowering(this);
while (!Context.atEnd()) { while (!Context.atEnd()) {
InstList::iterator Orig = Context.getCur(); InstList::iterator Orig = Context.getCur();
if (llvm::isa<InstRet>(*Orig)) if (llvm::isa<InstRet>(*Orig))
......
...@@ -195,6 +195,7 @@ cl::list<Ice::VerboseItem> VerboseList( ...@@ -195,6 +195,7 @@ cl::list<Ice::VerboseItem> VerboseList(
clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"), clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"), clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
clEnumValN(Ice::IceV_Random, "random", "Randomization details"), clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
clEnumValN(Ice::IceV_All, "all", "Use all verbose options"), clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
clEnumValN(Ice::IceV_Most, "most", clEnumValN(Ice::IceV_Most, "most",
"Use all verbose options except 'regalloc'"), "Use all verbose options except 'regalloc'"),
......
...@@ -172,6 +172,7 @@ enum VerboseItem { ...@@ -172,6 +172,7 @@ enum VerboseItem {
IceV_Frame = 1 << 8, IceV_Frame = 1 << 8,
IceV_AddrOpt = 1 << 9, IceV_AddrOpt = 1 << 9,
IceV_Random = 1 << 10, IceV_Random = 1 << 10,
IceV_Folding = 1 << 11,
IceV_All = ~IceV_None, IceV_All = ~IceV_None,
IceV_Most = IceV_All & ~IceV_LinearScan IceV_Most = IceV_All & ~IceV_LinearScan
}; };
......
...@@ -81,6 +81,7 @@ public: ...@@ -81,6 +81,7 @@ public:
bool isDeleted() const { return Deleted; } bool isDeleted() const { return Deleted; }
void setDeleted() { Deleted = true; } void setDeleted() { Deleted = true; }
void setDead(bool Value = true) { Dead = Value; }
void deleteIfDead(); void deleteIfDead();
bool hasSideEffects() const { return HasSideEffects; } bool hasSideEffects() const { return HasSideEffects; }
...@@ -178,7 +179,9 @@ protected: ...@@ -178,7 +179,9 @@ protected:
InstNumberT Number; InstNumberT Number;
// Deleted means irrevocably deleted. // Deleted means irrevocably deleted.
bool Deleted; bool Deleted;
// Dead means pending deletion after liveness analysis converges. // Dead means one of two things depending on context: (1) pending
// deletion after liveness analysis converges, or (2) marked for
// deletion during lowering due to a folded bool operation.
bool Dead; bool Dead;
// HasSideEffects means the instruction is something like a function // HasSideEffects means the instruction is something like a function
// call or a volatile load that can't be removed even if its Dest // call or a volatile load that can't be removed even if its Dest
......
...@@ -126,68 +126,70 @@ void TargetLowering::doNopInsertion() { ...@@ -126,68 +126,70 @@ void TargetLowering::doNopInsertion() {
void TargetLowering::lower() { void TargetLowering::lower() {
assert(!Context.atEnd()); assert(!Context.atEnd());
Inst *Inst = Context.getCur(); Inst *Inst = Context.getCur();
Inst->deleteIfDead();
if (!Inst->isDeleted()) {
// Mark the current instruction as deleted before lowering, // Mark the current instruction as deleted before lowering,
// otherwise the Dest variable will likely get marked as non-SSA. // otherwise the Dest variable will likely get marked as non-SSA.
// See Variable::setDefinition(). // See Variable::setDefinition().
Inst->setDeleted(); Inst->setDeleted();
switch (Inst->getKind()) { switch (Inst->getKind()) {
case Inst::Alloca: case Inst::Alloca:
lowerAlloca(llvm::dyn_cast<InstAlloca>(Inst)); lowerAlloca(llvm::cast<InstAlloca>(Inst));
break; break;
case Inst::Arithmetic: case Inst::Arithmetic:
lowerArithmetic(llvm::dyn_cast<InstArithmetic>(Inst)); lowerArithmetic(llvm::cast<InstArithmetic>(Inst));
break; break;
case Inst::Assign: case Inst::Assign:
lowerAssign(llvm::dyn_cast<InstAssign>(Inst)); lowerAssign(llvm::cast<InstAssign>(Inst));
break; break;
case Inst::Br: case Inst::Br:
lowerBr(llvm::dyn_cast<InstBr>(Inst)); lowerBr(llvm::cast<InstBr>(Inst));
break; break;
case Inst::Call: case Inst::Call:
lowerCall(llvm::dyn_cast<InstCall>(Inst)); lowerCall(llvm::cast<InstCall>(Inst));
break; break;
case Inst::Cast: case Inst::Cast:
lowerCast(llvm::dyn_cast<InstCast>(Inst)); lowerCast(llvm::cast<InstCast>(Inst));
break; break;
case Inst::ExtractElement: case Inst::ExtractElement:
lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst)); lowerExtractElement(llvm::cast<InstExtractElement>(Inst));
break; break;
case Inst::Fcmp: case Inst::Fcmp:
lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst)); lowerFcmp(llvm::cast<InstFcmp>(Inst));
break; break;
case Inst::Icmp: case Inst::Icmp:
lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst)); lowerIcmp(llvm::cast<InstIcmp>(Inst));
break; break;
case Inst::InsertElement: case Inst::InsertElement:
lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst)); lowerInsertElement(llvm::cast<InstInsertElement>(Inst));
break; break;
case Inst::IntrinsicCall: { case Inst::IntrinsicCall: {
InstIntrinsicCall *Call = llvm::dyn_cast<InstIntrinsicCall>(Inst); InstIntrinsicCall *Call = llvm::cast<InstIntrinsicCall>(Inst);
if (Call->getIntrinsicInfo().ReturnsTwice) if (Call->getIntrinsicInfo().ReturnsTwice)
setCallsReturnsTwice(true); setCallsReturnsTwice(true);
lowerIntrinsicCall(Call); lowerIntrinsicCall(Call);
break; break;
} }
case Inst::Load: case Inst::Load:
lowerLoad(llvm::dyn_cast<InstLoad>(Inst)); lowerLoad(llvm::cast<InstLoad>(Inst));
break; break;
case Inst::Phi: case Inst::Phi:
lowerPhi(llvm::dyn_cast<InstPhi>(Inst)); lowerPhi(llvm::cast<InstPhi>(Inst));
break; break;
case Inst::Ret: case Inst::Ret:
lowerRet(llvm::dyn_cast<InstRet>(Inst)); lowerRet(llvm::cast<InstRet>(Inst));
break; break;
case Inst::Select: case Inst::Select:
lowerSelect(llvm::dyn_cast<InstSelect>(Inst)); lowerSelect(llvm::cast<InstSelect>(Inst));
break; break;
case Inst::Store: case Inst::Store:
lowerStore(llvm::dyn_cast<InstStore>(Inst)); lowerStore(llvm::cast<InstStore>(Inst));
break; break;
case Inst::Switch: case Inst::Switch:
lowerSwitch(llvm::dyn_cast<InstSwitch>(Inst)); lowerSwitch(llvm::cast<InstSwitch>(Inst));
break; break;
case Inst::Unreachable: case Inst::Unreachable:
lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst)); lowerUnreachable(llvm::cast<InstUnreachable>(Inst));
break; break;
case Inst::BundleLock: case Inst::BundleLock:
case Inst::BundleUnlock: case Inst::BundleUnlock:
...@@ -202,6 +204,7 @@ void TargetLowering::lower() { ...@@ -202,6 +204,7 @@ void TargetLowering::lower() {
} }
postLower(); postLower();
}
Context.advanceCur(); Context.advanceCur();
Context.advanceNext(); Context.advanceNext();
......
...@@ -221,6 +221,7 @@ public: ...@@ -221,6 +221,7 @@ public:
// Performs target-specific argument lowering. // Performs target-specific argument lowering.
virtual void lowerArguments() = 0; virtual void lowerArguments() = 0;
virtual void initNodeForLowering(CfgNode *) {}
virtual void addProlog(CfgNode *Node) = 0; virtual void addProlog(CfgNode *Node) = 0;
virtual void addEpilog(CfgNode *Node) = 0; virtual void addEpilog(CfgNode *Node) = 0;
......
...@@ -16,14 +16,81 @@ ...@@ -16,14 +16,81 @@
#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#define SUBZERO_SRC_ICETARGETLOWERINGX8632_H #define SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#include <unordered_map>
#include "assembler_ia32.h" #include "assembler_ia32.h"
#include "IceDefs.h" #include "IceDefs.h"
#include "IceInst.h"
#include "IceInstX8632.h" #include "IceInstX8632.h"
#include "IceRegistersX8632.h" #include "IceRegistersX8632.h"
#include "IceTargetLowering.h" #include "IceTargetLowering.h"
namespace Ice { namespace Ice {
// BoolFoldingEntry bundles the bookkeeping kept for one i1-type variable
// (referred to as Var below) while deciding whether its producer instruction
// may be folded into its consumers.  Copy construction is disabled, but copy
// assignment is permitted.
class BoolFoldingEntry {
  BoolFoldingEntry(const BoolFoldingEntry &) = delete;

public:
  // A default-constructed entry has no producer (Instr == nullptr), is not
  // complex, is conservatively live-out, and has no recorded uses.
  BoolFoldingEntry() = default;
  // Builds an entry for the producer instruction I (defined out of line).
  explicit BoolFoldingEntry(Inst *I);
  BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;

  // The instruction that produces the i1-type variable of interest.  A null
  // value marks the entry as not valid.
  Inst *Instr = nullptr;
  // Cached result of BoolFolding::hasComplexLowering(Instr).
  bool IsComplex = false;
  // Starts out conservatively true and is cleared once an instruction that
  // ends Var's live range is encountered.  Folding is disabled when Var is
  // live beyond this basic block.  Note that without liveness analysis (e.g.
  // in Om1 mode) this stays true, so the folding optimization is never
  // performed.
  bool IsLiveOut = true;
  // Number of times Var appears as a source operand in the basic block.  If
  // IsComplex is true and Var has more than one use, folding is disabled for
  // Var.
  uint32_t NumUses = 0;
};
class BoolFolding {
public:
enum BoolFoldingProducerKind {
PK_None,
PK_Icmp32,
PK_Icmp64,
PK_Fcmp,
PK_Trunc
};
// Currently the actual enum values are not used (other than CK_None), but we
// go
// ahead and produce them anyway for symmetry with the
// BoolFoldingProducerKind.
enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
private:
BoolFolding(const BoolFolding &) = delete;
BoolFolding &operator=(const BoolFolding &) = delete;
public:
BoolFolding() {}
static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
static bool hasComplexLowering(const Inst *Instr);
void init(CfgNode *Node);
const Inst *getProducerFor(const Operand *Opnd) const;
void dump(const Cfg *Func) const;
private:
// Returns true if Producers contains a valid entry for the given VarNum.
bool containsValid(SizeT VarNum) const {
auto Element = Producers.find(VarNum);
return Element != Producers.end() && Element->second.Instr != nullptr;
}
void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
// Producers maps Variable::Number to a BoolFoldingEntry.
std::unordered_map<SizeT, BoolFoldingEntry> Producers;
};
class TargetX8632 : public TargetLowering { class TargetX8632 : public TargetLowering {
TargetX8632() = delete; TargetX8632() = delete;
TargetX8632(const TargetX8632 &) = delete; TargetX8632(const TargetX8632 &) = delete;
...@@ -63,6 +130,7 @@ public: ...@@ -63,6 +130,7 @@ public:
void emit(const ConstantDouble *C) const final; void emit(const ConstantDouble *C) const final;
void lowerArguments() override; void lowerArguments() override;
void initNodeForLowering(CfgNode *Node) override;
void addProlog(CfgNode *Node) override; void addProlog(CfgNode *Node) override;
void addEpilog(CfgNode *Node) override; void addEpilog(CfgNode *Node) override;
// Ensure that a 64-bit Variable has been split into 2 32-bit // Ensure that a 64-bit Variable has been split into 2 32-bit
...@@ -157,6 +225,8 @@ protected: ...@@ -157,6 +225,8 @@ protected:
Operand *legalize(Operand *From, LegalMask Allowed = Legal_All, Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
int32_t RegNum = Variable::NoRegister); int32_t RegNum = Variable::NoRegister);
Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister); Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
// Legalize the first source operand for use in the cmp instruction.
Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
// Turn a pointer operand into a memory operand that can be // Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well. // used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand. // This is a nop if the operand is already a legal memory operand.
...@@ -507,6 +577,7 @@ protected: ...@@ -507,6 +577,7 @@ protected:
private: private:
~TargetX8632() override {} ~TargetX8632() override {}
BoolFolding FoldingInfo;
}; };
class TargetDataX8632 : public TargetDataLowering { class TargetDataX8632 : public TargetDataLowering {
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s ; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
declare void @useInt(i32 %x)
define internal i32 @add8Bit(i32 %a, i32 %b) { define internal i32 @add8Bit(i32 %a, i32 %b) {
entry: entry:
%a_8 = trunc i32 %a to i8 %a_8 = trunc i32 %a to i8
...@@ -278,6 +280,9 @@ entry: ...@@ -278,6 +280,9 @@ entry:
%cmp = icmp slt i8 %a_8, %b_8 %cmp = icmp slt i8 %a_8, %b_8
%ret = select i1 %cmp, i8 %a_8, i8 %b_8 %ret = select i1 %cmp, i8 %a_8, i8 %b_8
%ret_ext = zext i8 %ret to i32 %ret_ext = zext i8 %ret to i32
; Create a "fake" use of %cmp to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
ret i32 %ret_ext ret i32 %ret_ext
} }
; CHECK-LABEL: selectI8Var ; CHECK-LABEL: selectI8Var
......
; This tests the optimization where producers and consumers of i1 (bool)
; variables are combined to implicitly use flags instead of explicitly using
; stack or register variables.
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
declare void @use_value(i32)
; Basic cmp/branch folding.
; %cmp1 is produced by an i32 icmp and its only use is the conditional branch
; that immediately follows it in the same block.
define i32 @fold_cmp_br(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br
; The expected output is a compare followed by a conditional jump (jge, the
; inverse of the slt condition).
; CHECK: cmp
; CHECK: jge
; Cmp/branch folding with intervening instructions.
; A call sits between the icmp that defines %cmp1 and the br that consumes it.
define i32 @fold_cmp_br_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br_intervening_insts
; No compare may appear before the call; the compare is expected only after
; it, next to the conditional jump.
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jge
; Cmp/branch non-folding because of live-out.
; %cmp1 is defined in block "entry" but consumed by the br in block "next", so
; it is live out of the block that defines it.
define i32 @no_fold_cmp_br_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br label %next
next:
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_liveout
; Expected sequence: a compare, then an instruction matching "set"
; (presumably a setcc that materializes the bool), then a second compare and
; a je for the branch.
; CHECK: cmp
; CHECK: set
; CHECK: cmp
; CHECK: je
; Cmp/branch non-folding because of extra non-whitelisted uses.
; Besides the br, %cmp1 is also used by a zext, which is the non-whitelisted
; use this test is named for.
define i32 @no_fold_cmp_br_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = zext i1 %cmp1 to i32
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 %result
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_non_whitelist
; Expected sequence: compare, an instruction matching "set", a movzx
; (presumably for the zext), then a second compare and a je for the branch.
; CHECK: cmp
; CHECK: set
; CHECK: movzx
; CHECK: cmp
; CHECK: je
; Basic cmp/select folding.
; %cmp1 is produced by an i32 icmp and its only use is the select that
; immediately follows it.
define i32 @fold_cmp_select(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select
; Expected sequence: a compare, a jl, and a mov.
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; 64-bit cmp/select folding.
; The compared values are i32 (truncated from the i64 arguments); only the
; values chosen by the select are i64.
define i64 @fold_cmp_select_64(i64 %arg1, i64 %arg2) {
entry:
%arg1_trunc = trunc i64 %arg1 to i32
%arg2_trunc = trunc i64 %arg2 to i32
%cmp1 = icmp slt i32 %arg1_trunc, %arg2_trunc
%result = select i1 %cmp1, i64 %arg1, i64 %arg2
ret i64 %result
}
; CHECK-LABEL: fold_cmp_select_64
; Expected sequence: a compare, a jl, and two movs (presumably one per 32-bit
; half of the i64 result -- confirm against the lowering).
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: mov
; Cmp/select folding with intervening instructions.
; A call sits between the icmp that defines %cmp1 and the select that
; consumes it.
define i32 @fold_cmp_select_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_intervening_insts
; No compare may appear before the call; the compare, jl and mov are expected
; only after it.
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; Cmp/multi-select folding.
; %cmp1 feeds three selects in the same block and has no other uses.
define i32 @fold_cmp_select_multi(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_multi
; Expected sequence: a compare/jl pair three times (one per select), followed
; by the two adds.
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of live-out.
; Two selects use %cmp1 in block "entry", but the third select uses it in
; block "next", so %cmp1 is live out of the block that defines it.
define i32 @no_fold_cmp_select_multi_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
br label %next
next:
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_liveout
; Expected sequence: an instruction matching "set" (presumably a setcc that
; materializes the bool), then a compare/jne pair three times (one per
; select), followed by the two adds.
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of extra non-whitelisted uses.
; Besides the three selects, %cmp1 is also used by a zext, which is the
; non-whitelisted use this test is named for.
define i32 @no_fold_cmp_select_multi_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%ext = zext i1 %cmp1 to i32
%partial1 = add i32 %a, %b
%partial2 = add i32 %partial1, %c
%result = add i32 %partial2, %ext
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_non_whitelist
; Expected sequence: an instruction matching "set", a compare/jne pair three
; times (one per select), a movzx (presumably for the zext), then the three
; adds.
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: movzx
; CHECK: add
; CHECK: add
; CHECK: add
...@@ -14,6 +14,11 @@ entry: ...@@ -14,6 +14,11 @@ entry:
%cmp1 = icmp sgt i32 %a, %b %cmp1 = icmp sgt i32 %a, %b
%cond2 = select i1 %cmp1, i32 10, i32 20 %cond2 = select i1 %cmp1, i32 10, i32 20
tail call void @useInt(i32 %cond2) tail call void @useInt(i32 %cond2)
; Create "fake" uses of %cmp and %cmp1 to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
%d2 = zext i1 %cmp1 to i32
call void @useInt(i32 %d2)
ret void ret void
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment