Commit a59ae6ff by Jim Stichnoth

Subzero: Fold icmp into br/select lowering.

Originally there was a peephole-style optimization in lowerIcmp() that looks ahead to see if the next instruction is a conditional branch with the right properties, and if so, folds the icmp and br into a single lowering sequence. However, sometimes extra instructions come between the icmp and br instructions, disabling the folding even though it would still be possible. One thought is to do the folding inside lowerBr() instead of lowerIcmp(), by looking backward for a suitable icmp instruction. The problem here is that the icmp lowering code may leave lowered instructions that can't easily be dead-code eliminated, e.g. instructions lacking a dest variable. Instead, before lowering a basic block, we do a prepass on the block to identify folding candidates. For the icmp/br example, the prepass would tentatively delete the icmp instruction and then the br lowering would fold in the icmp. This folding can also be extended to several producers: icmp (i32 operands), icmp (i64 operands), fcmp, trunc .. to i1 and several consumers: br, select, sext, zext This CL starts with 2 combinations: icmp32 paired with br & select. Other combinations will be added in later CLs. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4162 BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jvoung@chromium.org Review URL: https://codereview.chromium.org/1141213004
parent 33492e76
......@@ -516,6 +516,7 @@ void CfgNode::genCode() {
LoweringContext &Context = Target->getContext();
// Lower the regular instructions.
Context.init(this);
Target->initNodeForLowering(this);
while (!Context.atEnd()) {
InstList::iterator Orig = Context.getCur();
if (llvm::isa<InstRet>(*Orig))
......
......@@ -195,6 +195,7 @@ cl::list<Ice::VerboseItem> VerboseList(
clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
clEnumValN(Ice::IceV_Most, "most",
"Use all verbose options except 'regalloc'"),
......
......@@ -172,6 +172,7 @@ enum VerboseItem {
IceV_Frame = 1 << 8,
IceV_AddrOpt = 1 << 9,
IceV_Random = 1 << 10,
IceV_Folding = 1 << 11,
IceV_All = ~IceV_None,
IceV_Most = IceV_All & ~IceV_LinearScan
};
......
......@@ -81,6 +81,7 @@ public:
bool isDeleted() const { return Deleted; }
void setDeleted() { Deleted = true; }
void setDead(bool Value = true) { Dead = Value; }
void deleteIfDead();
bool hasSideEffects() const { return HasSideEffects; }
......@@ -178,7 +179,9 @@ protected:
InstNumberT Number;
// Deleted means irrevocably deleted.
bool Deleted;
// Dead means pending deletion after liveness analysis converges.
// Dead means one of two things depending on context: (1) pending
// deletion after liveness analysis converges, or (2) marked for
// deletion during lowering due to a folded bool operation.
bool Dead;
// HasSideEffects means the instruction is something like a function
// call or a volatile load that can't be removed even if its Dest
......
......@@ -126,82 +126,85 @@ void TargetLowering::doNopInsertion() {
void TargetLowering::lower() {
assert(!Context.atEnd());
Inst *Inst = Context.getCur();
// Mark the current instruction as deleted before lowering,
// otherwise the Dest variable will likely get marked as non-SSA.
// See Variable::setDefinition().
Inst->setDeleted();
switch (Inst->getKind()) {
case Inst::Alloca:
lowerAlloca(llvm::dyn_cast<InstAlloca>(Inst));
break;
case Inst::Arithmetic:
lowerArithmetic(llvm::dyn_cast<InstArithmetic>(Inst));
break;
case Inst::Assign:
lowerAssign(llvm::dyn_cast<InstAssign>(Inst));
break;
case Inst::Br:
lowerBr(llvm::dyn_cast<InstBr>(Inst));
break;
case Inst::Call:
lowerCall(llvm::dyn_cast<InstCall>(Inst));
break;
case Inst::Cast:
lowerCast(llvm::dyn_cast<InstCast>(Inst));
break;
case Inst::ExtractElement:
lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst));
break;
case Inst::Fcmp:
lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst));
break;
case Inst::Icmp:
lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst));
break;
case Inst::InsertElement:
lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst));
break;
case Inst::IntrinsicCall: {
InstIntrinsicCall *Call = llvm::dyn_cast<InstIntrinsicCall>(Inst);
if (Call->getIntrinsicInfo().ReturnsTwice)
setCallsReturnsTwice(true);
lowerIntrinsicCall(Call);
break;
}
case Inst::Load:
lowerLoad(llvm::dyn_cast<InstLoad>(Inst));
break;
case Inst::Phi:
lowerPhi(llvm::dyn_cast<InstPhi>(Inst));
break;
case Inst::Ret:
lowerRet(llvm::dyn_cast<InstRet>(Inst));
break;
case Inst::Select:
lowerSelect(llvm::dyn_cast<InstSelect>(Inst));
break;
case Inst::Store:
lowerStore(llvm::dyn_cast<InstStore>(Inst));
break;
case Inst::Switch:
lowerSwitch(llvm::dyn_cast<InstSwitch>(Inst));
break;
case Inst::Unreachable:
lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst));
break;
case Inst::BundleLock:
case Inst::BundleUnlock:
case Inst::FakeDef:
case Inst::FakeUse:
case Inst::FakeKill:
case Inst::Target:
// These are all Target instruction types and shouldn't be
// encountered at this stage.
Func->setError("Can't lower unsupported instruction type");
break;
}
Inst->deleteIfDead();
if (!Inst->isDeleted()) {
// Mark the current instruction as deleted before lowering,
// otherwise the Dest variable will likely get marked as non-SSA.
// See Variable::setDefinition().
Inst->setDeleted();
switch (Inst->getKind()) {
case Inst::Alloca:
lowerAlloca(llvm::cast<InstAlloca>(Inst));
break;
case Inst::Arithmetic:
lowerArithmetic(llvm::cast<InstArithmetic>(Inst));
break;
case Inst::Assign:
lowerAssign(llvm::cast<InstAssign>(Inst));
break;
case Inst::Br:
lowerBr(llvm::cast<InstBr>(Inst));
break;
case Inst::Call:
lowerCall(llvm::cast<InstCall>(Inst));
break;
case Inst::Cast:
lowerCast(llvm::cast<InstCast>(Inst));
break;
case Inst::ExtractElement:
lowerExtractElement(llvm::cast<InstExtractElement>(Inst));
break;
case Inst::Fcmp:
lowerFcmp(llvm::cast<InstFcmp>(Inst));
break;
case Inst::Icmp:
lowerIcmp(llvm::cast<InstIcmp>(Inst));
break;
case Inst::InsertElement:
lowerInsertElement(llvm::cast<InstInsertElement>(Inst));
break;
case Inst::IntrinsicCall: {
InstIntrinsicCall *Call = llvm::cast<InstIntrinsicCall>(Inst);
if (Call->getIntrinsicInfo().ReturnsTwice)
setCallsReturnsTwice(true);
lowerIntrinsicCall(Call);
break;
}
case Inst::Load:
lowerLoad(llvm::cast<InstLoad>(Inst));
break;
case Inst::Phi:
lowerPhi(llvm::cast<InstPhi>(Inst));
break;
case Inst::Ret:
lowerRet(llvm::cast<InstRet>(Inst));
break;
case Inst::Select:
lowerSelect(llvm::cast<InstSelect>(Inst));
break;
case Inst::Store:
lowerStore(llvm::cast<InstStore>(Inst));
break;
case Inst::Switch:
lowerSwitch(llvm::cast<InstSwitch>(Inst));
break;
case Inst::Unreachable:
lowerUnreachable(llvm::cast<InstUnreachable>(Inst));
break;
case Inst::BundleLock:
case Inst::BundleUnlock:
case Inst::FakeDef:
case Inst::FakeUse:
case Inst::FakeKill:
case Inst::Target:
// These are all Target instruction types and shouldn't be
// encountered at this stage.
Func->setError("Can't lower unsupported instruction type");
break;
}
postLower();
postLower();
}
Context.advanceCur();
Context.advanceNext();
......
......@@ -221,6 +221,7 @@ public:
// Performs target-specific argument lowering.
virtual void lowerArguments() = 0;
virtual void initNodeForLowering(CfgNode *) {}
virtual void addProlog(CfgNode *Node) = 0;
virtual void addEpilog(CfgNode *Node) = 0;
......
......@@ -16,14 +16,81 @@
#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#define SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#include <unordered_map>
#include "assembler_ia32.h"
#include "IceDefs.h"
#include "IceInst.h"
#include "IceInstX8632.h"
#include "IceRegistersX8632.h"
#include "IceTargetLowering.h"
namespace Ice {
class BoolFoldingEntry {
BoolFoldingEntry(const BoolFoldingEntry &) = delete;
public:
BoolFoldingEntry()
: Instr(nullptr), IsComplex(false), IsLiveOut(true), NumUses(0) {}
explicit BoolFoldingEntry(Inst *I);
BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
// Instr is the instruction producing the i1-type variable of interest.
Inst *Instr;
// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
bool IsComplex;
// IsLiveOut is initialized conservatively to true, and is set to false when
// we encounter an instruction that ends Var's live range. We disable the
// folding optimization when Var is live beyond this basic block. Note that
// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
// always be true and the folding optimization will never be performed.
bool IsLiveOut;
// NumUses counts the number of times Var is used as a source operand in the
// basic block. If IsComplex is true and there is more than one use of Var,
// then the folding optimization is disabled for Var.
uint32_t NumUses;
};
class BoolFolding {
public:
enum BoolFoldingProducerKind {
PK_None,
PK_Icmp32,
PK_Icmp64,
PK_Fcmp,
PK_Trunc
};
// Currently the actual enum values are not used (other than CK_None), but we
// go
// ahead and produce them anyway for symmetry with the
// BoolFoldingProducerKind.
enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
private:
BoolFolding(const BoolFolding &) = delete;
BoolFolding &operator=(const BoolFolding &) = delete;
public:
BoolFolding() {}
static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
static bool hasComplexLowering(const Inst *Instr);
void init(CfgNode *Node);
const Inst *getProducerFor(const Operand *Opnd) const;
void dump(const Cfg *Func) const;
private:
// Returns true if Producers contains a valid entry for the given VarNum.
bool containsValid(SizeT VarNum) const {
auto Element = Producers.find(VarNum);
return Element != Producers.end() && Element->second.Instr != nullptr;
}
void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
// Producers maps Variable::Number to a BoolFoldingEntry.
std::unordered_map<SizeT, BoolFoldingEntry> Producers;
};
class TargetX8632 : public TargetLowering {
TargetX8632() = delete;
TargetX8632(const TargetX8632 &) = delete;
......@@ -63,6 +130,7 @@ public:
void emit(const ConstantDouble *C) const final;
void lowerArguments() override;
void initNodeForLowering(CfgNode *Node) override;
void addProlog(CfgNode *Node) override;
void addEpilog(CfgNode *Node) override;
// Ensure that a 64-bit Variable has been split into 2 32-bit
......@@ -157,6 +225,8 @@ protected:
Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
int32_t RegNum = Variable::NoRegister);
Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
// Legalize the first source operand for use in the cmp instruction.
Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
// Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand.
......@@ -507,6 +577,7 @@ protected:
private:
~TargetX8632() override {}
BoolFolding FoldingInfo;
};
class TargetDataX8632 : public TargetDataLowering {
......
......@@ -3,6 +3,8 @@
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
declare void @useInt(i32 %x)
define internal i32 @add8Bit(i32 %a, i32 %b) {
entry:
%a_8 = trunc i32 %a to i8
......@@ -278,6 +280,9 @@ entry:
%cmp = icmp slt i8 %a_8, %b_8
%ret = select i1 %cmp, i8 %a_8, i8 %b_8
%ret_ext = zext i8 %ret to i32
; Create a "fake" use of %cmp to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
ret i32 %ret_ext
}
; CHECK-LABEL: selectI8Var
......
; This tests the optimization where producers and consumers of i1 (bool)
; variables are combined to implicitly use flags instead of explicitly using
; stack or register variables.
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
declare void @use_value(i32)
; Basic cmp/branch folding.
define i32 @fold_cmp_br(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br
; CHECK: cmp
; CHECK: jge
; Cmp/branch folding with intervening instructions.
define i32 @fold_cmp_br_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br_intervening_insts
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jge
; Cmp/branch non-folding because of live-out.
define i32 @no_fold_cmp_br_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br label %next
next:
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_liveout
; CHECK: cmp
; CHECK: set
; CHECK: cmp
; CHECK: je
; Cmp/branch non-folding because of extra non-whitelisted uses.
define i32 @no_fold_cmp_br_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = zext i1 %cmp1 to i32
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 %result
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_non_whitelist
; CHECK: cmp
; CHECK: set
; CHECK: movzx
; CHECK: cmp
; CHECK: je
; Basic cmp/select folding.
define i32 @fold_cmp_select(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; 64-bit cmp/select folding.
define i64 @fold_cmp_select_64(i64 %arg1, i64 %arg2) {
entry:
%arg1_trunc = trunc i64 %arg1 to i32
%arg2_trunc = trunc i64 %arg2 to i32
%cmp1 = icmp slt i32 %arg1_trunc, %arg2_trunc
%result = select i1 %cmp1, i64 %arg1, i64 %arg2
ret i64 %result
}
; CHECK-LABEL: fold_cmp_select_64
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: mov
; Cmp/select folding with intervening instructions.
define i32 @fold_cmp_select_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_intervening_insts
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; Cmp/multi-select folding.
define i32 @fold_cmp_select_multi(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_multi
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of live-out.
define i32 @no_fold_cmp_select_multi_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
br label %next
next:
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_liveout
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of extra non-whitelisted uses.
define i32 @no_fold_cmp_select_multi_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%ext = zext i1 %cmp1 to i32
%partial1 = add i32 %a, %b
%partial2 = add i32 %partial1, %c
%result = add i32 %partial2, %ext
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_non_whitelist
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: movzx
; CHECK: add
; CHECK: add
; CHECK: add
......@@ -14,6 +14,11 @@ entry:
%cmp1 = icmp sgt i32 %a, %b
%cond2 = select i1 %cmp1, i32 10, i32 20
tail call void @useInt(i32 %cond2)
; Create "fake" uses of %cmp and %cmp1 to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
%d2 = zext i1 %cmp1 to i32
call void @useInt(i32 %d2)
ret void
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment