Commit a59ae6ff by Jim Stichnoth

Subzero: Fold icmp into br/select lowering.

Originally there was a peephole-style optimization in lowerIcmp() that looks ahead to see if the next instruction is a conditional branch with the right properties, and if so, folds the icmp and br into a single lowering sequence. However, sometimes extra instructions come between the icmp and br instructions, disabling the folding even though it would still be possible. One thought is to do the folding inside lowerBr() instead of lowerIcmp(), by looking backward for a suitable icmp instruction. The problem here is that the icmp lowering code may leave lowered instructions that can't easily be dead-code eliminated, e.g. instructions lacking a dest variable. Instead, before lowering a basic block, we do a prepass on the block to identify folding candidates. For the icmp/br example, the prepass would tentatively delete the icmp instruction and then the br lowering would fold in the icmp. This folding can also be extended to several producers: icmp (i32 operands), icmp (i64 operands), fcmp, trunc .. to i1 and several consumers: br, select, sext, zext This CL starts with 2 combinations: icmp32 paired with br & select. Other combinations will be added in later CLs. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4162 BUG= https://code.google.com/p/nativeclient/issues/detail?id=4095 R=jvoung@chromium.org Review URL: https://codereview.chromium.org/1141213004
parent 33492e76
...@@ -516,6 +516,7 @@ void CfgNode::genCode() { ...@@ -516,6 +516,7 @@ void CfgNode::genCode() {
LoweringContext &Context = Target->getContext(); LoweringContext &Context = Target->getContext();
// Lower the regular instructions. // Lower the regular instructions.
Context.init(this); Context.init(this);
Target->initNodeForLowering(this);
while (!Context.atEnd()) { while (!Context.atEnd()) {
InstList::iterator Orig = Context.getCur(); InstList::iterator Orig = Context.getCur();
if (llvm::isa<InstRet>(*Orig)) if (llvm::isa<InstRet>(*Orig))
......
...@@ -195,6 +195,7 @@ cl::list<Ice::VerboseItem> VerboseList( ...@@ -195,6 +195,7 @@ cl::list<Ice::VerboseItem> VerboseList(
clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"), clEnumValN(Ice::IceV_Frame, "frame", "Stack frame layout details"),
clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"), clEnumValN(Ice::IceV_AddrOpt, "addropt", "Address mode optimization"),
clEnumValN(Ice::IceV_Random, "random", "Randomization details"), clEnumValN(Ice::IceV_Random, "random", "Randomization details"),
clEnumValN(Ice::IceV_Folding, "fold", "Instruction folding details"),
clEnumValN(Ice::IceV_All, "all", "Use all verbose options"), clEnumValN(Ice::IceV_All, "all", "Use all verbose options"),
clEnumValN(Ice::IceV_Most, "most", clEnumValN(Ice::IceV_Most, "most",
"Use all verbose options except 'regalloc'"), "Use all verbose options except 'regalloc'"),
......
...@@ -172,6 +172,7 @@ enum VerboseItem { ...@@ -172,6 +172,7 @@ enum VerboseItem {
IceV_Frame = 1 << 8, IceV_Frame = 1 << 8,
IceV_AddrOpt = 1 << 9, IceV_AddrOpt = 1 << 9,
IceV_Random = 1 << 10, IceV_Random = 1 << 10,
IceV_Folding = 1 << 11,
IceV_All = ~IceV_None, IceV_All = ~IceV_None,
IceV_Most = IceV_All & ~IceV_LinearScan IceV_Most = IceV_All & ~IceV_LinearScan
}; };
......
...@@ -81,6 +81,7 @@ public: ...@@ -81,6 +81,7 @@ public:
bool isDeleted() const { return Deleted; } bool isDeleted() const { return Deleted; }
void setDeleted() { Deleted = true; } void setDeleted() { Deleted = true; }
void setDead(bool Value = true) { Dead = Value; }
void deleteIfDead(); void deleteIfDead();
bool hasSideEffects() const { return HasSideEffects; } bool hasSideEffects() const { return HasSideEffects; }
...@@ -178,7 +179,9 @@ protected: ...@@ -178,7 +179,9 @@ protected:
InstNumberT Number; InstNumberT Number;
// Deleted means irrevocably deleted. // Deleted means irrevocably deleted.
bool Deleted; bool Deleted;
// Dead means pending deletion after liveness analysis converges. // Dead means one of two things depending on context: (1) pending
// deletion after liveness analysis converges, or (2) marked for
// deletion during lowering due to a folded bool operation.
bool Dead; bool Dead;
// HasSideEffects means the instruction is something like a function // HasSideEffects means the instruction is something like a function
// call or a volatile load that can't be removed even if its Dest // call or a volatile load that can't be removed even if its Dest
......
...@@ -126,82 +126,85 @@ void TargetLowering::doNopInsertion() { ...@@ -126,82 +126,85 @@ void TargetLowering::doNopInsertion() {
void TargetLowering::lower() { void TargetLowering::lower() {
assert(!Context.atEnd()); assert(!Context.atEnd());
Inst *Inst = Context.getCur(); Inst *Inst = Context.getCur();
// Mark the current instruction as deleted before lowering, Inst->deleteIfDead();
// otherwise the Dest variable will likely get marked as non-SSA. if (!Inst->isDeleted()) {
// See Variable::setDefinition(). // Mark the current instruction as deleted before lowering,
Inst->setDeleted(); // otherwise the Dest variable will likely get marked as non-SSA.
switch (Inst->getKind()) { // See Variable::setDefinition().
case Inst::Alloca: Inst->setDeleted();
lowerAlloca(llvm::dyn_cast<InstAlloca>(Inst)); switch (Inst->getKind()) {
break; case Inst::Alloca:
case Inst::Arithmetic: lowerAlloca(llvm::cast<InstAlloca>(Inst));
lowerArithmetic(llvm::dyn_cast<InstArithmetic>(Inst)); break;
break; case Inst::Arithmetic:
case Inst::Assign: lowerArithmetic(llvm::cast<InstArithmetic>(Inst));
lowerAssign(llvm::dyn_cast<InstAssign>(Inst)); break;
break; case Inst::Assign:
case Inst::Br: lowerAssign(llvm::cast<InstAssign>(Inst));
lowerBr(llvm::dyn_cast<InstBr>(Inst)); break;
break; case Inst::Br:
case Inst::Call: lowerBr(llvm::cast<InstBr>(Inst));
lowerCall(llvm::dyn_cast<InstCall>(Inst)); break;
break; case Inst::Call:
case Inst::Cast: lowerCall(llvm::cast<InstCall>(Inst));
lowerCast(llvm::dyn_cast<InstCast>(Inst)); break;
break; case Inst::Cast:
case Inst::ExtractElement: lowerCast(llvm::cast<InstCast>(Inst));
lowerExtractElement(llvm::dyn_cast<InstExtractElement>(Inst)); break;
break; case Inst::ExtractElement:
case Inst::Fcmp: lowerExtractElement(llvm::cast<InstExtractElement>(Inst));
lowerFcmp(llvm::dyn_cast<InstFcmp>(Inst)); break;
break; case Inst::Fcmp:
case Inst::Icmp: lowerFcmp(llvm::cast<InstFcmp>(Inst));
lowerIcmp(llvm::dyn_cast<InstIcmp>(Inst)); break;
break; case Inst::Icmp:
case Inst::InsertElement: lowerIcmp(llvm::cast<InstIcmp>(Inst));
lowerInsertElement(llvm::dyn_cast<InstInsertElement>(Inst)); break;
break; case Inst::InsertElement:
case Inst::IntrinsicCall: { lowerInsertElement(llvm::cast<InstInsertElement>(Inst));
InstIntrinsicCall *Call = llvm::dyn_cast<InstIntrinsicCall>(Inst); break;
if (Call->getIntrinsicInfo().ReturnsTwice) case Inst::IntrinsicCall: {
setCallsReturnsTwice(true); InstIntrinsicCall *Call = llvm::cast<InstIntrinsicCall>(Inst);
lowerIntrinsicCall(Call); if (Call->getIntrinsicInfo().ReturnsTwice)
break; setCallsReturnsTwice(true);
} lowerIntrinsicCall(Call);
case Inst::Load: break;
lowerLoad(llvm::dyn_cast<InstLoad>(Inst)); }
break; case Inst::Load:
case Inst::Phi: lowerLoad(llvm::cast<InstLoad>(Inst));
lowerPhi(llvm::dyn_cast<InstPhi>(Inst)); break;
break; case Inst::Phi:
case Inst::Ret: lowerPhi(llvm::cast<InstPhi>(Inst));
lowerRet(llvm::dyn_cast<InstRet>(Inst)); break;
break; case Inst::Ret:
case Inst::Select: lowerRet(llvm::cast<InstRet>(Inst));
lowerSelect(llvm::dyn_cast<InstSelect>(Inst)); break;
break; case Inst::Select:
case Inst::Store: lowerSelect(llvm::cast<InstSelect>(Inst));
lowerStore(llvm::dyn_cast<InstStore>(Inst)); break;
break; case Inst::Store:
case Inst::Switch: lowerStore(llvm::cast<InstStore>(Inst));
lowerSwitch(llvm::dyn_cast<InstSwitch>(Inst)); break;
break; case Inst::Switch:
case Inst::Unreachable: lowerSwitch(llvm::cast<InstSwitch>(Inst));
lowerUnreachable(llvm::dyn_cast<InstUnreachable>(Inst)); break;
break; case Inst::Unreachable:
case Inst::BundleLock: lowerUnreachable(llvm::cast<InstUnreachable>(Inst));
case Inst::BundleUnlock: break;
case Inst::FakeDef: case Inst::BundleLock:
case Inst::FakeUse: case Inst::BundleUnlock:
case Inst::FakeKill: case Inst::FakeDef:
case Inst::Target: case Inst::FakeUse:
// These are all Target instruction types and shouldn't be case Inst::FakeKill:
// encountered at this stage. case Inst::Target:
Func->setError("Can't lower unsupported instruction type"); // These are all Target instruction types and shouldn't be
break; // encountered at this stage.
} Func->setError("Can't lower unsupported instruction type");
break;
}
postLower(); postLower();
}
Context.advanceCur(); Context.advanceCur();
Context.advanceNext(); Context.advanceNext();
......
...@@ -221,6 +221,7 @@ public: ...@@ -221,6 +221,7 @@ public:
// Performs target-specific argument lowering. // Performs target-specific argument lowering.
virtual void lowerArguments() = 0; virtual void lowerArguments() = 0;
virtual void initNodeForLowering(CfgNode *) {}
virtual void addProlog(CfgNode *Node) = 0; virtual void addProlog(CfgNode *Node) = 0;
virtual void addEpilog(CfgNode *Node) = 0; virtual void addEpilog(CfgNode *Node) = 0;
......
...@@ -260,6 +260,152 @@ ICETYPE_TABLE ...@@ -260,6 +260,152 @@ ICETYPE_TABLE
} // end of anonymous namespace } // end of anonymous namespace
BoolFoldingEntry::BoolFoldingEntry(Inst *I)
: Instr(I), IsComplex(BoolFolding::hasComplexLowering(I)), IsLiveOut(true),
NumUses(0) {}
BoolFolding::BoolFoldingProducerKind
BoolFolding::getProducerKind(const Inst *Instr) {
if (llvm::isa<InstIcmp>(Instr)) {
if (Instr->getSrc(0)->getType() != IceType_i64)
return PK_Icmp32;
return PK_None; // TODO(stichnot): actually PK_Icmp64;
}
return PK_None; // TODO(stichnot): remove this
if (llvm::isa<InstFcmp>(Instr))
return PK_Fcmp;
if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
switch (Cast->getCastKind()) {
default:
return PK_None;
case InstCast::Trunc:
return PK_Trunc;
}
}
return PK_None;
}
BoolFolding::BoolFoldingConsumerKind
BoolFolding::getConsumerKind(const Inst *Instr) {
if (llvm::isa<InstBr>(Instr))
return CK_Br;
if (llvm::isa<InstSelect>(Instr))
return CK_Select;
return CK_None; // TODO(stichnot): remove this
if (auto *Cast = llvm::dyn_cast<InstCast>(Instr)) {
switch (Cast->getCastKind()) {
default:
return CK_None;
case InstCast::Sext:
return CK_Sext;
case InstCast::Zext:
return CK_Zext;
}
}
return CK_None;
}
// Returns true if the producing instruction has a "complex" lowering
// sequence. This generally means that its lowering sequence requires
// more than one conditional branch, namely 64-bit integer compares
// and some floating-point compares. When this is true, and there is
// more than one consumer, we prefer to disable the folding
// optimization because it minimizes branches.
bool BoolFolding::hasComplexLowering(const Inst *Instr) {
switch (getProducerKind(Instr)) {
default:
return false;
case PK_Icmp64:
return true;
case PK_Fcmp:
return TableFcmp[llvm::cast<InstFcmp>(Instr)->getCondition()].C2 !=
CondX86::Br_None;
}
}
void BoolFolding::init(CfgNode *Node) {
Producers.clear();
for (Inst &Instr : Node->getInsts()) {
// Check whether Instr is a valid producer.
Variable *Var = Instr.getDest();
if (!Instr.isDeleted() // only consider non-deleted instructions
&& Var // only instructions with an actual dest var
&& Var->getType() == IceType_i1 // only bool-type dest vars
&& getProducerKind(&Instr) != PK_None) { // white-listed instructions
Producers[Var->getIndex()] = BoolFoldingEntry(&Instr);
}
// Check each src variable against the map.
for (SizeT I = 0; I < Instr.getSrcSize(); ++I) {
Operand *Src = Instr.getSrc(I);
SizeT NumVars = Src->getNumVars();
for (SizeT J = 0; J < NumVars; ++J) {
const Variable *Var = Src->getVar(J);
SizeT VarNum = Var->getIndex();
if (containsValid(VarNum)) {
if (I != 0 // All valid consumers use Var as the first source operand
|| getConsumerKind(&Instr) == CK_None // must be white-listed
|| (Producers[VarNum].IsComplex && // complex can't be multi-use
Producers[VarNum].NumUses > 0)) {
setInvalid(VarNum);
continue;
}
++Producers[VarNum].NumUses;
if (Instr.isLastUse(Var)) {
Producers[VarNum].IsLiveOut = false;
}
}
}
}
}
for (auto &I : Producers) {
// Ignore entries previously marked invalid.
if (I.second.Instr == nullptr)
continue;
// Disable the producer if its dest may be live beyond this block.
if (I.second.IsLiveOut) {
setInvalid(I.first);
continue;
}
// Mark as "dead" rather than outright deleting. This is so that
// other peephole style optimizations during or before lowering
// have access to this instruction in undeleted form. See for
// example tryOptimizedCmpxchgCmpBr().
I.second.Instr->setDead();
}
}
const Inst *BoolFolding::getProducerFor(const Operand *Opnd) const {
auto *Var = llvm::dyn_cast<const Variable>(Opnd);
if (Var == nullptr)
return nullptr;
SizeT VarNum = Var->getIndex();
auto Element = Producers.find(VarNum);
if (Element == Producers.end())
return nullptr;
return Element->second.Instr;
}
void BoolFolding::dump(const Cfg *Func) const {
if (!ALLOW_DUMP || !Func->isVerbose(IceV_Folding))
return;
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
for (auto &I : Producers) {
if (I.second.Instr == nullptr)
continue;
Str << "Found foldable producer:\n ";
I.second.Instr->dump(Func);
Str << "\n";
}
}
void TargetX8632::initNodeForLowering(CfgNode *Node) {
FoldingInfo.init(Node);
FoldingInfo.dump(Func);
}
TargetX8632::TargetX8632(Cfg *Func) TargetX8632::TargetX8632(Cfg *Func)
: TargetLowering(Func), : TargetLowering(Func),
InstructionSet(static_cast<X86InstructionSet>( InstructionSet(static_cast<X86InstructionSet>(
...@@ -1719,12 +1865,35 @@ void TargetX8632::lowerAssign(const InstAssign *Inst) { ...@@ -1719,12 +1865,35 @@ void TargetX8632::lowerAssign(const InstAssign *Inst) {
void TargetX8632::lowerBr(const InstBr *Inst) { void TargetX8632::lowerBr(const InstBr *Inst) {
if (Inst->isUnconditional()) { if (Inst->isUnconditional()) {
_br(Inst->getTargetUnconditional()); _br(Inst->getTargetUnconditional());
} else { return;
Operand *Src0 = legalize(Inst->getCondition(), Legal_Reg | Legal_Mem);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_cmp(Src0, Zero);
_br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
} }
Operand *Cond = Inst->getCondition();
// Handle folding opportunities.
if (const class Inst *Producer = FoldingInfo.getProducerFor(Cond)) {
assert(Producer->isDeleted());
switch (BoolFolding::getProducerKind(Producer)) {
default:
break;
case BoolFolding::PK_Icmp32: {
// TODO(stichnot): Refactor similarities between this block and
// the corresponding code in lowerIcmp().
auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
Operand *Src0 = Producer->getSrc(0);
Operand *Src1 = legalize(Producer->getSrc(1));
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1);
_br(getIcmp32Mapping(Cmp->getCondition()), Inst->getTargetTrue(),
Inst->getTargetFalse());
return;
}
}
}
Operand *Src0 = legalize(Cond, Legal_Reg | Legal_Mem);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_cmp(Src0, Zero);
_br(CondX86::Br_ne, Inst->getTargetTrue(), Inst->getTargetFalse());
} }
void TargetX8632::lowerCall(const InstCall *Instr) { void TargetX8632::lowerCall(const InstCall *Instr) {
...@@ -2678,37 +2847,6 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2678,37 +2847,6 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
return; return;
} }
// If Src1 is an immediate, or known to be a physical register, we can
// allow Src0 to be a memory operand. Otherwise, Src0 must be copied into
// a physical register. (Actually, either Src0 or Src1 can be chosen for
// the physical register, but unfortunately we have to commit to one or
// the other before register allocation.)
bool IsSrc1ImmOrReg = false;
if (llvm::isa<Constant>(Src1)) {
IsSrc1ImmOrReg = true;
} else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
if (Var->hasReg())
IsSrc1ImmOrReg = true;
}
// Try to fuse a compare immediately followed by a conditional branch. This
// is possible when the compare dest and the branch source operands are the
// same, and are their only uses. TODO: implement this optimization for i64.
if (InstBr *NextBr = llvm::dyn_cast_or_null<InstBr>(Context.getNextInst())) {
if (Src0->getType() != IceType_i64 && !NextBr->isUnconditional() &&
Dest == NextBr->getSrc(0) && NextBr->isLastUse(Dest)) {
NextBr->setDeleted();
Operand *Src0RM =
legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
_cmp(Src0RM, Src1);
_br(getIcmp32Mapping(Inst->getCondition()), NextBr->getTargetTrue(),
NextBr->getTargetFalse());
// Skip over the following branch instruction.
Context.advanceNext();
return;
}
}
// a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1: // a=icmp cond, b, c ==> cmp b,c; a=1; br cond,L1; FakeUse(a); a=0; L1:
if (Src0->getType() == IceType_i64) { if (Src0->getType() == IceType_i64) {
InstIcmp::ICond Condition = Inst->getCondition(); InstIcmp::ICond Condition = Inst->getCondition();
...@@ -2737,8 +2875,7 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) { ...@@ -2737,8 +2875,7 @@ void TargetX8632::lowerIcmp(const InstIcmp *Inst) {
} }
// cmp b, c // cmp b, c
Operand *Src0RM = Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
_cmp(Src0RM, Src1); _cmp(Src0RM, Src1);
_setcc(Dest, getIcmp32Mapping(Inst->getCondition())); _setcc(Dest, getIcmp32Mapping(Inst->getCondition()));
} }
...@@ -4001,7 +4138,51 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) { ...@@ -4001,7 +4138,51 @@ void TargetX8632::lowerSelect(const InstSelect *Inst) {
return; return;
} }
// a=d?b:c ==> cmp d,0; a=b; jne L1; FakeUse(a); a=c; L1: // Handle folding opportunities.
if (const class Inst *Producer = FoldingInfo.getProducerFor(Condition)) {
assert(Producer->isDeleted());
switch (BoolFolding::getProducerKind(Producer)) {
default:
break;
case BoolFolding::PK_Icmp32: {
// d=cmp e,f; a=d?b:c ==> cmp e,f; a=b; jne L1; a=c; L1:
auto *Cmp = llvm::dyn_cast<InstIcmp>(Producer);
InstX8632Label *Label = InstX8632Label::create(Func, this);
Operand *Src0 = Producer->getSrc(0);
Operand *Src1 = legalize(Producer->getSrc(1));
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1);
// This is the same code as below (for both i64 and non-i64),
// except without the _cmp instruction and with a different
// branch condition. TODO(stichnot): refactor.
if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcLoRI = legalize(loOperand(SrcT), Legal_Reg | Legal_Imm);
Operand *SrcHiRI = legalize(hiOperand(SrcT), Legal_Reg | Legal_Imm);
_mov(DestLo, SrcLoRI);
_mov(DestHi, SrcHiRI);
_br(getIcmp32Mapping(Cmp->getCondition()), Label);
Operand *SrcFLo = loOperand(SrcF);
Operand *SrcFHi = hiOperand(SrcF);
SrcLoRI = legalize(SrcFLo, Legal_Reg | Legal_Imm);
SrcHiRI = legalize(SrcFHi, Legal_Reg | Legal_Imm);
_mov_nonkillable(DestLo, SrcLoRI);
_mov_nonkillable(DestHi, SrcHiRI);
} else {
SrcT = legalize(SrcT, Legal_Reg | Legal_Imm);
_mov(Dest, SrcT);
_br(getIcmp32Mapping(Cmp->getCondition()), Label);
SrcF = legalize(SrcF, Legal_Reg | Legal_Imm);
_mov_nonkillable(Dest, SrcF);
}
Context.insert(Label);
return;
}
}
}
// a=d?b:c ==> cmp d,0; a=b; jne L1; a=c; L1:
Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem); Operand *ConditionRM = legalize(Condition, Legal_Reg | Legal_Mem);
Constant *Zero = Ctx->getConstantZero(IceType_i32); Constant *Zero = Ctx->getConstantZero(IceType_i32);
InstX8632Label *Label = InstX8632Label::create(Func, this); InstX8632Label *Label = InstX8632Label::create(Func, this);
...@@ -4534,6 +4715,23 @@ Variable *TargetX8632::legalizeToVar(Operand *From, int32_t RegNum) { ...@@ -4534,6 +4715,23 @@ Variable *TargetX8632::legalizeToVar(Operand *From, int32_t RegNum) {
return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum)); return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
} }
// For the cmp instruction, if Src1 is an immediate, or known to be a
// physical register, we can allow Src0 to be a memory operand.
// Otherwise, Src0 must be copied into a physical register.
// (Actually, either Src0 or Src1 can be chosen for the physical
// register, but unfortunately we have to commit to one or the other
// before register allocation.)
Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
bool IsSrc1ImmOrReg = false;
if (llvm::isa<Constant>(Src1)) {
IsSrc1ImmOrReg = true;
} else if (Variable *Var = llvm::dyn_cast<Variable>(Src1)) {
if (Var->hasReg())
IsSrc1ImmOrReg = true;
}
return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
}
OperandX8632Mem *TargetX8632::FormMemoryOperand(Operand *Operand, Type Ty) { OperandX8632Mem *TargetX8632::FormMemoryOperand(Operand *Operand, Type Ty) {
OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand); OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
// It may be the case that address mode optimization already creates // It may be the case that address mode optimization already creates
......
...@@ -16,14 +16,81 @@ ...@@ -16,14 +16,81 @@
#ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H #ifndef SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#define SUBZERO_SRC_ICETARGETLOWERINGX8632_H #define SUBZERO_SRC_ICETARGETLOWERINGX8632_H
#include <unordered_map>
#include "assembler_ia32.h" #include "assembler_ia32.h"
#include "IceDefs.h" #include "IceDefs.h"
#include "IceInst.h"
#include "IceInstX8632.h" #include "IceInstX8632.h"
#include "IceRegistersX8632.h" #include "IceRegistersX8632.h"
#include "IceTargetLowering.h" #include "IceTargetLowering.h"
namespace Ice { namespace Ice {
class BoolFoldingEntry {
BoolFoldingEntry(const BoolFoldingEntry &) = delete;
public:
BoolFoldingEntry()
: Instr(nullptr), IsComplex(false), IsLiveOut(true), NumUses(0) {}
explicit BoolFoldingEntry(Inst *I);
BoolFoldingEntry &operator=(const BoolFoldingEntry &) = default;
// Instr is the instruction producing the i1-type variable of interest.
Inst *Instr;
// IsComplex is the cached result of BoolFolding::hasComplexLowering(Instr).
bool IsComplex;
// IsLiveOut is initialized conservatively to true, and is set to false when
// we encounter an instruction that ends Var's live range. We disable the
// folding optimization when Var is live beyond this basic block. Note that
// if liveness analysis is not performed (e.g. in Om1 mode), IsLiveOut will
// always be true and the folding optimization will never be performed.
bool IsLiveOut;
// NumUses counts the number of times Var is used as a source operand in the
// basic block. If IsComplex is true and there is more than one use of Var,
// then the folding optimization is disabled for Var.
uint32_t NumUses;
};
class BoolFolding {
public:
enum BoolFoldingProducerKind {
PK_None,
PK_Icmp32,
PK_Icmp64,
PK_Fcmp,
PK_Trunc
};
// Currently the actual enum values are not used (other than CK_None), but we
// go
// ahead and produce them anyway for symmetry with the
// BoolFoldingProducerKind.
enum BoolFoldingConsumerKind { CK_None, CK_Br, CK_Select, CK_Sext, CK_Zext };
private:
BoolFolding(const BoolFolding &) = delete;
BoolFolding &operator=(const BoolFolding &) = delete;
public:
BoolFolding() {}
static BoolFoldingProducerKind getProducerKind(const Inst *Instr);
static BoolFoldingConsumerKind getConsumerKind(const Inst *Instr);
static bool hasComplexLowering(const Inst *Instr);
void init(CfgNode *Node);
const Inst *getProducerFor(const Operand *Opnd) const;
void dump(const Cfg *Func) const;
private:
// Returns true if Producers contains a valid entry for the given VarNum.
bool containsValid(SizeT VarNum) const {
auto Element = Producers.find(VarNum);
return Element != Producers.end() && Element->second.Instr != nullptr;
}
void setInvalid(SizeT VarNum) { Producers[VarNum].Instr = nullptr; }
// Producers maps Variable::Number to a BoolFoldingEntry.
std::unordered_map<SizeT, BoolFoldingEntry> Producers;
};
class TargetX8632 : public TargetLowering { class TargetX8632 : public TargetLowering {
TargetX8632() = delete; TargetX8632() = delete;
TargetX8632(const TargetX8632 &) = delete; TargetX8632(const TargetX8632 &) = delete;
...@@ -63,6 +130,7 @@ public: ...@@ -63,6 +130,7 @@ public:
void emit(const ConstantDouble *C) const final; void emit(const ConstantDouble *C) const final;
void lowerArguments() override; void lowerArguments() override;
void initNodeForLowering(CfgNode *Node) override;
void addProlog(CfgNode *Node) override; void addProlog(CfgNode *Node) override;
void addEpilog(CfgNode *Node) override; void addEpilog(CfgNode *Node) override;
// Ensure that a 64-bit Variable has been split into 2 32-bit // Ensure that a 64-bit Variable has been split into 2 32-bit
...@@ -157,6 +225,8 @@ protected: ...@@ -157,6 +225,8 @@ protected:
Operand *legalize(Operand *From, LegalMask Allowed = Legal_All, Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
int32_t RegNum = Variable::NoRegister); int32_t RegNum = Variable::NoRegister);
Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister); Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
// Legalize the first source operand for use in the cmp instruction.
Operand *legalizeSrc0ForCmp(Operand *Src0, Operand *Src1);
// Turn a pointer operand into a memory operand that can be // Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well. // used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand. // This is a nop if the operand is already a legal memory operand.
...@@ -507,6 +577,7 @@ protected: ...@@ -507,6 +577,7 @@ protected:
private: private:
~TargetX8632() override {} ~TargetX8632() override {}
BoolFolding FoldingInfo;
}; };
class TargetDataX8632 : public TargetDataLowering { class TargetDataX8632 : public TargetDataLowering {
......
...@@ -3,6 +3,8 @@ ...@@ -3,6 +3,8 @@
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s ; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s ; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 | FileCheck %s
declare void @useInt(i32 %x)
define internal i32 @add8Bit(i32 %a, i32 %b) { define internal i32 @add8Bit(i32 %a, i32 %b) {
entry: entry:
%a_8 = trunc i32 %a to i8 %a_8 = trunc i32 %a to i8
...@@ -278,6 +280,9 @@ entry: ...@@ -278,6 +280,9 @@ entry:
%cmp = icmp slt i8 %a_8, %b_8 %cmp = icmp slt i8 %a_8, %b_8
%ret = select i1 %cmp, i8 %a_8, i8 %b_8 %ret = select i1 %cmp, i8 %a_8, i8 %b_8
%ret_ext = zext i8 %ret to i32 %ret_ext = zext i8 %ret to i32
; Create a "fake" use of %cmp to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
ret i32 %ret_ext ret i32 %ret_ext
} }
; CHECK-LABEL: selectI8Var ; CHECK-LABEL: selectI8Var
......
; This tests the optimization where producers and consumers of i1 (bool)
; variables are combined to implicitly use flags instead of explicitly using
; stack or register variables.
; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 | FileCheck %s
declare void @use_value(i32)
; Basic cmp/branch folding.
define i32 @fold_cmp_br(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br
; CHECK: cmp
; CHECK: jge
; Cmp/branch folding with intervening instructions.
define i32 @fold_cmp_br_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: fold_cmp_br_intervening_insts
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jge
; Cmp/branch non-folding because of live-out.
define i32 @no_fold_cmp_br_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
br label %next
next:
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 1
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_liveout
; CHECK: cmp
; CHECK: set
; CHECK: cmp
; CHECK: je
; Cmp/branch non-folding because of extra non-whitelisted uses.
define i32 @no_fold_cmp_br_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = zext i1 %cmp1 to i32
br i1 %cmp1, label %branch1, label %branch2
branch1:
ret i32 %result
branch2:
ret i32 2
}
; CHECK-LABEL: no_fold_cmp_br_non_whitelist
; CHECK: cmp
; CHECK: set
; CHECK: movzx
; CHECK: cmp
; CHECK: je
; Basic cmp/select folding.
define i32 @fold_cmp_select(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; 64-bit cmp/select folding.
define i64 @fold_cmp_select_64(i64 %arg1, i64 %arg2) {
entry:
%arg1_trunc = trunc i64 %arg1 to i32
%arg2_trunc = trunc i64 %arg2 to i32
%cmp1 = icmp slt i32 %arg1_trunc, %arg2_trunc
%result = select i1 %cmp1, i64 %arg1, i64 %arg2
ret i64 %result
}
; CHECK-LABEL: fold_cmp_select_64
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; CHECK: mov
; Cmp/select folding with intervening instructions.
define i32 @fold_cmp_select_intervening_insts(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
call void @use_value(i32 %arg1)
%result = select i1 %cmp1, i32 %arg1, i32 %arg2
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_intervening_insts
; CHECK-NOT: cmp
; CHECK: call
; CHECK: cmp
; CHECK: jl
; CHECK: mov
; Cmp/multi-select folding.
define i32 @fold_cmp_select_multi(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: fold_cmp_select_multi
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: cmp
; CHECK: jl
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of live-out.
define i32 @no_fold_cmp_select_multi_liveout(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
br label %next
next:
%c = select i1 %cmp1, i32 123, i32 %arg1
%partial = add i32 %a, %b
%result = add i32 %partial, %c
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_liveout
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: add
; CHECK: add
; Cmp/multi-select non-folding because of extra non-whitelisted uses.
define i32 @no_fold_cmp_select_multi_non_whitelist(i32 %arg1, i32 %arg2) {
entry:
%cmp1 = icmp slt i32 %arg1, %arg2
%a = select i1 %cmp1, i32 %arg1, i32 %arg2
%b = select i1 %cmp1, i32 %arg2, i32 %arg1
%c = select i1 %cmp1, i32 123, i32 %arg1
%ext = zext i1 %cmp1 to i32
%partial1 = add i32 %a, %b
%partial2 = add i32 %partial1, %c
%result = add i32 %partial2, %ext
ret i32 %result
}
; CHECK-LABEL: no_fold_cmp_select_multi_non_whitelist
; CHECK: set
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: cmp
; CHECK: jne
; CHECK: movzx
; CHECK: add
; CHECK: add
; CHECK: add
...@@ -14,6 +14,11 @@ entry: ...@@ -14,6 +14,11 @@ entry:
%cmp1 = icmp sgt i32 %a, %b %cmp1 = icmp sgt i32 %a, %b
%cond2 = select i1 %cmp1, i32 10, i32 20 %cond2 = select i1 %cmp1, i32 10, i32 20
tail call void @useInt(i32 %cond2) tail call void @useInt(i32 %cond2)
; Create "fake" uses of %cmp and %cmp1 to prevent O2 bool folding.
%d1 = zext i1 %cmp to i32
call void @useInt(i32 %d1)
%d2 = zext i1 %cmp1 to i32
call void @useInt(i32 %d2)
ret void ret void
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment