Commit 8e6bf6e1 by Jim Stichnoth

Subzero: Improve/refactor folding loads into the next instruction.

This is turned into a separate (O2-only) pass that looks for opportunities:

1. A Load instruction, or an AtomicLoad intrinsic that would be lowered just like a Load instruction,
2. Followed immediately by an instruction with a whitelisted kind that uses the Load dest variable as one of its operands,
3. Where the whitelisted instruction ends the live range of the Load dest variable.

In such cases, the original two instructions are deleted and a new instruction is added that folds the load into the whitelisted instruction.

We also do some work to splice the liveness information (Inst::LiveRangesEnded and Inst::isLastUse()) into the new instruction, so that the target lowering pass might still take advantage. Currently this is used quite sparingly, but in the future we could use that along with operator commutativity to choose among different lowering sequences to reduce register pressure.

The whitelisted instruction kinds are chosen based primarily on whether the main operation's native instruction can use a memory operand - e.g., arithmetic (add/sub/imul/etc), compare (cmp/ucomiss), cast (movsx/movzx/etc). Notably, call and ret are not included because arg passing is done through simple assignments, for which normal lowering is sufficient.

BUG= none
R=jvoung@chromium.org, mtrofin@chromium.org

Review URL: https://codereview.chromium.org/1169493002
parent bb9d11a5
...@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST, ...@@ -383,8 +383,9 @@ void ELFObjectWriter::writeDataOfType(SectionType ST,
for (VariableDeclaration::Initializer *Init : Var->getInitializers()) { for (VariableDeclaration::Initializer *Init : Var->getInitializers()) {
switch (Init->getKind()) { switch (Init->getKind()) {
case VariableDeclaration::Initializer::DataInitializerKind: { case VariableDeclaration::Initializer::DataInitializerKind: {
const auto Data = llvm::cast<VariableDeclaration::DataInitializer>( const auto Data =
Init)->getContents(); llvm::cast<VariableDeclaration::DataInitializer>(Init)
->getContents();
Section->appendData(Str, llvm::StringRef(Data.data(), Data.size())); Section->appendData(Str, llvm::StringRef(Data.data(), Data.size()));
break; break;
} }
......
...@@ -112,6 +112,44 @@ bool Inst::isLastUse(const Operand *TestSrc) const { ...@@ -112,6 +112,44 @@ bool Inst::isLastUse(const Operand *TestSrc) const {
return false; return false;
} }
// Rebuilds this instruction's LiveRangesEnded bitmask (and
// HasSideEffects flag) after an assignment has been folded into one
// of its operands.
//
// Given an instruction like:
//   a = b + c + [x,y] + e
// which was created from OrigInst:
//   a = b + c + d + e
// with SpliceAssn spliced in:
//   d = [x,y]
//
// Reconstruct the LiveRangesEnded bitmask in this instruction by
// combining the LiveRangesEnded values of OrigInst and SpliceAssn.
// If operands d and [x,y] contain a different number of variables,
// then the bitmask position for e may be different in OrigInst and
// the current instruction, requiring extra shifts and masks in the
// computation.  In the example above, OrigInst has variable e in bit
// position 3, whereas the current instruction has e in bit position 4
// because [x,y] consumes 2 bitmask slots while d only consumed 1.
//
// Additionally, set HasSideEffects if either OrigInst or SpliceAssn
// have HasSideEffects set.
//
// Reports a fatal error if SpliceAssn's dest is not one of OrigInst's
// source operands.
void Inst::spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn) {
  HasSideEffects |= OrigInst->HasSideEffects;
  HasSideEffects |= SpliceAssn->HasSideEffects;
  // Find the bitmask index of SpliceAssn's dest within OrigInst.
  Variable *SpliceDest = SpliceAssn->getDest();
  SizeT Index = 0;
  for (SizeT I = 0; I < OrigInst->getSrcSize(); ++I) {
    Operand *Src = OrigInst->getSrc(I);
    if (Src == SpliceDest) {
      // Do the shift in the bitmask's own type.  A plain "1 << Index"
      // is evaluated as a signed int, which overflows once Index
      // reaches the top bit of int.
      const LREndedBits One = 1;
      // Bits below Index: operands to the left of the splice point,
      // whose positions are unchanged.
      LREndedBits LeftMask = OrigInst->LiveRangesEnded & ((One << Index) - 1);
      // Bits above Index: operands to the right of the splice point,
      // with the single slot of the replaced operand dropped.
      LREndedBits RightMask = OrigInst->LiveRangesEnded >> (Index + 1);
      // The spliced operand's bits go at Index, and the right-hand
      // operands are moved past however many variables the new
      // operand (this instruction's operand I) contains.
      LiveRangesEnded = LeftMask | (SpliceAssn->LiveRangesEnded << Index) |
                        (RightMask << (Index + getSrc(I)->getNumVars()));
      return;
    }
    // NOTE(review): this counts variables in the current
    // instruction's operand I, not OrigInst's.  The two agree only
    // when the operands before the splice point are the same in both
    // instructions -- confirm for any new caller.
    Index += getSrc(I)->getNumVars();
  }
  llvm::report_fatal_error("Failed to find splice operand");
}
void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) { void Inst::livenessLightweight(Cfg *Func, LivenessBV &Live) {
assert(!isDeleted()); assert(!isDeleted());
resetLastUses(); resetLastUses();
......
...@@ -102,6 +102,7 @@ public: ...@@ -102,6 +102,7 @@ public:
} }
bool isLastUse(const Operand *Src) const; bool isLastUse(const Operand *Src) const;
void spliceLivenessInfo(Inst *OrigInst, Inst *SpliceAssn);
// Returns a list of out-edges corresponding to a terminator // Returns a list of out-edges corresponding to a terminator
// instruction, which is the last instruction of the block. // instruction, which is the last instruction of the block.
......
...@@ -482,6 +482,7 @@ void TargetX8632::translateO2() { ...@@ -482,6 +482,7 @@ void TargetX8632::translateO2() {
return; return;
Func->dump("After x86 address mode opt"); Func->dump("After x86 address mode opt");
doLoadOpt();
Func->genCode(); Func->genCode();
if (Func->hasError()) if (Func->hasError())
return; return;
...@@ -572,6 +573,126 @@ void TargetX8632::translateOm1() { ...@@ -572,6 +573,126 @@ void TargetX8632::translateOm1() {
} }
} }
namespace {

// Returns the integer value held by Opnd when it is a
// ConstantInteger32; any other kind of operand yields
// Intrinsics::MemoryOrderInvalid.
uint64_t getConstantMemoryOrder(Operand *Opnd) {
  auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd);
  if (Integer == nullptr)
    return Intrinsics::MemoryOrderInvalid;
  return Integer->getValue();
}

// Decides whether a Load's dest can be folded into a 2-operand
// instruction, and performs the operand substitution if so.
//
// Folding is possible exactly when LoadDest appears as one, and only
// one, of Src0 and Src1.  In that case the matching reference is
// overwritten with LoadSrc and true is returned.  When LoadDest
// matches neither operand, or both, Src0 and Src1 are left untouched
// and false is returned.
bool canFoldLoadIntoBinaryInst(Operand *LoadSrc, Variable *LoadDest,
                               Operand *&Src0, Operand *&Src1) {
  const bool MatchesSrc0 = (Src0 == LoadDest);
  const bool MatchesSrc1 = (Src1 == LoadDest);
  // Equal flags mean "matches neither" or "matches both"; neither
  // case can be folded.
  if (MatchesSrc0 == MatchesSrc1)
    return false;
  if (MatchesSrc0)
    Src0 = LoadSrc;
  else
    Src1 = LoadSrc;
  return true;
}

} // end of anonymous namespace
// Folds a load into the instruction that immediately follows it.
//
// Every node of Func is scanned.  Whenever the current instruction is
// a Load (or an AtomicLoad intrinsic that qualifies, see below) and
// the next instruction ends the live range of the load's dest
// variable, the pass tries to build a replacement for the next
// instruction that takes the memory operand directly.  Arithmetic,
// Icmp, Fcmp, Select and Cast instructions are candidates.  On
// success both original instructions are marked deleted, the
// replacement is inserted, and the liveness information of the two
// originals is spliced into it.
void TargetX8632::doLoadOpt() {
  for (CfgNode *Node : Func->getNodes()) {
    Context.init(Node);
    while (!Context.atEnd()) {
      Variable *LoadDest = nullptr;
      Operand *LoadSrc = nullptr;
      Inst *CurInst = Context.getCur();
      Inst *Next = Context.getNextInst();
      // Determine whether the current instruction is a Load
      // instruction or equivalent.
      if (auto *Load = llvm::dyn_cast<InstLoad>(CurInst)) {
        // An InstLoad always qualifies.
        LoadDest = Load->getDest();
        // Ask only for the memory operand to be formed; it is not
        // legalized here.
        const bool DoLegalize = false;
        LoadSrc = formMemoryOperand(Load->getSourceAddress(),
                                    LoadDest->getType(), DoLegalize);
      } else if (auto *Intrin = llvm::dyn_cast<InstIntrinsicCall>(CurInst)) {
        // An AtomicLoad intrinsic qualifies as long as it has a valid
        // memory ordering, and can be implemented in a single
        // instruction (i.e., not i64).
        Intrinsics::IntrinsicID ID = Intrin->getIntrinsicInfo().ID;
        if (ID == Intrinsics::AtomicLoad &&
            Intrin->getDest()->getType() != IceType_i64 &&
            Intrinsics::isMemoryOrderValid(
                ID, getConstantMemoryOrder(Intrin->getArg(1)))) {
          LoadDest = Intrin->getDest();
          const bool DoLegalize = false;
          LoadSrc = formMemoryOperand(Intrin->getArg(0), LoadDest->getType(),
                                      DoLegalize);
        }
      }
      // A Load instruction can be folded into the following
      // instruction only if the following instruction ends the Load's
      // Dest variable's live range.
      if (LoadDest && Next && Next->isLastUse(LoadDest)) {
        assert(LoadSrc);
        Inst *NewInst = nullptr;
        if (auto *Arith = llvm::dyn_cast<InstArithmetic>(Next)) {
          Operand *Src0 = Arith->getSrc(0);
          Operand *Src1 = Arith->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstArithmetic::create(Func, Arith->getOp(),
                                             Arith->getDest(), Src0, Src1);
          }
        } else if (auto *Icmp = llvm::dyn_cast<InstIcmp>(Next)) {
          Operand *Src0 = Icmp->getSrc(0);
          Operand *Src1 = Icmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstIcmp::create(Func, Icmp->getCondition(),
                                       Icmp->getDest(), Src0, Src1);
          }
        } else if (auto *Fcmp = llvm::dyn_cast<InstFcmp>(Next)) {
          Operand *Src0 = Fcmp->getSrc(0);
          Operand *Src1 = Fcmp->getSrc(1);
          if (canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstFcmp::create(Func, Fcmp->getCondition(),
                                       Fcmp->getDest(), Src0, Src1);
          }
        } else if (auto *Select = llvm::dyn_cast<InstSelect>(Next)) {
          Operand *Src0 = Select->getTrueOperand();
          Operand *Src1 = Select->getFalseOperand();
          // The condition operand is carried over to the new
          // instruction unchanged.  If it is the load dest, folding
          // would delete the load while the condition still reads its
          // result, so do not fold in that case.
          if (Select->getCondition() != LoadDest &&
              canFoldLoadIntoBinaryInst(LoadSrc, LoadDest, Src0, Src1)) {
            NewInst = InstSelect::create(Func, Select->getDest(),
                                         Select->getCondition(), Src0, Src1);
          }
        } else if (auto *Cast = llvm::dyn_cast<InstCast>(Next)) {
          // The load dest can always be folded into a Cast
          // instruction.
          Variable *Src0 = llvm::dyn_cast<Variable>(Cast->getSrc(0));
          if (Src0 == LoadDest) {
            NewInst = InstCast::create(Func, Cast->getCastKind(),
                                       Cast->getDest(), LoadSrc);
          }
        }
        if (NewInst) {
          CurInst->setDeleted();
          Next->setDeleted();
          Context.insert(NewInst);
          // Update NewInst->LiveRangesEnded so that target lowering
          // may benefit.  Also update NewInst->HasSideEffects.
          NewInst->spliceLivenessInfo(Next, CurInst);
        }
      }
      Context.advanceCur();
      Context.advanceNext();
    }
  }
  Func->dump("After load optimization");
}
bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) { bool TargetX8632::doBranchOpt(Inst *I, const CfgNode *NextNode) {
if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) { if (InstX8632Br *Br = llvm::dyn_cast<InstX8632Br>(I)) {
return Br->optimizeBranch(NextNode); return Br->optimizeBranch(NextNode);
...@@ -1170,6 +1291,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) { ...@@ -1170,6 +1291,10 @@ void TargetX8632::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest(); Variable *Dest = Inst->getDest();
Operand *Src0 = legalize(Inst->getSrc(0)); Operand *Src0 = legalize(Inst->getSrc(0));
Operand *Src1 = legalize(Inst->getSrc(1)); Operand *Src1 = legalize(Inst->getSrc(1));
if (Inst->isCommutative()) {
if (!llvm::isa<Variable>(Src0) && llvm::isa<Variable>(Src1))
std::swap(Src0, Src1);
}
if (Dest->getType() == IceType_i64) { if (Dest->getType() == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest)); Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest)); Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
...@@ -2891,18 +3016,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) { ...@@ -2891,18 +3016,6 @@ void TargetX8632::lowerInsertElement(const InstInsertElement *Inst) {
} }
} }
namespace {

// Returns the integer value held by Opnd when it is a
// ConstantInteger32; any other kind of operand yields
// Intrinsics::MemoryOrderInvalid.
uint64_t getConstantMemoryOrder(Operand *Opnd) {
  auto *Integer = llvm::dyn_cast<ConstantInteger32>(Opnd);
  if (Integer == nullptr)
    return Intrinsics::MemoryOrderInvalid;
  return Integer->getValue();
}

} // end of anonymous namespace
void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) { switch (Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID) {
case Intrinsics::AtomicCmpxchg: { case Intrinsics::AtomicCmpxchg: {
...@@ -3006,9 +3119,10 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { ...@@ -3006,9 +3119,10 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Func->setError("Unexpected memory ordering for AtomicRMW"); Func->setError("Unexpected memory ordering for AtomicRMW");
return; return;
} }
lowerAtomicRMW(Instr->getDest(), lowerAtomicRMW(
static_cast<uint32_t>(llvm::cast<ConstantInteger32>( Instr->getDest(),
Instr->getArg(0))->getValue()), static_cast<uint32_t>(
llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
Instr->getArg(1), Instr->getArg(2)); Instr->getArg(1), Instr->getArg(2));
return; return;
case Intrinsics::AtomicStore: { case Intrinsics::AtomicStore: {
...@@ -3852,66 +3966,9 @@ void TargetX8632::lowerLoad(const InstLoad *Load) { ...@@ -3852,66 +3966,9 @@ void TargetX8632::lowerLoad(const InstLoad *Load) {
// OperandX8632Mem operand. Note that the address mode // OperandX8632Mem operand. Note that the address mode
// optimization already creates an OperandX8632Mem operand, so it // optimization already creates an OperandX8632Mem operand, so it
// doesn't need another level of transformation. // doesn't need another level of transformation.
Type Ty = Load->getDest()->getType();
Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
// Fuse this load with a subsequent Arithmetic instruction in the
// following situations:
// a=[mem]; c=b+a ==> c=b+[mem] if last use of a and a not in b
// a=[mem]; c=a+b ==> c=b+[mem] if commutative and above is true
//
// Fuse this load with a subsequent Cast instruction:
// a=[mem]; b=cast(a) ==> b=cast([mem]) if last use of a
//
// TODO: Clean up and test thoroughly.
// (E.g., if there is an mfence-all make sure the load ends up on the
// same side of the fence).
//
// TODO: Why limit to Arithmetic instructions? This could probably be
// applied to most any instruction type. Look at all source operands
// in the following instruction, and if there is one instance of the
// load instruction's dest variable, and that instruction ends that
// variable's live range, then make the substitution. Deal with
// commutativity optimization in the arithmetic instruction lowering.
//
// TODO(stichnot): Do load fusing as a separate pass. Run it before
// the bool folding pass. Modify Ice::Inst to allow src operands to
// be replaced, including updating Inst::LiveRangesEnded, to avoid
// having to manually mostly clone each instruction type.
Inst *NextInst = Context.getNextInst();
Variable *DestLoad = Load->getDest(); Variable *DestLoad = Load->getDest();
if (NextInst && NextInst->isLastUse(DestLoad)) { Type Ty = DestLoad->getType();
if (auto *Arith = llvm::dyn_cast<InstArithmetic>(NextInst)) { Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
InstArithmetic *NewArith = nullptr;
Variable *Src0Arith = llvm::dyn_cast<Variable>(Arith->getSrc(0));
Variable *Src1Arith = llvm::dyn_cast<Variable>(Arith->getSrc(1));
if (Src1Arith == DestLoad && DestLoad != Src0Arith) {
NewArith = InstArithmetic::create(
Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(0), Src0);
} else if (Src0Arith == DestLoad && Arith->isCommutative() &&
DestLoad != Src1Arith) {
NewArith = InstArithmetic::create(
Func, Arith->getOp(), Arith->getDest(), Arith->getSrc(1), Src0);
}
if (NewArith) {
Arith->setDeleted();
Context.advanceNext();
lowerArithmetic(NewArith);
return;
}
} else if (auto *Cast = llvm::dyn_cast<InstCast>(NextInst)) {
Variable *Src0Cast = llvm::dyn_cast<Variable>(Cast->getSrc(0));
if (Src0Cast == DestLoad) {
InstCast *NewCast =
InstCast::create(Func, Cast->getCastKind(), Cast->getDest(), Src0);
Cast->setDeleted();
Context.advanceNext();
lowerCast(NewCast);
return;
}
}
}
InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0); InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign); lowerAssign(Assign);
} }
...@@ -4639,7 +4696,8 @@ Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) { ...@@ -4639,7 +4696,8 @@ Operand *TargetX8632::legalizeSrc0ForCmp(Operand *Src0, Operand *Src1) {
return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg); return legalize(Src0, IsSrc1ImmOrReg ? (Legal_Reg | Legal_Mem) : Legal_Reg);
} }
OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) { OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty,
bool DoLegalize) {
OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand); OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(Operand);
// It may be the case that address mode optimization already creates // It may be the case that address mode optimization already creates
// an OperandX8632Mem, so in that case it wouldn't need another level // an OperandX8632Mem, so in that case it wouldn't need another level
...@@ -4656,7 +4714,7 @@ OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) { ...@@ -4656,7 +4714,7 @@ OperandX8632Mem *TargetX8632::formMemoryOperand(Operand *Operand, Type Ty) {
} }
Mem = OperandX8632Mem::create(Func, Ty, Base, Offset); Mem = OperandX8632Mem::create(Func, Ty, Base, Offset);
} }
return llvm::cast<OperandX8632Mem>(legalize(Mem)); return llvm::cast<OperandX8632Mem>(DoLegalize ? legalize(Mem) : Mem);
} }
Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) { Variable *TargetX8632::makeReg(Type Type, int32_t RegNum) {
......
...@@ -101,6 +101,7 @@ public: ...@@ -101,6 +101,7 @@ public:
void translateOm1() override; void translateOm1() override;
void translateO2() override; void translateO2() override;
void doLoadOpt();
bool doBranchOpt(Inst *I, const CfgNode *NextNode) override; bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;
SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; } SizeT getNumRegisters() const override { return RegX8632::Reg_NUM; }
...@@ -229,7 +230,8 @@ protected: ...@@ -229,7 +230,8 @@ protected:
// Turn a pointer operand into a memory operand that can be // Turn a pointer operand into a memory operand that can be
// used by a real load/store operation. Legalizes the operand as well. // used by a real load/store operation. Legalizes the operand as well.
// This is a nop if the operand is already a legal memory operand. // This is a nop if the operand is already a legal memory operand.
OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty); OperandX8632Mem *formMemoryOperand(Operand *Ptr, Type Ty,
bool DoLegalize = true);
Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister); Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
static Type stackSlotType(); static Type stackSlotType();
......
...@@ -14,7 +14,7 @@ declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32) ...@@ -14,7 +14,7 @@ declare void @llvm.nacl.atomic.store.i32(i32, i32*, i32)
@g32_c = internal global [4 x i8] zeroinitializer, align 4 @g32_c = internal global [4 x i8] zeroinitializer, align 4
@g32_d = internal global [4 x i8] zeroinitializer, align 4 @g32_d = internal global [4 x i8] zeroinitializer, align 4
define i32 @test_fused_load_add_a() { define i32 @test_fused_load_sub_a() {
entry: entry:
%p_alloca = alloca i8, i32 4, align 4 %p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32* %p_alloca_bc = bitcast i8* %p_alloca to i32*
...@@ -22,39 +22,39 @@ entry: ...@@ -22,39 +22,39 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32* %p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1 %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32* %p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1 %l_b = load i32, i32* %p_b, align 1
%l_b2 = add i32 %l_b, 1 %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1 store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32* %p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1 %l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1 %l_c2 = sub i32 1, %l_c
call void @llvm.nacl.atomic.fence.all() call void @llvm.nacl.atomic.fence.all()
store i32 %l_c2, i32* %p_c, align 1 store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2 ret i32 %l_c2
} }
; CHECK-LABEL: test_fused_load_add_a ; CHECK-LABEL: test_fused_load_sub_a
; alloca store ; alloca store
; CHECK: mov {{.*}},esp ; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7 ; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence) ; atomic store (w/ its own mfence)
; The load + add are optimized into one everywhere. ; The load + sub are optimized into one everywhere.
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: mfence ; CHECK: mfence
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mfence ; CHECK: mfence
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; Test with the fence moved up a bit. ; Test with the fence moved up a bit.
define i32 @test_fused_load_add_b() { define i32 @test_fused_load_sub_b() {
entry: entry:
%p_alloca = alloca i8, i32 4, align 4 %p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32* %p_alloca_bc = bitcast i8* %p_alloca to i32*
...@@ -62,40 +62,40 @@ entry: ...@@ -62,40 +62,40 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32* %p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1 %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32* %p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1 %l_b = load i32, i32* %p_b, align 1
%l_b2 = add i32 %l_b, 1 %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1 store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32* %p_c = bitcast [4 x i8]* @g32_c to i32*
call void @llvm.nacl.atomic.fence.all() call void @llvm.nacl.atomic.fence.all()
%l_c = load i32, i32* %p_c, align 1 %l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1 %l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1 store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2 ret i32 %l_c2
} }
; CHECK-LABEL: test_fused_load_add_b ; CHECK-LABEL: test_fused_load_sub_b
; alloca store ; alloca store
; CHECK: mov {{.*}},esp ; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7 ; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence) ; atomic store (w/ its own mfence)
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: mfence ; CHECK: mfence
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_b ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: mfence ; CHECK: mfence
; Load + add can still be optimized into one instruction ; Load + sub can still be optimized into one instruction
; because it is not separated by a fence. ; because it is not separated by a fence.
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; Test with the fence splitting a load/add. ; Test with the fence splitting a load/sub.
define i32 @test_fused_load_add_c() { define i32 @test_fused_load_sub_c() {
entry: entry:
%p_alloca = alloca i8, i32 4, align 4 %p_alloca = alloca i8, i32 4, align 4
%p_alloca_bc = bitcast i8* %p_alloca to i32* %p_alloca_bc = bitcast i8* %p_alloca to i32*
...@@ -103,38 +103,39 @@ entry: ...@@ -103,38 +103,39 @@ entry:
%p_a = bitcast [4 x i8]* @g32_a to i32* %p_a = bitcast [4 x i8]* @g32_a to i32*
%l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6) %l_a = call i32 @llvm.nacl.atomic.load.i32(i32* %p_a, i32 6)
%l_a2 = add i32 %l_a, 1 %l_a2 = sub i32 1, %l_a
call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6) call void @llvm.nacl.atomic.store.i32(i32 %l_a2, i32* %p_a, i32 6)
%p_b = bitcast [4 x i8]* @g32_b to i32* %p_b = bitcast [4 x i8]* @g32_b to i32*
%l_b = load i32, i32* %p_b, align 1 %l_b = load i32, i32* %p_b, align 1
call void @llvm.nacl.atomic.fence.all() call void @llvm.nacl.atomic.fence.all()
%l_b2 = add i32 %l_b, 1 %l_b2 = sub i32 1, %l_b
store i32 %l_b2, i32* %p_b, align 1 store i32 %l_b2, i32* %p_b, align 1
%p_c = bitcast [4 x i8]* @g32_c to i32* %p_c = bitcast [4 x i8]* @g32_c to i32*
%l_c = load i32, i32* %p_c, align 1 %l_c = load i32, i32* %p_c, align 1
%l_c2 = add i32 %l_c, 1 %l_c2 = sub i32 1, %l_c
store i32 %l_c2, i32* %p_c, align 1 store i32 %l_c2, i32* %p_c, align 1
ret i32 %l_c2 ret i32 %l_c2
} }
; CHECK-LABEL: test_fused_load_add_c ; CHECK-LABEL: test_fused_load_sub_c
; alloca store ; alloca store
; CHECK: mov {{.*}},esp ; CHECK: mov {{.*}},esp
; CHECK: mov DWORD PTR {{.*}},0x3e7 ; CHECK: mov DWORD PTR {{.*}},0x3e7
; atomic store (w/ its own mfence) ; atomic store (w/ its own mfence)
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_a ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_a
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: mfence ; CHECK: mfence
; This load + add are no longer optimized into one, ; This load + sub are no longer optimized into one,
; though perhaps it should be legal as long as ; though perhaps it should be legal as long as
; the load stays on the same side of the fence. ; the load stays on the same side of the fence.
; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b ; CHECK: mov {{.*}},DWORD PTR {{.*}}g32_b
; CHECK: mfence ; CHECK: mfence
; CHECK: add {{.*}},0x1 ; CHECK: mov {{.*}},0x1
; CHECK: sub
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
; CHECK: add {{.*}},DWORD PTR {{.*}}g32_c ; CHECK: sub {{.*}},DWORD PTR {{.*}}g32_c
; CHECK: mov DWORD PTR ; CHECK: mov DWORD PTR
......
...@@ -95,17 +95,17 @@ entry: ...@@ -95,17 +95,17 @@ entry:
next: next:
%ptr = inttoptr i32 %iptr to i32* %ptr = inttoptr i32 %iptr to i32*
%r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6) %r = call i32 @llvm.nacl.atomic.load.i32(i32* %ptr, i32 6)
%r2 = add i32 %r, 32 %r2 = sub i32 32, %r
ret i32 %r2 ret i32 %r2
} }
; CHECK-LABEL: test_atomic_load_32_with_arith ; CHECK-LABEL: test_atomic_load_32_with_arith
; CHECK: mov {{.*}},DWORD ; CHECK: mov {{.*}},DWORD
; The next instruction may be a separate load or folded into a sub.
; ;
; In O2 mode, we know that the load and add are going to be fused. ; In O2 mode, we know that the load and sub are going to be fused.
; O2-LABEL: test_atomic_load_32_with_arith ; O2-LABEL: test_atomic_load_32_with_arith
; O2: mov {{.*}},DWORD ; O2: mov {{.*}},DWORD
; O2: add {{.*}},DWORD ; O2: sub {{.*}},DWORD
define i32 @test_atomic_load_32_ignored(i32 %iptr) { define i32 @test_atomic_load_32_ignored(i32 %iptr) {
entry: entry:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment