Commit c820ddf2 by Jan Voung

Add a peephole to fuse cmpxchg w/ later cmp+branch.

The cmpxchg instruction already sets ZF when comparing the return value against the expected value, so there is no need to compare for equality again. Lots of pexes-in-the-wild have this pattern. Some compare against a constant, some compare against a variable.

BUG=https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org
Review URL: https://codereview.chromium.org/413903002
parent 839c4cea
...@@ -54,9 +54,21 @@ ...@@ -54,9 +54,21 @@
FOR_ALL_RMWOP_TYPES(X) FOR_ALL_RMWOP_TYPES(X)
#undef X #undef X
#define X(type) \ #define X(type) \
type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \ type test_val_cmp_swap(volatile type *ptr, type oldval, type newval) { \
return __sync_val_compare_and_swap(ptr, oldval, newval); \ return __sync_val_compare_and_swap(ptr, oldval, newval); \
} \
type test_val_cmp_swap_loop(volatile type *ptr, type oldval, type newval) { \
type prev; \
type succeeded_first_try = 1; \
while (1) { \
prev = __sync_val_compare_and_swap(ptr, oldval, newval); \
if (prev == oldval) \
break; \
succeeded_first_try = 0; \
oldval = prev; \
} \
return succeeded_first_try; \
} }
ATOMIC_TYPE_TABLE ATOMIC_TYPE_TABLE
......
...@@ -22,8 +22,9 @@ ...@@ -22,8 +22,9 @@
FOR_ALL_RMWOP_TYPES(X) FOR_ALL_RMWOP_TYPES(X)
#undef X #undef X
#define X(type) \ #define X(type) \
type test_val_cmp_swap(volatile type *ptr, type oldval, type newval); type test_val_cmp_swap(volatile type *ptr, type oldval, type newval); \
type test_val_cmp_swap_loop(volatile type *ptr, type oldval, type newval);
ATOMIC_TYPE_TABLE ATOMIC_TYPE_TABLE
#undef X #undef X
...@@ -127,33 +127,43 @@ void testAtomicRMW(volatile Type *AtomicLoc, ...@@ -127,33 +127,43 @@ void testAtomicRMW(volatile Type *AtomicLoc,
template <typename Type> template <typename Type>
void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests, void testValCompareAndSwap(volatile Type *AtomicLoc, size_t &TotalTests,
size_t &Passes, size_t &Failures) { size_t &Passes, size_t &Failures) {
for (size_t i = 0; i < NumValues; ++i) { typedef Type (*FuncType)(volatile Type *, Type, Type);
Type Value1 = static_cast<Type>(Values[i]); static struct {
for (size_t j = 0; j < NumValues; ++j) { const char *Name;
Type Value2 = static_cast<Type>(Values[j]); FuncType FuncLlc;
for (size_t f = 0; f < 2; ++f) { FuncType FuncSz;
bool flip = f; } Funcs[] = {{"val_cmp_swap", test_val_cmp_swap, Subzero_::test_val_cmp_swap},
++TotalTests; {"val_cmp_swap_loop", test_val_cmp_swap_loop,
*AtomicLoc = Value1; Subzero_::test_val_cmp_swap_loop}};
Type ResultSz1 = Subzero_::test_val_cmp_swap( const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
AtomicLoc, flip ? Value2 : Value1, Value2); for (size_t f = 0; f < NumFuncs; ++f) {
Type ResultSz2 = *AtomicLoc; for (size_t i = 0; i < NumValues; ++i) {
*AtomicLoc = Value1; Type Value1 = static_cast<Type>(Values[i]);
Type ResultLlc1 = test_val_cmp_swap( for (size_t j = 0; j < NumValues; ++j) {
AtomicLoc, flip ? Value2 : Value1, Value2); Type Value2 = static_cast<Type>(Values[j]);
Type ResultLlc2 = *AtomicLoc; for (size_t f = 0; f < 2; ++f) {
if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) { bool flip = f;
++Passes; ++TotalTests;
} else { *AtomicLoc = Value1;
++Failures; Type ResultSz1 =
std::cout << "test_val_cmp_swap" << (CHAR_BIT * sizeof(Type)) << "(" Funcs[f].FuncSz(AtomicLoc, flip ? Value2 : Value1, Value2);
<< static_cast<uint64_t>(Value1) << ", " Type ResultSz2 = *AtomicLoc;
<< static_cast<uint64_t>(Value2) *AtomicLoc = Value1;
<< "): sz1=" << static_cast<uint64_t>(ResultSz1) Type ResultLlc1 =
<< " llc1=" << static_cast<uint64_t>(ResultLlc1) Funcs[f].FuncLlc(AtomicLoc, flip ? Value2 : Value1, Value2);
<< " sz2=" << static_cast<uint64_t>(ResultSz2) Type ResultLlc2 = *AtomicLoc;
<< " llc2=" << static_cast<uint64_t>(ResultLlc2) if (ResultSz1 == ResultLlc1 && ResultSz2 == ResultLlc2) {
<< "\n"; ++Passes;
} else {
++Failures;
std::cout << "test_" << Funcs[f].Name << (CHAR_BIT * sizeof(Type))
<< "(" << static_cast<uint64_t>(Value1) << ", "
<< static_cast<uint64_t>(Value2)
<< "): sz1=" << static_cast<uint64_t>(ResultSz1)
<< " llc1=" << static_cast<uint64_t>(ResultLlc1)
<< " sz2=" << static_cast<uint64_t>(ResultSz2)
<< " llc2=" << static_cast<uint64_t>(ResultLlc2) << "\n";
}
} }
} }
} }
......
...@@ -104,7 +104,8 @@ void CfgNode::placePhiLoads() { ...@@ -104,7 +104,8 @@ void CfgNode::placePhiLoads() {
// added before any branch instruction, and also if the block ends // added before any branch instruction, and also if the block ends
// with a compare instruction followed by a branch instruction that we // with a compare instruction followed by a branch instruction that we
// may want to fuse, it's better to insert the new assignments before // may want to fuse, it's better to insert the new assignments before
// the compare instruction. // the compare instruction. The tryOptimizedCmpxchgCmpBr() method
// assumes this ordering of instructions.
// //
// Note that this transformation takes the Phi dest variables out of // Note that this transformation takes the Phi dest variables out of
// SSA form, as there may be assignments to the dest variable in // SSA form, as there may be assignments to the dest variable in
......
...@@ -38,12 +38,12 @@ void LoweringContext::insert(Inst *Inst) { ...@@ -38,12 +38,12 @@ void LoweringContext::insert(Inst *Inst) {
Inst->updateVars(getNode()); Inst->updateVars(getNode());
} }
void LoweringContext::skipDeleted(InstList::iterator &I) { void LoweringContext::skipDeleted(InstList::iterator &I) const {
while (I != End && (*I)->isDeleted()) while (I != End && (*I)->isDeleted())
++I; ++I;
} }
void LoweringContext::advance(InstList::iterator &I) { void LoweringContext::advance(InstList::iterator &I) const {
if (I != End) { if (I != End) {
++I; ++I;
skipDeleted(I); skipDeleted(I);
......
...@@ -41,6 +41,12 @@ public: ...@@ -41,6 +41,12 @@ public:
return NULL; return NULL;
return *Next; return *Next;
} }
// Advance Iter past deleted instructions and return the instruction it
// lands on, or NULL once the end of the instruction list is reached.
// Iter is updated in place so repeated calls walk forward through the list.
Inst *getNextInst(InstList::iterator &Iter) const {
  advance(Iter);
  return (Iter == End) ? NULL : *Iter;
}
CfgNode *getNode() const { return Node; } CfgNode *getNode() const { return Node; }
bool atEnd() const { return Cur == End; } bool atEnd() const { return Cur == End; }
InstList::iterator getCur() const { return Cur; } InstList::iterator getCur() const { return Cur; }
...@@ -68,8 +74,8 @@ private: ...@@ -68,8 +74,8 @@ private:
// End is a copy of Insts.end(), used if Next needs to be advanced. // End is a copy of Insts.end(), used if Next needs to be advanced.
InstList::iterator End; InstList::iterator End;
void skipDeleted(InstList::iterator &I); void skipDeleted(InstList::iterator &I) const;
void advance(InstList::iterator &I); void advance(InstList::iterator &I) const;
LoweringContext(const LoweringContext &) LLVM_DELETED_FUNCTION; LoweringContext(const LoweringContext &) LLVM_DELETED_FUNCTION;
LoweringContext &operator=(const LoweringContext &) LLVM_DELETED_FUNCTION; LoweringContext &operator=(const LoweringContext &) LLVM_DELETED_FUNCTION;
}; };
......
...@@ -2654,12 +2654,9 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) { ...@@ -2654,12 +2654,9 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Operand *PtrToMem = Instr->getArg(0); Operand *PtrToMem = Instr->getArg(0);
Operand *Expected = Instr->getArg(1); Operand *Expected = Instr->getArg(1);
Operand *Desired = Instr->getArg(2); Operand *Desired = Instr->getArg(2);
if (tryOptimizedCmpxchgCmpBr(DestPrev, PtrToMem, Expected, Desired))
return;
lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired); lowerAtomicCmpxchg(DestPrev, PtrToMem, Expected, Desired);
// TODO(jvoung): If we peek ahead a few instructions and see how
// DestPrev is used (typically via another compare and branch),
// we may be able to optimize. If the result truly is used by a
// compare + branch, and the comparison is for equality, then we can
// optimize out the later compare, and fuse with the later branch.
return; return;
} }
case Intrinsics::AtomicFence: case Intrinsics::AtomicFence:
...@@ -2975,6 +2972,79 @@ void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, ...@@ -2975,6 +2972,79 @@ void TargetX8632::lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr,
_mov(DestPrev, T_eax); _mov(DestPrev, T_eax);
} }
// Attempt to fuse the flag-setting effect of an atomic cmpxchg with an
// immediately following "icmp eq" + conditional branch.  The lock cmpxchg
// already sets ZF according to whether the value loaded from memory equals
// the Expected operand, so the explicit equality compare can be dropped and
// the branch emitted directly as "br e".  Returns true if the optimized
// sequence was emitted (and the old icmp/br deleted); false means the
// caller must fall back to the naive lowerAtomicCmpxchg().
bool TargetX8632::tryOptimizedCmpxchgCmpBr(Variable *Dest, Operand *PtrToMem,
                                           Operand *Expected,
                                           Operand *Desired) {
  // The transformation relies on liveness information (isLastUse() below),
  // which is not computed at -Om1, so bail out at that opt level.
  if (Ctx->getOptLevel() == Opt_m1)
    return false;
  // Peek ahead a few instructions and see how Dest is used.
  // It's very common to have:
  //
  // %x = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* ptr, i32 %expected, ...)
  // [%y_phi = ...] // list of phi stores
  // %p = icmp eq i32 %x, %expected
  // br i1 %p, label %l1, label %l2
  //
  // which we can optimize into:
  //
  // %x = <cmpxchg code>
  // [%y_phi = ...] // list of phi stores
  // br eq, %l1, %l2
  InstList::iterator I = Context.getCur();
  // I is currently the InstIntrinsicCall. Peek past that.
  // This assumes that the atomic cmpxchg has not been lowered yet,
  // so that the instructions seen in the scan from "Cur" is simple.
  assert(llvm::isa<InstIntrinsicCall>(*I));
  Inst *NextInst = Context.getNextInst(I);
  if (!NextInst)
    return false;
  // There might be phi assignments right before the compare+branch, since this
  // could be a backward branch for a loop. This placement of assignments is
  // determined by placePhiStores().
  std::vector<InstAssign *> PhiAssigns;
  while (InstAssign *PhiAssign = llvm::dyn_cast<InstAssign>(NextInst)) {
    // If a phi assignment overwrites Dest, the later compare would no longer
    // be reading the cmpxchg result, so the fusion is invalid.
    if (PhiAssign->getDest() == Dest)
      return false;
    PhiAssigns.push_back(PhiAssign);
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
  }
  if (InstIcmp *NextCmp = llvm::dyn_cast<InstIcmp>(NextInst)) {
    // Only an equality compare between Dest and Expected (in either operand
    // order) matches the condition already encoded in ZF by the cmpxchg.
    if (!(NextCmp->getCondition() == InstIcmp::Eq &&
          ((NextCmp->getSrc(0) == Dest && NextCmp->getSrc(1) == Expected) ||
           (NextCmp->getSrc(1) == Dest && NextCmp->getSrc(0) == Expected)))) {
      return false;
    }
    NextInst = Context.getNextInst(I);
    if (!NextInst)
      return false;
    if (InstBr *NextBr = llvm::dyn_cast<InstBr>(NextInst)) {
      // The branch must be conditional on the compare's result, and that
      // result must be dead after the branch; otherwise the icmp would still
      // have to be materialized and nothing is saved.
      if (!NextBr->isUnconditional() &&
          NextCmp->getDest() == NextBr->getCondition() &&
          NextBr->isLastUse(NextCmp->getDest())) {
        lowerAtomicCmpxchg(Dest, PtrToMem, Expected, Desired);
        for (size_t i = 0; i < PhiAssigns.size(); ++i) {
          // Lower the phi assignments now, before the branch (same placement
          // as before).
          InstAssign *PhiAssign = PhiAssigns[i];
          lowerAssign(PhiAssign);
          PhiAssign->setDeleted();
          Context.advanceNext();
        }
        // Branch directly on the ZF left behind by the lock cmpxchg.
        _br(InstX8632::Br_e, NextBr->getTargetTrue(), NextBr->getTargetFalse());
        // Skip over the old compare and branch, by deleting them.
        NextCmp->setDeleted();
        NextBr->setDeleted();
        Context.advanceNext();
        Context.advanceNext();
        return true;
      }
    }
  }
  return false;
}
void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation, void TargetX8632::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
Operand *Ptr, Operand *Val) { Operand *Ptr, Operand *Val) {
bool NeedsCmpxchg = false; bool NeedsCmpxchg = false;
......
...@@ -97,8 +97,12 @@ protected: ...@@ -97,8 +97,12 @@ protected:
virtual void doAddressOptLoad(); virtual void doAddressOptLoad();
virtual void doAddressOptStore(); virtual void doAddressOptStore();
// Naive lowering of cmpxchg.
void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected, void lowerAtomicCmpxchg(Variable *DestPrev, Operand *Ptr, Operand *Expected,
Operand *Desired); Operand *Desired);
// Attempt a more optimized lowering of cmpxchg. Returns true if optimized.
bool tryOptimizedCmpxchgCmpBr(Variable *DestPrev, Operand *Ptr,
Operand *Expected, Operand *Desired);
void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr, void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
Operand *Val); Operand *Val);
void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal, void lowerCountZeros(bool Cttz, Type Ty, Variable *Dest, Operand *FirstVal,
......
; This tests the optimization of atomic cmpxchg w/ following cmp + branches.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=O2
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s --check-prefix=OM1
; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice -Om1 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
; RUN: %llvm2ice --verbose none %s | FileCheck --check-prefix=ERRORS %s
; RUN: %llvm2iceinsts %s | %szdiff %s | FileCheck --check-prefix=DUMP %s
; RUN: %llvm2iceinsts --pnacl %s | %szdiff %s \
; RUN: | FileCheck --check-prefix=DUMP %s
declare i32 @llvm.nacl.atomic.cmpxchg.i32(i32*, i32, i32, i32, i32)

; Test that a cmpxchg followed by icmp eq and branch can be optimized to
; reuse the flags set by the cmpxchg instruction itself.
; This is only expected to work w/ O2, based on lightweight liveness.
; (Or if we had other means to detect the only use).

declare void @use_value(i32);

; Typical compare-and-swap retry loop: keep retrying until the value read
; back equals the expected value.  The "icmp eq" feeding the loop-exit
; branch is exactly the pattern the peephole fuses away.
define i32 @test_atomic_cmpxchg_loop(i32 %iptr, i32 %expected, i32 %desired) {
entry:
  br label %loop

loop:
  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
  %succeeded_first_try = phi i32 [ 1, %entry ], [ 2, %loop ]
  %ptr = inttoptr i32 %iptr to i32*
  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
                                                i32 %desired, i32 6, i32 6)
  %success = icmp eq i32 %expected_loop, %old
  br i1 %success, label %done, label %loop

done:
  call void @use_value(i32 %old)
  ret i32 %succeeded_first_try
}
; O2-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}loop
; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2-NOT: cmp
; Make sure the phi assignment for succeeded_first_try is still there.
; O2: mov {{.*}}, 2
; O2-NOT: cmp
; O2: je
; O2-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}done
; Make sure the call isn't accidentally deleted.
; O2: call
;
; Check that the unopt version does have a cmp
; OM1-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}loop
; OM1: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; OM1: cmp
; OM1: je
; OM1-LABEL: .Ltest_atomic_cmpxchg_loop{{.*}}done
; OM1: call
; Still works if the compare operands are flipped.
; Same retry loop, but with "icmp eq %old, %expected_loop" instead of
; "icmp eq %expected_loop, %old"; equality is symmetric, so the fusion
; should still fire.
define i32 @test_atomic_cmpxchg_loop2(i32 %iptr, i32 %expected, i32 %desired) {
entry:
  br label %loop

loop:
  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
  %ptr = inttoptr i32 %iptr to i32*
  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
                                                i32 %desired, i32 6, i32 6)
  %success = icmp eq i32 %old, %expected_loop
  br i1 %success, label %done, label %loop

done:
  ret i32 %old
}
; O2-LABEL: .Ltest_atomic_cmpxchg_loop2{{.*}}loop
; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2-NOT: cmp
; O2: je
; Still works if the compare operands are constants.
; Both the cmpxchg's expected operand and the icmp operand are the
; constant 0 here, rather than a variable.
define i32 @test_atomic_cmpxchg_loop_const(i32 %iptr, i32 %desired) {
entry:
  br label %loop

loop:
  %succeeded_first_try = phi i32 [ 1, %entry ], [ 0, %loop ]
  %ptr = inttoptr i32 %iptr to i32*
  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 0,
                                                i32 %desired, i32 6, i32 6)
  %success = icmp eq i32 %old, 0
  br i1 %success, label %done, label %loop

done:
  ret i32 %succeeded_first_try
}
; O2-LABEL: .Ltest_atomic_cmpxchg_loop_const{{.*}}loop
; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2-NOT: cmp
; O2: je
; This is a case where the flags cannot be reused (compare is for some
; other condition).
; "icmp sgt" is not an equality test, so the ZF set by the cmpxchg does
; not encode the branch condition; an explicit cmp must remain.
define i32 @test_atomic_cmpxchg_no_opt(i32 %iptr, i32 %expected, i32 %desired) {
entry:
  br label %loop

loop:
  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
  %ptr = inttoptr i32 %iptr to i32*
  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
                                                i32 %desired, i32 6, i32 6)
  %success = icmp sgt i32 %old, %expected
  br i1 %success, label %done, label %loop

done:
  ret i32 %old
}
; O2-LABEL: .Ltest_atomic_cmpxchg_no_opt{{.*}}loop
; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2: mov {{.*}}
; O2: cmp
; O2: jg
; Another case where the flags cannot be reused (the comparison result
; is used somewhere else).
; %success is also consumed by the zext in %done, so it is not dead after
; the branch and the icmp must still be materialized.
define i32 @test_atomic_cmpxchg_no_opt2(i32 %iptr, i32 %expected, i32 %desired) {
entry:
  br label %loop

loop:
  %expected_loop = phi i32 [ %expected, %entry ], [ %old, %loop ]
  %ptr = inttoptr i32 %iptr to i32*
  %old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %expected_loop,
                                                i32 %desired, i32 6, i32 6)
  %success = icmp eq i32 %old, %expected
  br i1 %success, label %done, label %loop

done:
  %r = zext i1 %success to i32
  ret i32 %r
}
; O2-LABEL: .Ltest_atomic_cmpxchg_no_opt2{{.*}}loop
; O2: lock cmpxchg dword ptr [e{{[^a].}}], e{{[^a]}}
; O2: mov {{.*}}
; O2: cmp
; O2: je

; ERRORS-NOT: ICE translation error
; DUMP-NOT: SZ
...@@ -2,7 +2,7 @@ ...@@ -2,7 +2,7 @@
; size allowed. ; size allowed.
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2REM ; RUN: %llvm2ice -O2 --verbose none %s | FileCheck %s --check-prefix=CHECKO2
; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s ; RUN: %llvm2ice -Om1 --verbose none %s | FileCheck %s
; RUN: %llvm2ice -O2 --verbose none %s \ ; RUN: %llvm2ice -O2 --verbose none %s \
; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj ; RUN: | llvm-mc -arch=x86 -x86-asm-syntax=intel -filetype=obj
...@@ -815,23 +815,6 @@ entry: ...@@ -815,23 +815,6 @@ entry:
; CHECK-DAG: mov ebx ; CHECK-DAG: mov ebx
; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}] ; CHECK: lock cmpxchg8b qword ptr [e{{.[^x]}}]
define i32 @test_atomic_cmpxchg_32_loop(i32 %iptr, i32 %expected, i32 %desired) {
entry:
br label %loop
loop:
%cmp = phi i32 [ %expected, %entry ], [ %old, %loop ]
%ptr = inttoptr i32 %iptr to i32*
%old = call i32 @llvm.nacl.atomic.cmpxchg.i32(i32* %ptr, i32 %cmp,
i32 %desired, i32 6, i32 6)
%success = icmp eq i32 %cmp, %old
br i1 %success, label %done, label %loop
done:
ret i32 %old
}
; CHECK-LABEL: test_atomic_cmpxchg_32_loop
;;;; Fence and is-lock-free. ;;;; Fence and is-lock-free.
define void @test_atomic_fence() { define void @test_atomic_fence() {
...@@ -879,9 +862,9 @@ entry: ...@@ -879,9 +862,9 @@ entry:
; CHECK-LABEL: test_atomic_is_lock_free_ignored ; CHECK-LABEL: test_atomic_is_lock_free_ignored
; CHECK: mov {{.*}}, 0 ; CHECK: mov {{.*}}, 0
; This can get optimized out, because it's side-effect-free. ; This can get optimized out, because it's side-effect-free.
; CHECKO2REM-LABEL: test_atomic_is_lock_free_ignored ; CHECKO2-LABEL: test_atomic_is_lock_free_ignored
; CHECKO2REM-NOT: mov {{.*}}, 1 ; CHECKO2-NOT: mov {{.*}}, 1
; CHECKO2REM: mov {{.*}}, 0 ; CHECKO2: mov {{.*}}, 0
; TODO(jvoung): at some point we can take advantage of the ; TODO(jvoung): at some point we can take advantage of the
; fact that nacl.atomic.is.lock.free will resolve to a constant ; fact that nacl.atomic.is.lock.free will resolve to a constant
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment