Commit 5c87542a by David Sehr

Optimize 64-bit compares with zero

Comparisons with zero can be done with no branches in most cases and with simpler sequences of operations.

BUG=
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1406593003 .
parent e3455053
...@@ -24,7 +24,11 @@ ...@@ -24,7 +24,11 @@
bool icmp##cmp(uint64 a, uint64 b) { return a op b; } \ bool icmp##cmp(uint64 a, uint64 b) { return a op b; } \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; } \ v4ui32 icmp##cmp(v4ui32 a, v4ui32 b) { return a op b; } \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; } \ v8ui16 icmp##cmp(v8ui16 a, v8ui16 b) { return a op b; } \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; } v16ui8 icmp##cmp(v16ui8 a, v16ui8 b) { return a op b; } \
bool icmp_zero##cmp(uint8_t a) { return a op 0; } \
bool icmp_zero##cmp(uint16_t a) { return a op 0; } \
bool icmp_zero##cmp(uint32_t a) { return a op 0; } \
bool icmp_zero##cmp(uint64 a) { return a op 0; }
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
...@@ -35,6 +39,10 @@ ICMP_U_TABLE ...@@ -35,6 +39,10 @@ ICMP_U_TABLE
bool icmp##cmp(int64 a, int64 b) { return a op b; } \ bool icmp##cmp(int64 a, int64 b) { return a op b; } \
v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; } \ v4si32 icmp##cmp(v4si32 a, v4si32 b) { return a op b; } \
v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; } \ v8si16 icmp##cmp(v8si16 a, v8si16 b) { return a op b; } \
v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; } v16si8 icmp##cmp(v16si8 a, v16si8 b) { return a op b; } \
bool icmp_zero##cmp(myint8_t a) { return a op 0; } \
bool icmp_zero##cmp(int16_t a) { return a op 0; } \
bool icmp_zero##cmp(int32_t a) { return a op 0; } \
bool icmp_zero##cmp(int64 a) { return a op 0; }
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
...@@ -24,7 +24,11 @@ ...@@ -24,7 +24,11 @@
bool icmp##cmp(uint64 a, uint64 b); \ bool icmp##cmp(uint64 a, uint64 b); \
v4ui32 icmp##cmp(v4ui32 a, v4ui32 b); \ v4ui32 icmp##cmp(v4ui32 a, v4ui32 b); \
v8ui16 icmp##cmp(v8ui16 a, v8ui16 b); \ v8ui16 icmp##cmp(v8ui16 a, v8ui16 b); \
v16ui8 icmp##cmp(v16ui8 a, v16ui8 b); v16ui8 icmp##cmp(v16ui8 a, v16ui8 b); \
bool icmp_zero##cmp(uint8_t a); \
bool icmp_zero##cmp(uint16_t a); \
bool icmp_zero##cmp(uint32_t a); \
bool icmp_zero##cmp(uint64 a);
ICMP_U_TABLE ICMP_U_TABLE
#undef X #undef X
...@@ -35,7 +39,11 @@ ICMP_U_TABLE ...@@ -35,7 +39,11 @@ ICMP_U_TABLE
bool icmp##cmp(int64 a, int64 b); \ bool icmp##cmp(int64 a, int64 b); \
v4si32 icmp##cmp(v4si32 a, v4si32 b); \ v4si32 icmp##cmp(v4si32 a, v4si32 b); \
v8si16 icmp##cmp(v8si16 a, v8si16 b); \ v8si16 icmp##cmp(v8si16 a, v8si16 b); \
v16si8 icmp##cmp(v16si8 a, v16si8 b); v16si8 icmp##cmp(v16si8 a, v16si8 b); \
bool icmp_zero##cmp(myint8_t a); \
bool icmp_zero##cmp(int16_t a); \
bool icmp_zero##cmp(int32_t a); \
bool icmp_zero##cmp(int64 a);
ICMP_S_TABLE ICMP_S_TABLE
#undef X #undef X
......
...@@ -121,6 +121,78 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) { ...@@ -121,6 +121,78 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
} }
} }
/// Runs one compare-with-zero test case: calls the Subzero-compiled and the
/// llc-compiled variant of the same function on Value, bumps the counters,
/// and reports a mismatch on stdout.
///
/// \param Name        comparison name used in the failure report.
/// \param FuncSz      Subzero-compiled function under test.
/// \param FuncLlc     llc-compiled reference function.
/// \param Value       operand that is compared against zero.
/// \param TotalTests  incremented once per call.
/// \param Passes      incremented when both results agree.
/// \param Failures    incremented when the results differ.
template <typename TypeUnsigned>
static void checkIcmpZeroResult(const char *Name,
                                bool (*FuncSz)(TypeUnsigned),
                                bool (*FuncLlc)(TypeUnsigned),
                                TypeUnsigned Value, size_t &TotalTests,
                                size_t &Passes, size_t &Failures) {
  ++TotalTests;
  bool ResultSz = FuncSz(Value);
  bool ResultLlc = FuncLlc(Value);
  if (ResultSz == ResultLlc) {
    ++Passes;
    return;
  }
  ++Failures;
  // Unary plus promotes 8-bit operands to int so that they are printed as
  // numbers; streaming a uint8_t directly would emit a raw character.
  std::cout << "icmp_zero" << Name << (CHAR_BIT * sizeof(TypeUnsigned)) << "("
            << +Value << "): sz=" << ResultSz << " llc=" << ResultLlc << "\n";
}

/// Cross-checks every icmp_zero##cmp function (unsigned and signed tables)
/// between the llc build and the Subzero build for one integer width.
///
/// The signed variants are stored through the unsigned function-pointer type
/// so that a single table can drive both; TypeSigned is only used for that
/// intermediate cast.
///
/// \param TotalTests  running count of executed test cases.
/// \param Passes      running count of matching results.
/// \param Failures    running count of mismatching results.
template <typename TypeUnsigned, typename TypeSigned>
void testsIntWithZero(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  typedef bool (*FuncTypeUnsigned)(TypeUnsigned);
  typedef bool (*FuncTypeSigned)(TypeSigned);
  static struct {
    const char *Name;
    FuncTypeUnsigned FuncLlc;
    FuncTypeUnsigned FuncSz;
  } Funcs[] = {
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)icmp_zero##cmp,                                \
        (FuncTypeUnsigned)Subzero_::icmp_zero##cmp                             \
  }                                                                            \
  ,
      ICMP_U_TABLE
#undef X
#define X(cmp, op)                                                             \
  {                                                                            \
    STR(cmp), (FuncTypeUnsigned)(FuncTypeSigned)icmp_zero##cmp,                \
        (FuncTypeUnsigned)(FuncTypeSigned)Subzero_::icmp_zero##cmp             \
  }                                                                            \
  ,
      ICMP_S_TABLE
#undef X
  };
  const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);

  // Both branches are compiled for every instantiation (this is a plain
  // runtime `if`); only the branch matching the operand width executes.
  if (sizeof(TypeUnsigned) <= sizeof(uint32_t)) {
    // This is the "normal" version of the loop nest, for 32-bit or
    // narrower types.
    for (size_t f = 0; f < NumFuncs; ++f) {
      for (size_t i = 0; i < NumValues; ++i) {
        TypeUnsigned Value = Values[i];
        checkIcmpZeroResult<TypeUnsigned>(Funcs[f].Name, Funcs[f].FuncSz,
                                          Funcs[f].FuncLlc, Value, TotalTests,
                                          Passes, Failures);
      }
    }
  } else {
    // This is the 64-bit version. Test values are synthesized from
    // the 32-bit values in Values[].
    for (size_t f = 0; f < NumFuncs; ++f) {
      for (size_t iLo = 0; iLo < NumValues; ++iLo) {
        for (size_t iHi = 0; iHi < NumValues; ++iHi) {
          TypeUnsigned Value =
              (((TypeUnsigned)Values[iHi]) << 32) + Values[iLo];
          checkIcmpZeroResult<TypeUnsigned>(Funcs[f].Name, Funcs[f].FuncSz,
                                            Funcs[f].FuncLlc, Value,
                                            TotalTests, Passes, Failures);
        }
      }
    }
  }
}
const static size_t MaxTestsPerFunc = 100000; const static size_t MaxTestsPerFunc = 100000;
template <typename TypeUnsignedLabel, typename TypeSignedLabel> template <typename TypeUnsignedLabel, typename TypeSignedLabel>
...@@ -287,6 +359,10 @@ int main(int argc, char *argv[]) { ...@@ -287,6 +359,10 @@ int main(int argc, char *argv[]) {
testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures); testsInt<uint16_t, int16_t>(TotalTests, Passes, Failures);
testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures); testsInt<uint32_t, int32_t>(TotalTests, Passes, Failures);
testsInt<uint64, int64>(TotalTests, Passes, Failures); testsInt<uint64, int64>(TotalTests, Passes, Failures);
testsIntWithZero<uint8_t, myint8_t>(TotalTests, Passes, Failures);
testsIntWithZero<uint16_t, int16_t>(TotalTests, Passes, Failures);
testsIntWithZero<uint32_t, int32_t>(TotalTests, Passes, Failures);
testsIntWithZero<uint64, int64>(TotalTests, Passes, Failures);
testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures); testsVecInt<v4ui32, v4si32>(TotalTests, Passes, Failures);
testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures); testsVecInt<v8ui16, v8si16>(TotalTests, Passes, Failures);
testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures); testsVecInt<v16ui8, v16si8>(TotalTests, Passes, Failures);
......
...@@ -2092,7 +2092,6 @@ void InstX86Test<Machine>::emitIAS(const Cfg *Func) const { ...@@ -2092,7 +2092,6 @@ void InstX86Test<Machine>::emitIAS(const Cfg *Func) const {
return; return;
} }
} }
llvm_unreachable("Nothing actually generates this so it's untested");
emitIASAsAddrOpTyGPR<Machine>(Func, Ty, Src0, Src1, AddrEmitter); emitIASAsAddrOpTyGPR<Machine>(Func, Ty, Src0, Src1, AddrEmitter);
} }
......
...@@ -2648,6 +2648,14 @@ void TargetX86Base<Machine>::lowerFcmp(const InstFcmp *Inst) { ...@@ -2648,6 +2648,14 @@ void TargetX86Base<Machine>::lowerFcmp(const InstFcmp *Inst) {
} }
} }
/// Returns true when Opnd is a ConstantInteger64 or ConstantInteger32 whose
/// value is zero; any other operand yields false.
inline bool isZero(const Operand *Opnd) {
  const auto *Wide = llvm::dyn_cast<ConstantInteger64>(Opnd);
  if (Wide != nullptr) {
    return Wide->getValue() == 0;
  }
  const auto *Narrow = llvm::dyn_cast<ConstantInteger32>(Opnd);
  return Narrow != nullptr && Narrow->getValue() == 0;
}
template <class Machine> template <class Machine>
void TargetX86Base<Machine>::lowerIcmp(const InstIcmp *Inst) { void TargetX86Base<Machine>::lowerIcmp(const InstIcmp *Inst) {
Operand *Src0 = legalize(Inst->getSrc(0)); Operand *Src0 = legalize(Inst->getSrc(0));
...@@ -2769,6 +2777,18 @@ void TargetX86Base<Machine>::lowerIcmp(const InstIcmp *Inst) { ...@@ -2769,6 +2777,18 @@ void TargetX86Base<Machine>::lowerIcmp(const InstIcmp *Inst) {
} }
// cmp b, c // cmp b, c
if (isZero(Src1)) {
switch (Inst->getCondition()) {
default:
break;
case InstIcmp::Uge:
_mov(Dest, Ctx->getConstantInt(Dest->getType(), 1));
return;
case InstIcmp::Ult:
_mov(Dest, Ctx->getConstantInt(Dest->getType(), 0));
return;
}
}
Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1); Operand *Src0RM = legalizeSrc0ForCmp(Src0, Src1);
_cmp(Src0RM, Src1); _cmp(Src0RM, Src1);
_setcc(Dest, Traits::getIcmp32Mapping(Inst->getCondition())); _setcc(Dest, Traits::getIcmp32Mapping(Inst->getCondition()));
...@@ -2785,12 +2805,88 @@ TargetX86Base<Machine>::lowerIcmp64(const InstIcmp *Inst) { ...@@ -2785,12 +2805,88 @@ TargetX86Base<Machine>::lowerIcmp64(const InstIcmp *Inst) {
InstIcmp::ICond Condition = Inst->getCondition(); InstIcmp::ICond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition); size_t Index = static_cast<size_t>(Condition);
assert(Index < Traits::TableIcmp64Size); assert(Index < Traits::TableIcmp64Size);
Operand *Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
Operand *Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
Constant *Zero = Ctx->getConstantZero(IceType_i32); Constant *Zero = Ctx->getConstantZero(IceType_i32);
Constant *One = Ctx->getConstantInt32(1); Constant *One = Ctx->getConstantInt32(1);
Operand *Src0LoRM = nullptr;
Operand *Src0HiRM = nullptr;
// Legalize the portions of Src0 that are going to be needed.
if (isZero(Src1)) {
switch (Condition) {
default:
llvm_unreachable("unexpected condition");
break;
// These two are not optimized, so we fall through to the general case,
// which needs the upper and lower halves legalized.
case InstIcmp::Sgt:
case InstIcmp::Sle:
// These four compare after performing an "or" of the high and low half, so they
// need the upper and lower halves legalized.
case InstIcmp::Eq:
case InstIcmp::Ule:
case InstIcmp::Ne:
case InstIcmp::Ugt:
Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
// These two test only the high half's sign bit, so they need only
// the upper half legalized.
case InstIcmp::Sge:
case InstIcmp::Slt:
Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
break;
// These two move constants and hence need no legalization.
case InstIcmp::Uge:
case InstIcmp::Ult:
break;
}
} else {
Src0LoRM = legalize(loOperand(Src0), Legal_Reg | Legal_Mem);
Src0HiRM = legalize(hiOperand(Src0), Legal_Reg | Legal_Mem);
}
// Optimize comparisons with zero.
if (isZero(Src1)) {
Constant *SignMask = Ctx->getConstantInt32(0x80000000);
Variable *Temp = nullptr;
switch (Condition) {
default:
llvm_unreachable("unexpected condition");
break;
case InstIcmp::Eq:
case InstIcmp::Ule:
_mov(Temp, Src0LoRM);
_or(Temp, Src0HiRM);
Context.insert(InstFakeUse::create(Func, Temp));
_setcc(Dest, Traits::Cond::Br_e);
return;
case InstIcmp::Ne:
case InstIcmp::Ugt:
_mov(Temp, Src0LoRM);
_or(Temp, Src0HiRM);
Context.insert(InstFakeUse::create(Func, Temp));
_setcc(Dest, Traits::Cond::Br_ne);
return;
case InstIcmp::Uge:
_mov(Dest, Ctx->getConstantInt(Dest->getType(), 1));
return;
case InstIcmp::Ult:
_mov(Dest, Ctx->getConstantInt(Dest->getType(), 0));
return;
case InstIcmp::Sgt:
break;
case InstIcmp::Sge:
_test(Src0HiRM, SignMask);
_setcc(Dest, Traits::Cond::Br_e);
return;
case InstIcmp::Slt:
_test(Src0HiRM, SignMask);
_setcc(Dest, Traits::Cond::Br_ne);
return;
case InstIcmp::Sle:
break;
}
}
// Handle general compares.
Operand *Src1LoRI = legalize(loOperand(Src1), Legal_Reg | Legal_Imm);
Operand *Src1HiRI = legalize(hiOperand(Src1), Legal_Reg | Legal_Imm);
typename Traits::Insts::Label *LabelFalse = typename Traits::Insts::Label *LabelFalse =
Traits::Insts::Label::create(Func, this); Traits::Insts::Label::create(Func, this);
typename Traits::Insts::Label *LabelTrue = typename Traits::Insts::Label *LabelTrue =
......
...@@ -166,7 +166,7 @@ next: ...@@ -166,7 +166,7 @@ next:
call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6) call void @llvm.nacl.atomic.store.i32(i32 %val, i32* %ptr, i32 6)
br label %next2 br label %next2
next2: next2:
%cmp = icmp ult i64 %val64, 0 %cmp = icmp ult i64 %val64, 1
br i1 %cmp, label %next, label %next2 br i1 %cmp, label %next, label %next2
} }
; CHECK-LABEL: test_local_forward_then_back ; CHECK-LABEL: test_local_forward_then_back
......
; Simple test of non-fused compare/branch, covering integer compares against
; zero for i64, i32, i16 and i8 operands.
; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 \
; RUN: -allow-externally-defined-symbols | FileCheck %s
; RUN: %p2i --filetype=obj --disassemble -i %s --args -Om1 \
; RUN: -allow-externally-defined-symbols | FileCheck %s
; Branches on `icmp eq` of the i64 constant 123 against zero; @func is called
; only on the true edge.
define internal void @icmpEqZero64() {
entry:
%cmp = icmp eq i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpEqZero64
; CHECK: or
; CHECK-NEXT: sete
; Branches on `icmp ne` of the i64 constant 123 against zero; @func is called
; only on the true edge.
define internal void @icmpNeZero64() {
entry:
%cmp = icmp ne i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpNeZero64
; CHECK: or
; CHECK-NEXT: setne
; Branches on the signed `icmp sge` of the i64 constant 123 against zero;
; @func is called only on the true edge.
define internal void @icmpSgeZero64() {
entry:
%cmp = icmp sge i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpSgeZero64
; CHECK: test eax,0x80000000
; CHECK-NEXT: sete
; Branches on the signed `icmp slt` of the i64 constant 123 against zero;
; @func is called only on the true edge.
define internal void @icmpSltZero64() {
entry:
%cmp = icmp slt i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpSltZero64
; CHECK: test eax,0x80000000
; CHECK-NEXT: setne
; Branches on the unsigned `icmp ult` of the i64 constant 123 against zero
; (an unsigned value is never below zero); @func is called only on the true
; edge.
define internal void @icmpUltZero64() {
entry:
%cmp = icmp ult i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUltZero64
; CHECK: mov [[RESULT:.*]],0x0
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp uge` of the i64 constant 123 against zero
; (an unsigned value is always at least zero); @func is called only on the
; true edge.
define internal void @icmpUgeZero64() {
entry:
%cmp = icmp uge i64 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
call void @func()
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUgeZero64
; CHECK: mov [[RESULT:.*]],0x1
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp ult` of the i32 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUltZero32() {
entry:
%cmp = icmp ult i32 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUltZero32
; CHECK: mov [[RESULT:.*]],0x0
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp uge` of the i32 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUgeZero32() {
entry:
%cmp = icmp uge i32 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUgeZero32
; CHECK: mov [[RESULT:.*]],0x1
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp ult` of the i16 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUltZero16() {
entry:
%cmp = icmp ult i16 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUltZero16
; CHECK: mov [[RESULT:.*]],0x0
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp uge` of the i16 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUgeZero16() {
entry:
%cmp = icmp uge i16 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUgeZero16
; CHECK: mov [[RESULT:.*]],0x1
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp ult` of the i8 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUltZero8() {
entry:
%cmp = icmp ult i8 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUltZero8
; CHECK: mov [[RESULT:.*]],0x0
; CHECK-NEXT: cmp [[RESULT]],0x0
; Branches on the unsigned `icmp uge` of the i8 constant 123 against zero.
; On the true edge the i1 result is zero-extended to i32 and passed to @use.
define internal void @icmpUgeZero8() {
entry:
%cmp = icmp uge i8 123, 0
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%cmp_ext = zext i1 %cmp to i32
call void @use(i32 %cmp_ext)
br label %if.end
if.end: ; preds = %if.then, %entry
ret void
}
; The following checks are not strictly necessary since one of the RUN
; lines actually runs the output through the assembler.
; CHECK-LABEL: icmpUgeZero8
; CHECK: mov [[RESULT:.*]],0x1
; CHECK-NEXT: cmp [[RESULT]],0x0
declare void @func()
declare void @use(i32)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment