Commit 55f2e6d3 by David Sehr

Optimize 64-bit shifts by constants for x86-32

Hopefully improves perf in fpclassifyd in ammp spec test. BUG=none R=stichnot@chromium.org Review URL: https://codereview.chromium.org/1351133003 .
parent 43632b95
......@@ -58,6 +58,7 @@
#define INT_VALUE_ARRAY \
{ 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
0x1e, 0x1f, 0x20, 0x21, 0x3e, 0x3f, 0x40, 0x41, \
0x7e, 0x7f, 0x80, 0x81, \
0xfe, 0xff, 0x100, 0x101, \
0x7ffe, 0x7fff, 0x8000, 0x8001, \
......
......@@ -2103,7 +2103,7 @@ template <class Machine> class InstX86Shld final : public InstX86Base<Machine> {
public:
static InstX86Shld *create(Cfg *Func, Variable *Dest, Variable *Source1,
Variable *Source2) {
Operand *Source2) {
return new (Func->allocate<InstX86Shld>())
InstX86Shld(Func, Dest, Source1, Source2);
}
......@@ -2115,7 +2115,7 @@ public:
}
private:
InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Variable *Source2);
InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
};
/// Shrd instruction - shift across a pair of operands.
......@@ -2126,7 +2126,7 @@ template <class Machine> class InstX86Shrd final : public InstX86Base<Machine> {
public:
static InstX86Shrd *create(Cfg *Func, Variable *Dest, Variable *Source1,
Variable *Source2) {
Operand *Source2) {
return new (Func->allocate<InstX86Shrd>())
InstX86Shrd(Func, Dest, Source1, Source2);
}
......@@ -2138,7 +2138,7 @@ public:
}
private:
InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Variable *Source2);
InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1, Operand *Source2);
};
/// Conditional move instruction.
......
......@@ -74,7 +74,7 @@ InstX86Mul<Machine>::InstX86Mul(Cfg *Func, Variable *Dest, Variable *Source1,
template <class Machine>
InstX86Shld<Machine>::InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1,
Variable *Source2)
Operand *Source2)
: InstX86Base<Machine>(Func, InstX86Base<Machine>::Shld, 3, Dest) {
this->addSource(Dest);
this->addSource(Source1);
......@@ -83,7 +83,7 @@ InstX86Shld<Machine>::InstX86Shld(Cfg *Func, Variable *Dest, Variable *Source1,
template <class Machine>
InstX86Shrd<Machine>::InstX86Shrd(Cfg *Func, Variable *Dest, Variable *Source1,
Variable *Source2)
Operand *Source2)
: InstX86Base<Machine>(Func, InstX86Base<Machine>::Shrd, 3, Dest) {
this->addSource(Dest);
this->addSource(Source1);
......
......@@ -568,13 +568,13 @@ protected:
/// Builds a Shl instruction from Dest and Src0 and inserts it into Context.
void _shl(Variable *Dest, Operand *Src0) {
  auto *ShlInst = Traits::Insts::Shl::create(Func, Dest, Src0);
  Context.insert(ShlInst);
}
void _shld(Variable *Dest, Variable *Src0, Variable *Src1) {
/// Builds a Shld (shift across a pair of operands) instruction and inserts it
/// into Context. Src1 is a general Operand rather than a Variable, so callers
/// may pass either a variable or a constant as the last source.
void _shld(Variable *Dest, Variable *Src0, Operand *Src1) {
  auto *ShldInst = Traits::Insts::Shld::create(Func, Dest, Src0, Src1);
  Context.insert(ShldInst);
}
/// Builds a Shr instruction from Dest and Src0 and inserts it into Context.
void _shr(Variable *Dest, Operand *Src0) {
  auto *ShrInst = Traits::Insts::Shr::create(Func, Dest, Src0);
  Context.insert(ShrInst);
}
void _shrd(Variable *Dest, Variable *Src0, Variable *Src1) {
/// Builds a Shrd (shift across a pair of operands) instruction and inserts it
/// into Context. Src1 is a general Operand rather than a Variable, so callers
/// may pass either a variable or a constant as the last source.
void _shrd(Variable *Dest, Variable *Src0, Operand *Src1) {
  auto *ShrdInst = Traits::Insts::Shrd::create(Func, Dest, Src0, Src1);
  Context.insert(ShrdInst);
}
void _shufps(Variable *Dest, Operand *Src0, Operand *Src1) {
......
......@@ -1185,114 +1185,237 @@ void TargetX86Base<Machine>::lowerArithmetic(const InstArithmetic *Inst) {
} break;
case InstArithmetic::Shl: {
// TODO: Refactor the similarities between Shl, Lshr, and Ashr.
// gcc does the following:
// a=b<<c ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t3 = shld t3, t2, t1
// t2 = shl t2, t1
// test t1, 0x20
// je L1
// use(t3)
// t3 = t2
// t2 = 0
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shld(T_3, T_2, T_1);
_shl(T_2, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so we need the _mov_nonkillable variant to avoid
// liveness problems.
_mov_nonkillable(T_3, T_2);
_mov_nonkillable(T_2, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
if (const auto *ConstantShiftAmount =
llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
uint32_t ShiftAmount = ConstantShiftAmount->getValue();
if (ShiftAmount > 32) {
// a=b<<c ==>
// t2 = b.lo
// t2 = shl t2, ShiftAmount-32
// a.hi = t2
// a.lo = 0
_mov(T_2, Src0Lo);
_shl(T_2, Ctx->getConstantInt32(ShiftAmount - 32));
_mov(DestHi, T_2);
_mov(DestLo, Zero);
} else if (ShiftAmount == 32) {
// a=b<<c ==>
// t2 = b.lo
// a.hi = t2
// a.lo = 0
_mov(T_2, Src0Lo);
_mov(DestHi, T_2);
_mov(DestLo, Zero);
} else {
// a=b<<c ==>
// t2 = b.lo
// t3 = b.hi
// t3 = shld t3, t2, ShiftAmount
// t2 = shl t2, ShiftAmount
// a.lo = t2
// a.hi = t3
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shld(T_3, T_2, Ctx->getConstantInt32(ShiftAmount));
_shl(T_2, Ctx->getConstantInt32(ShiftAmount));
// Move T_2 first to reduce register pressure.
_mov(DestLo, T_2);
_mov(DestHi, T_3);
}
} else {
// a=b<<c ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t3 = shld t3, t2, t1
// t2 = shl t2, t1
// test t1, 0x20
// je L1
// use(t3)
// t3 = t2
// t2 = 0
// L1:
// a.lo = t2
// a.hi = t3
Constant *BitTest = Ctx->getConstantInt32(0x20);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shld(T_3, T_2, T_1);
_shl(T_2, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so we need the _mov_nonkillable variant to avoid
// liveness problems.
_mov_nonkillable(T_3, T_2);
_mov_nonkillable(T_2, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
}
} break;
case InstArithmetic::Lshr: {
// a=b>>c (unsigned) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = shr t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = 0
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_shr(T_3, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so we need the _mov_nonkillable variant to avoid
// liveness problems.
_mov_nonkillable(T_2, T_3);
_mov_nonkillable(T_3, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
if (const auto *ConstantShiftAmount =
llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
uint32_t ShiftAmount = ConstantShiftAmount->getValue();
if (ShiftAmount > 32) {
// a=b>>c (unsigned) ==>
// t3 = b.hi
// t3 = shr t3, ShiftAmount-32
// a.lo = t3
// a.hi = 0
_mov(T_3, Src0Hi);
_shr(T_3, Ctx->getConstantInt32(ShiftAmount - 32));
_mov(DestLo, T_3);
_mov(DestHi, Zero);
} else if (ShiftAmount == 32) {
// a=b>>c (unsigned) ==>
// t3 = b.hi
// a.lo = t3
// a.hi = 0
_mov(T_3, Src0Hi);
_mov(DestLo, T_3);
_mov(DestHi, Zero);
} else {
// a=b>>c (unsigned) ==>
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, ShiftAmount
// t3 = shr t3, ShiftAmount
// a.lo = t2
// a.hi = t3
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, Ctx->getConstantInt32(ShiftAmount));
_shr(T_3, Ctx->getConstantInt32(ShiftAmount));
// Move T_3 first to reduce register pressure.
_mov(DestHi, T_3);
_mov(DestLo, T_2);
}
} else {
// a=b>>c (unsigned) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = shr t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = 0
// L1:
// a.lo = t2
// a.hi = t3
Constant *BitTest = Ctx->getConstantInt32(0x20);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_shr(T_3, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so we need the _mov_nonkillable variant to avoid
// liveness problems.
_mov_nonkillable(T_2, T_3);
_mov_nonkillable(T_3, Zero);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
}
} break;
case InstArithmetic::Ashr: {
// a=b>>c (signed) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = sar t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = sar t3, 0x1f
// L1:
// a.lo = t2
// a.hi = t3
Variable *T_1 = nullptr, *T_2 = nullptr, *T_3 = nullptr;
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *SignExtend = Ctx->getConstantInt32(0x1f);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_sar(T_3, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so T_2 needs the _mov_nonkillable variant to avoid
// liveness problems. T_3 doesn't need special treatment because it is
// reassigned via _sar instead of _mov.
_mov_nonkillable(T_2, T_3);
_sar(T_3, SignExtend);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
if (const auto *ConstantShiftAmount =
llvm::dyn_cast<ConstantInteger32>(Src1Lo)) {
uint32_t ShiftAmount = ConstantShiftAmount->getValue();
if (ShiftAmount > 32) {
// a=b>>c (signed) ==>
// t2 = b.hi
// t3 = b.hi
// t3 = sar t3, 0x1f
// t2 = shrd t2, t3, ShiftAmount-32
// a.lo = t2
// a.hi = t3
_mov(T_2, Src0Hi);
_mov(T_3, Src0Hi);
_sar(T_3, Ctx->getConstantInt32(0x1f));
_shrd(T_2, T_3, Ctx->getConstantInt32(ShiftAmount - 32));
_mov(DestLo, T_2);
_mov(DestHi, T_3);
} else if (ShiftAmount == 32) {
// a=b>>c (signed) ==>
// t2 = b.hi
// a.lo = t2
// t3 = b.hi
// t3 = sar t3, 0x1f
// a.hi = t3
_mov(T_2, Src0Hi);
_mov(DestLo, T_2);
_mov(T_3, Src0Hi);
_sar(T_3, Ctx->getConstantInt32(0x1f));
_mov(DestHi, T_3);
} else {
// a=b>>c (signed) ==>
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, ShiftAmount
// t3 = sar t3, ShiftAmount
// a.lo = t2
// a.hi = t3
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, Ctx->getConstantInt32(ShiftAmount));
_sar(T_3, Ctx->getConstantInt32(ShiftAmount));
_mov(DestLo, T_2);
_mov(DestHi, T_3);
}
} else {
// a=b>>c (signed) ==>
// t1:ecx = c.lo & 0xff
// t2 = b.lo
// t3 = b.hi
// t2 = shrd t2, t3, t1
// t3 = sar t3, t1
// test t1, 0x20
// je L1
// use(t2)
// t2 = t3
// t3 = sar t3, 0x1f
// L1:
// a.lo = t2
// a.hi = t3
Constant *BitTest = Ctx->getConstantInt32(0x20);
Constant *SignExtend = Ctx->getConstantInt32(0x1f);
typename Traits::Insts::Label *Label =
Traits::Insts::Label::create(Func, this);
_mov(T_1, Src1Lo, Traits::RegisterSet::Reg_ecx);
_mov(T_2, Src0Lo);
_mov(T_3, Src0Hi);
_shrd(T_2, T_3, T_1);
_sar(T_3, T_1);
_test(T_1, BitTest);
_br(Traits::Cond::Br_e, Label);
// T_2 and T_3 are being assigned again because of the intra-block
// control flow, so T_2 needs the _mov_nonkillable variant to avoid
// liveness problems. T_3 doesn't need special treatment because it is
// reassigned via _sar instead of _mov.
_mov_nonkillable(T_2, T_3);
_sar(T_3, SignExtend);
Context.insert(Label);
_mov(DestLo, T_2);
_mov(DestHi, T_3);
}
} break;
case InstArithmetic::Fadd:
case InstArithmetic::Fsub:
......
......@@ -112,3 +112,105 @@ entry:
}
; CHECK-LABEL: ashrImmNeg
; CHECK: sar {{.*}},0xff
; 64-bit shl by the constant 1. For constant amounts below 32 the lowering
; emits a shld on the high half followed by a shl on the low half, so both
; instructions are expected, in that order (mirrors the ashr-by-one test).
define i64 @shlImm64One(i64 %val) {
entry:
  %result = shl i64 %val, 1
  ret i64 %result
}
; CHECK-LABEL: shlImm64One
; CHECK: shld {{.*}},0x1
; CHECK: shl {{.*}},1
; 64-bit shl by a constant below 32. The lowering emits shld (high half) and
; then shl (low half) with the same immediate. Matching the shld first keeps
; the plain shl pattern from being satisfied by the shld line alone.
define i64 @shlImm64LessThan32(i64 %val) {
entry:
  %result = shl i64 %val, 4
  ret i64 %result
}
; CHECK-LABEL: shlImm64LessThan32
; CHECK: shld {{.*}},0x4
; CHECK: shl {{.*}},0x4
; 64-bit shl by exactly 32: the low half is moved into the high half and the
; low half is zeroed, so no shift instruction should be emitted at all.
define i64 @shlImm64Equal32(i64 %val) {
entry:
  %shifted = shl i64 %val, 32
  ret i64 %shifted
}
; CHECK-LABEL: shlImm64Equal32
; CHECK-NOT: shl
; 64-bit shl by a constant above 32: only the low half is shifted, by
; (amount - 32) = 8, and becomes the high half. No double-precision shld is
; needed, which is asserted the same way the lshr counterpart rejects shrd.
define i64 @shlImm64GreaterThan32(i64 %val) {
entry:
  %result = shl i64 %val, 40
  ret i64 %result
}
; CHECK-LABEL: shlImm64GreaterThan32
; CHECK-NOT: shld
; CHECK: shl {{.*}},0x8
; 64-bit lshr by the constant 1. For constant amounts below 32 the lowering
; emits a shrd on the low half followed by a shr on the high half, so both
; instructions are expected, in that order (mirrors the ashr-by-one test).
define i64 @lshrImm64One(i64 %val) {
entry:
  %result = lshr i64 %val, 1
  ret i64 %result
}
; CHECK-LABEL: lshrImm64One
; CHECK: shrd {{.*}},0x1
; CHECK: shr {{.*}},1
; 64-bit lshr by a constant below 32: expects shrd on the low half and then
; shr on the high half, both with the immediate 4.
define i64 @lshrImm64LessThan32(i64 %val) {
entry:
  %shifted = lshr i64 %val, 4
  ret i64 %shifted
}
; CHECK-LABEL: lshrImm64LessThan32
; CHECK: shrd {{.*}},0x4
; CHECK: shr {{.*}},0x4
; 64-bit lshr by exactly 32: the high half is moved into the low half and the
; high half is zeroed, so no shift instruction should be emitted at all.
define i64 @lshrImm64Equal32(i64 %val) {
entry:
  %shifted = lshr i64 %val, 32
  ret i64 %shifted
}
; CHECK-LABEL: lshrImm64Equal32
; CHECK-NOT: shr
; 64-bit lshr by a constant above 32: only the high half is shifted, by
; (amount - 32) = 8, and becomes the low half; no shrd is expected.
define i64 @lshrImm64GreaterThan32(i64 %val) {
entry:
  %shifted = lshr i64 %val, 40
  ret i64 %shifted
}
; CHECK-LABEL: lshrImm64GreaterThan32
; CHECK-NOT: shrd
; CHECK: shr {{.*}},0x8
; 64-bit ashr by the constant 1: expects shrd on the low half followed by sar
; on the high half.
define i64 @ashrImm64One(i64 %val) {
entry:
  %shifted = ashr i64 %val, 1
  ret i64 %shifted
}
; CHECK-LABEL: ashrImm64One
; CHECK: shrd {{.*}},0x1
; CHECK: sar {{.*}},1
; 64-bit ashr by a constant below 32: expects shrd on the low half and then
; sar on the high half, both with the immediate 4.
define i64 @ashrImm64LessThan32(i64 %val) {
entry:
  %shifted = ashr i64 %val, 4
  ret i64 %shifted
}
; CHECK-LABEL: ashrImm64LessThan32
; CHECK: shrd {{.*}},0x4
; CHECK: sar {{.*}},0x4
; 64-bit ashr by exactly 32: the high half is copied to the low half and the
; new high half is the sign fill produced by sar 0x1f; no shrd is expected.
define i64 @ashrImm64Equal32(i64 %val) {
entry:
  %shifted = ashr i64 %val, 32
  ret i64 %shifted
}
; CHECK-LABEL: ashrImm64Equal32
; CHECK: sar {{.*}},0x1f
; CHECK-NOT: shrd
; 64-bit ashr by a constant above 32: the sign fill is produced with sar 0x1f
; and the low half comes from a shrd by (amount - 32) = 8.
define i64 @ashrImm64GreaterThan32(i64 %val) {
entry:
  %shifted = ashr i64 %val, 40
  ret i64 %shifted
}
; CHECK-LABEL: ashrImm64GreaterThan32
; CHECK: sar {{.*}},0x1f
; CHECK: shrd {{.*}},0x8
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment