Commit 98cc08ca by John Porto

Subzero. ARM32. Strength reduce multiplications.

parent 614140e2
......@@ -49,3 +49,37 @@ SINTOP_TABLE
v4f32 test##inst(v4f32 a, v4f32 b) { return func(a op b); }
FPOP_TABLE
#undef X
// Emits, for each constant mult_by listed in MULIMM_TABLE, overloads of
// testMultiplyBy<mult_by>(a, unused) and testMultiplyByNeg<mult_by>(a, unused)
// over bool/uint8_t/uint16_t/uint32_t/uint64_t, each returning a multiplied by
// the (positive or negated) constant. The second parameter is ignored; it
// keeps every test function on the common two-argument shape the crosstest
// function tables expect (see the Funcs table entries that pair these with
// their Subzero_:: counterparts).
#define X(mult_by) \
  bool testMultiplyBy##mult_by(bool a, bool /*unused*/) { \
    return a * (mult_by); \
  } \
  bool testMultiplyByNeg##mult_by(bool a, bool /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint8_t testMultiplyBy##mult_by(uint8_t a, uint8_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint8_t testMultiplyByNeg##mult_by(uint8_t a, uint8_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint16_t testMultiplyBy##mult_by(uint16_t a, uint16_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint16_t testMultiplyByNeg##mult_by(uint16_t a, uint16_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint32_t testMultiplyBy##mult_by(uint32_t a, uint32_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint32_t testMultiplyByNeg##mult_by(uint32_t a, uint32_t /*unused*/) { \
    return a * (-(mult_by)); \
  } \
  uint64_t testMultiplyBy##mult_by(uint64_t a, uint64_t /*unused*/) { \
    return a * (mult_by); \
  } \
  uint64_t testMultiplyByNeg##mult_by(uint64_t a, uint64_t /*unused*/) { \
    return a * (-(mult_by)); \
  }
MULIMM_TABLE
#undef X
......@@ -17,35 +17,35 @@
#define XSTR(s) STR(s)
#define STR(s) #s
#define UINTOP_TABLE \
/* inst, operator, div, shift */ \
X(Add, +, 0, 0) \
X(Sub, -, 0, 0) \
X(Mul, *, 0, 0) \
X(Udiv, /, 1, 0) \
X(Urem, %, 1, 0) \
X(Shl, <<, 0, 1) \
X(Lshr, >>, 0, 1) \
X(And, &, 0, 0) \
X(Or, |, 0, 0) \
X(Xor, ^, 0, 0) \
// X-macro table of the unsigned integer binary operations exercised by the
// crosstests. Columns: Subzero instruction name, the matching C operator, a
// flag for division-like operators, and a flag for shift operators.
// NOTE(review): the div/shift flag semantics are inferred from the column
// header and which ops are flagged (presumably to avoid zero divisors and to
// mask shift amounts) -- confirm at the X() expansion sites.
#define UINTOP_TABLE \
  /* inst, operator, div, shift */ \
  X(Add, +, 0, 0) \
  X(Sub, -, 0, 0) \
  X(Mul, *, 0, 0) \
  X(Udiv, /, 1, 0) \
  X(Urem, %, 1, 0) \
  X(Shl, <<, 0, 1) \
  X(Lshr, >>, 0, 1) \
  X(And, &, 0, 0) \
  X(Or, |, 0, 0) \
  X(Xor, ^, 0, 0) \
//#define X(inst, op, isdiv, isshift)
#define SINTOP_TABLE \
/* inst, operator, div, shift */ \
X(Sdiv, /, 1, 0) \
X(Srem, %, 1, 0) \
X(Ashr, >>, 0, 1) \
// X-macro table of the signed-only integer operations (same column layout as
// UINTOP_TABLE): signed divide/remainder and arithmetic shift right.
#define SINTOP_TABLE \
  /* inst, operator, div, shift */ \
  X(Sdiv, /, 1, 0) \
  X(Srem, %, 1, 0) \
  X(Ashr, >>, 0, 1) \
//#define X(inst, op, isdiv, isshift)
#define COMMA ,
#define FPOP_TABLE \
/* inst, infix_op, func */ \
X(Fadd, +, ) \
X(Fsub, -, ) \
X(Fmul, *, ) \
X(Fdiv, /, ) \
X(Frem, COMMA, myFrem) \
// X-macro table of floating-point operations. The third column names a helper
// function for ops that lower to a call (Frem -> myFrem); for infix ops it is
// empty. Frem's "operator" is COMMA so that an expansion of the form
// "func(a op b)" becomes the call "myFrem(a , b)" -- this is why COMMA is
// defined just above.
#define FPOP_TABLE \
  /* inst, infix_op, func */ \
  X(Fadd, +, ) \
  X(Fsub, -, ) \
  X(Fmul, *, ) \
  X(Fdiv, /, ) \
  X(Frem, COMMA, myFrem) \
//#define X(inst, op, func)
// Note: The above definition of COMMA, plus the "func" argument to
......@@ -55,30 +55,51 @@
// instruction and "(a + b)" for the Fadd instruction. The two
// versions of myFrem() are defined in a separate bitcode file.
#define INT_VALUE_ARRAY \
{ 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
0x1e, 0x1f, 0x20, 0x21, 0x3e, 0x3f, 0x40, 0x41, \
0x7e, 0x7f, 0x80, 0x81, \
0xfe, 0xff, 0x100, 0x101, \
0x7ffe, 0x7fff, 0x8000, 0x8001, \
// INT_VALUE_ARRAY is the brace-enclosed list of integer test inputs: 0, 1,
// values straddling the sign/width boundaries of 8-, 16-, and 32-bit types
// (0x7f/0x80, 0x7fff/0x8000, 0x7fffffff/0x80000000, ...), and values around
// common shift-amount boundaries (0x1e-0x21, 0x3e-0x41).
#define INT_VALUE_ARRAY \
  { 0x0, 0x1, 0x7ffffffe, 0x7fffffff, \
    0x80000000, 0x80000001, 0xfffffffe, 0xffffffff, \
    0x1e, 0x1f, 0x20, 0x21, 0x3e, 0x3f, 0x40, 0x41, \
    0x7e, 0x7f, 0x80, 0x81, \
    0xfe, 0xff, 0x100, 0x101, \
    0x7ffe, 0x7fff, 0x8000, 0x8001, \
    0xfffe, 0xffff, 0x10000, 0x10001 }
#define FP_VALUE_ARRAY(NegInf, PosInf, NegNan, NaN) \
{ 0, 1, 1.4, \
1.5, 1.6, -1.4, \
-1.5, -1.6, 0x7e, \
0x7f, 0x80, 0x81, \
0xfe, 0xff, 0x7ffe, \
0x7fff, 0x8000, 0x8001, \
0xfffe, 0xffff, 0x7ffffffe, \
0x7fffffff, 0x80000000, 0x80000001, \
0xfffffffe, 0xffffffff, 0x100000000ll, \
0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll, \
0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell, \
0xffffffffffffffffll, NegInf, PosInf, \
Nan, NegNan, -0.0, \
10.0, FLT_MIN, FLT_MAX, \
// FP_VALUE_ARRAY is the brace-enclosed list of floating-point test inputs.
// The four parameters supply the non-finite values so each caller can spell
// its infinities/NaNs appropriately. The list mixes small reals, values at
// integer-type boundaries, and the float/double range extremes.
//
// Fix: the body previously used the token "Nan" where the declared parameter
// is "NaN", so the fourth argument was never substituted and every expansion
// contained a stray, undeclared "Nan" identifier.
#define FP_VALUE_ARRAY(NegInf, PosInf, NegNan, NaN) \
  { 0, 1, 1.4, \
    1.5, 1.6, -1.4, \
    -1.5, -1.6, 0x7e, \
    0x7f, 0x80, 0x81, \
    0xfe, 0xff, 0x7ffe, \
    0x7fff, 0x8000, 0x8001, \
    0xfffe, 0xffff, 0x7ffffffe, \
    0x7fffffff, 0x80000000, 0x80000001, \
    0xfffffffe, 0xffffffff, 0x100000000ll, \
    0x100000001ll, 0x7ffffffffffffffell, 0x7fffffffffffffffll, \
    0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell, \
    0xffffffffffffffffll, NegInf, PosInf, \
    NaN, NegNan, -0.0, \
    10.0, FLT_MIN, FLT_MAX, \
    DBL_MIN, DBL_MAX }
// X-macro table of multiplication constants for the strength-reduction
// crosstests: zero, powers of two (1, 2, 4, 8), powers of two plus/minus one
// (3, 5, 7, 9), small composites (10, 25, 100, 232), and 32-bit patterns
// exercising long runs of 1s, isolated bits, and the sign bit
// (0x00FFF001, 0x01000000, 0x7FFFF07F, 0x80000000).
#define MULIMM_TABLE \
  /* mult_by */ \
  X( 0) \
  X( 1) \
  X( 2) \
  X( 3) \
  X( 4) \
  X( 5) \
  X( 7) \
  X( 8) \
  X( 9) \
  X( 10) \
  X( 25) \
  X( 100) \
  X( 232) \
  X(0x00FFF001) \
  X(0x01000000) \
  X(0x7FFFF07F) \
  X(0x80000000) \
//#define X(mult_by)
#endif // TEST_ARITH_DEF
......@@ -60,3 +60,17 @@ double mySqrt(double a);
float myFabs(float a);
double myFabs(double a);
v4f32 myFabs(v4f32 a);
// Declares, for each MULIMM_TABLE constant, the testMultiplyBy/
// testMultiplyByNeg overloads so callers can reference both the native and
// the Subzero-compiled definitions of these functions.
#define X(mult_by) \
  bool testMultiplyBy##mult_by(bool a, bool); \
  bool testMultiplyByNeg##mult_by(bool a, bool); \
  uint8_t testMultiplyBy##mult_by(uint8_t a, uint8_t); \
  uint8_t testMultiplyByNeg##mult_by(uint8_t a, uint8_t); \
  uint16_t testMultiplyBy##mult_by(uint16_t a, uint16_t); \
  uint16_t testMultiplyByNeg##mult_by(uint16_t a, uint16_t); \
  uint32_t testMultiplyBy##mult_by(uint32_t a, uint32_t); \
  uint32_t testMultiplyByNeg##mult_by(uint32_t a, uint32_t); \
  uint64_t testMultiplyBy##mult_by(uint64_t a, uint64_t); \
  uint64_t testMultiplyByNeg##mult_by(uint64_t a, uint64_t);
MULIMM_TABLE
#undef X
......@@ -73,7 +73,15 @@ void testsInt(size_t &TotalTests, size_t &Passes, size_t &Failures) {
,
SINTOP_TABLE
#undef X
};
#define X(mult_by) \
{ \
"Mult-By-" STR(mult_by), testMultiplyBy##mult_by, \
Subzero_::testMultiplyBy##mult_by, NULL, NULL, false \
} \
, {"Mult-By-Neg-" STR(mult_by), testMultiplyByNeg##mult_by, \
Subzero_::testMultiplyByNeg##mult_by, NULL, NULL, false},
MULIMM_TABLE};
#undef X
const static size_t NumFuncs = sizeof(Funcs) / sizeof(*Funcs);
if (sizeof(TypeUnsigned) <= sizeof(uint32_t)) {
......
......@@ -32,6 +32,7 @@
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <array>
#include <utility>
namespace Ice {
......@@ -2036,6 +2037,154 @@ void TargetARM32::lowerInt64Arithmetic(InstArithmetic::OpKind Op,
}
}
namespace {
// StrengthReduction is a namespace with the strength reduction machinery. The
// entry point is the StrengthReduction::tryToOptimize method. It returns true
// if the optimization can be performed, and false otherwise.
//
// If the optimization can be performed, tryToOptimize sets its NumOperations
// parameter to the number of shifts that are needed to perform the
// multiplication; and it sets the Operations parameter with <ShAmt, AddOrSub>
// tuples that describe how to materialize the multiplication.
//
// The algorithm finds contiguous 1s in the Multiplication source, and uses one
// or two shifts to materialize it. A sequence of 1s, e.g.,
//
// M N
// ...00000000000011111...111110000000...
//
// is materializable with (1 << (M + 1)) - (1 << N):
//
// ...00000000000100000...000000000000... [1 << (M + 1)]
// ...00000000000000000...000010000000... (-) [1 << N]
// --------------------------------------
// ...00000000000011111...111110000000...
//
// And a single bit set, which is just a left shift.
namespace StrengthReduction {
// AggregationOperation tags how one step of the strength-reduced
// multiplication combines its shifted operand into the running result:
// added (AO_Add) or subtracted (AO_Sub). AO_Invalid marks a
// default-constructed, not-yet-filled element.
enum AggregationOperation {
  AO_Invalid,
  AO_Add,
  AO_Sub,
};
// AggregateElement is a glorified <ShAmt, AddOrSub> tuple.
class AggregationElement {
AggregationElement(const AggregationElement &) = delete;
public:
AggregationElement() = default;
AggregationElement &operator=(const AggregationElement &) = default;
AggregationElement(AggregationOperation Op, uint32_t ShAmt)
: Op(Op), ShAmt(ShAmt) {}
Operand *createShiftedOperand(Cfg *Func, Variable *OpR) const {
assert(OpR->mustHaveReg());
if (ShAmt == 0) {
return OpR;
}
return OperandARM32FlexReg::create(
Func, IceType_i32, OpR, OperandARM32::LSL,
OperandARM32ShAmtImm::create(
Func, llvm::cast<ConstantInteger32>(
Func->getContext()->getConstantInt32(ShAmt))));
}
bool aggregateWithAdd() const {
switch (Op) {
case AO_Invalid:
llvm::report_fatal_error("Invalid Strength Reduction Operations.");
case AO_Add:
return true;
case AO_Sub:
return false;
}
}
uint32_t shAmt() const { return ShAmt; }
private:
AggregationOperation Op = AO_Invalid;
uint32_t ShAmt;
};
// [RangeStart, RangeEnd] is a range of 1s in Src.
template <std::size_t N>
bool addOperations(uint32_t RangeStart, uint32_t RangeEnd, SizeT *NumOperations,
std::array<AggregationElement, N> *Operations) {
assert(*NumOperations < N);
if (RangeStart == RangeEnd) {
// Single bit set:
// Src : 0...00010...
// RangeStart : ^
// RangeEnd : ^
// NegSrc : 0...00001...
(*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart);
++(*NumOperations);
return true;
}
// Sequence of 1s: (two operations required.)
// Src : 0...00011...110...
// RangeStart : ^
// RangeEnd : ^
// NegSrc : 0...00000...001...
if (*NumOperations + 1 >= N) {
return false;
}
(*Operations)[*NumOperations] = AggregationElement(AO_Add, RangeStart + 1);
++(*NumOperations);
(*Operations)[*NumOperations] = AggregationElement(AO_Sub, RangeEnd);
++(*NumOperations);
return true;
}
// tryToOptimize scans Src looking for sequences of 1s (including the unitary
// bit 1 surrounded by zeroes), filling *Operations with the shift/add/sub
// steps that materialize multiplication by Src and setting *NumOperations.
// Returns true if Src was fully decomposed within the N available slots,
// false otherwise (in which case the caller should fall back to mul).
template <std::size_t N>
bool tryToOptimize(uint32_t Src, SizeT *NumOperations,
                   std::array<AggregationElement, N> *Operations) {
  constexpr uint32_t SrcSizeBits = sizeof(Src) * CHAR_BIT;
  uint32_t NegSrc = ~Src;
  *NumOperations = 0;
  while (Src != 0 && *NumOperations < N) {
    // Each step of the algorithm:
    //   * finds L, the last bit set in Src;
    //   * clears all the upper bits in NegSrc up to bit L;
    //   * finds nL, the last bit set in NegSrc;
    //   * clears all the upper bits in Src up to bit nL;
    //
    // if L == nL + 1, then a unitary 1 was found in Src. Otherwise, a sequence
    // of 1s starting at L, and ending at nL + 1, was found.
    const uint32_t SrcLastBitSet = llvm::findLastSet(Src);
    // Mask keeping only the NegSrc bits strictly below SrcLastBitSet. The
    // SrcLastBitSet == 0 case is special-cased because the shift would
    // otherwise be by SrcSizeBits, which is undefined.
    const uint32_t NegSrcClearMask =
        (SrcLastBitSet == 0) ? 0
                             : (0xFFFFFFFFu) >> (SrcSizeBits - SrcLastBitSet);
    NegSrc &= NegSrcClearMask;
    if (NegSrc == 0) {
      // No 0 bit below SrcLastBitSet: the run of 1s extends down to bit 0.
      if (addOperations(SrcLastBitSet, 0, NumOperations, Operations)) {
        return true;
      }
      return false;
    }
    const uint32_t NegSrcLastBitSet = llvm::findLastSet(NegSrc);
    assert(NegSrcLastBitSet < SrcLastBitSet);
    // Drop the just-handled run [NegSrcLastBitSet + 1, SrcLastBitSet] from
    // Src before recording it, so the next iteration sees the next run.
    const uint32_t SrcClearMask =
        (NegSrcLastBitSet == 0) ? 0 : (0xFFFFFFFFu) >>
                                          (SrcSizeBits - NegSrcLastBitSet);
    Src &= SrcClearMask;
    if (!addOperations(SrcLastBitSet, NegSrcLastBitSet + 1, NumOperations,
                       Operations)) {
      return false;
    }
  }
  // Src == 0 means every run was consumed; nonzero means the loop stopped
  // because the Operations array filled up before Src was decomposed.
  return Src == 0;
}
} // end of namespace StrengthReduction
} // end of anonymous namespace
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest();
......@@ -2044,29 +2193,30 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
return;
}
if (Dest->getType() == IceType_i1) {
Type DestTy = Dest->getType();
if (DestTy == IceType_i1) {
lowerInt1Arithmetic(Inst);
return;
}
Operand *Src0 = legalizeUndef(Inst->getSrc(0));
Operand *Src1 = legalizeUndef(Inst->getSrc(1));
if (Dest->getType() == IceType_i64) {
if (DestTy == IceType_i64) {
lowerInt64Arithmetic(Inst->getOp(), Inst->getDest(), Src0, Src1);
return;
}
if (isVectorType(Dest->getType())) {
if (isVectorType(DestTy)) {
// Add a fake def to keep liveness consistent in the meantime.
Variable *T = makeReg(Dest->getType());
Variable *T = makeReg(DestTy);
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
// Dest->getType() is a non-i64 scalar.
Variable *T = makeReg(Dest->getType());
// DestTy is a non-i64 scalar.
Variable *T = makeReg(DestTy);
// * Handle div/rem separately. They require a non-legalized Src1 to inspect
// whether or not Src1 is a non-zero constant. Once legalized it is more
......@@ -2107,7 +2257,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
case InstArithmetic::Frem: {
constexpr SizeT MaxSrcs = 2;
Variable *Src0R = legalizeToReg(Src0);
Type Ty = Dest->getType();
Type Ty = DestTy;
InstCall *Call = makeHelperCall(
isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
Call->addArg(Src0R);
......@@ -2205,8 +2355,6 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Sub: {
if (Srcs.hasConstOperand()) {
// TODO(jpp): lowering Src0R here is wrong -- Src0R it is not guaranteed
// to be used.
if (Srcs.immediateIsFlexEncodable()) {
Variable *Src0R = Srcs.src0R(this);
Operand *Src1RF = Srcs.src1RF(this);
......@@ -2233,6 +2381,85 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
return;
}
case InstArithmetic::Mul: {
const bool OptM1 = Ctx->getFlags().getOptLevel() == Opt_m1;
if (!OptM1 && Srcs.hasConstOperand()) {
constexpr std::size_t MaxShifts = 4;
std::array<StrengthReduction::AggregationElement, MaxShifts> Shifts;
SizeT NumOperations;
int32_t Const = Srcs.getConstantValue();
const bool Invert = Const < 0;
const bool MultiplyByZero = Const == 0;
Operand *_0 =
legalize(Ctx->getConstantZero(DestTy), Legal_Reg | Legal_Flex);
if (MultiplyByZero) {
_mov(T, _0);
_mov(Dest, T);
return;
}
if (Invert) {
Const = -Const;
}
if (StrengthReduction::tryToOptimize(Const, &NumOperations, &Shifts)) {
assert(NumOperations >= 1);
Variable *Src0R = Srcs.src0R(this);
int32_t Start;
int32_t End;
if (NumOperations == 1 || Shifts[NumOperations - 1].shAmt() != 0) {
// Multiplication by a power of 2 (NumOperations == 1); or
// Multiplication by a even number not a power of 2.
Start = 1;
End = NumOperations;
assert(Shifts[0].aggregateWithAdd());
_lsl(T, Src0R, shAmtImm(Shifts[0].shAmt()));
} else {
// Multiplication by an odd number. Put the free barrel shifter to a
// good use.
Start = 0;
End = NumOperations - 2;
const StrengthReduction::AggregationElement &Last =
Shifts[NumOperations - 1];
const StrengthReduction::AggregationElement &SecondToLast =
Shifts[NumOperations - 2];
if (!Last.aggregateWithAdd()) {
assert(SecondToLast.aggregateWithAdd());
_rsb(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
} else if (!SecondToLast.aggregateWithAdd()) {
assert(Last.aggregateWithAdd());
_sub(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
} else {
_add(T, Src0R, SecondToLast.createShiftedOperand(Func, Src0R));
}
}
// Odd numbers : S E I I
// +---+---+---+---+---+---+ ... +---+---+---+---+
// Shifts = | | | | | | | ... | | | | |
// +---+---+---+---+---+---+ ... +---+---+---+---+
// Even numbers: I S E
//
// S: Start; E: End; I: Init
for (int32_t I = Start; I < End; ++I) {
const StrengthReduction::AggregationElement &Current = Shifts[I];
Operand *SrcF = Current.createShiftedOperand(Func, Src0R);
if (Current.aggregateWithAdd()) {
_add(T, T, SrcF);
} else {
_sub(T, T, SrcF);
}
}
if (Invert) {
// T = 0 - T.
_rsb(T, T, _0);
}
_mov(Dest, T);
return;
}
}
Variable *Src0R = Srcs.unswappedSrc0R(this);
Variable *Src1R = Srcs.unswappedSrc1R(this);
_mul(T, Src0R, Src1R);
......@@ -2248,7 +2475,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Lshr: {
Variable *Src0R = Srcs.unswappedSrc0R(this);
if (Dest->getType() != IceType_i32) {
if (DestTy != IceType_i32) {
_uxt(Src0R, Src0R);
}
_lsr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));
......@@ -2257,7 +2484,7 @@ void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
}
case InstArithmetic::Ashr: {
Variable *Src0R = Srcs.unswappedSrc0R(this);
if (Dest->getType() != IceType_i32) {
if (DestTy != IceType_i32) {
_sxt(Src0R, Src0R);
}
_asr(T, Src0R, Srcs.unswappedSrc1RShAmtImm(this));
......
......@@ -977,15 +977,16 @@ private:
// AllowTemporaryWithNoReg indicates if TargetARM32::makeReg() can be invoked
// without specifying a physical register. This is needed for creating unbound
// temporaries during Ice -> ARM lowering, but before register allocation.
// This a safe-guard that, during the legalization post-passes no unbound
// temporaries are created.
// This a safe-guard that no unbound temporaries are created during the
// legalization post-passes.
bool AllowTemporaryWithNoReg = true;
// ForbidTemporaryWithoutReg is a RAII class that manages
// AllowTemporaryWithNoReg.
class ForbidTemporaryWithoutReg {
ForbidTemporaryWithoutReg() = delete;
ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg&) = delete;
ForbidTemporaryWithoutReg &operator=(const ForbidTemporaryWithoutReg&) = delete;
ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg &) = delete;
ForbidTemporaryWithoutReg &
operator=(const ForbidTemporaryWithoutReg &) = delete;
public:
explicit ForbidTemporaryWithoutReg(TargetARM32 *Target) : Target(Target) {
......
......@@ -11,7 +11,7 @@
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -O2 --skip-unimplemented \
; RUN: | %if --need=target_ARM32 --need=allow_dump \
; RUN: --command FileCheck --check-prefix ARM32 %s
; RUN:   --command FileCheck --check-prefix ARM32 --check-prefix ARM32-OPT2 %s
; RUN: %if --need=target_ARM32 --need=allow_dump \
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -O2 --mattr=hwdiv-arm --skip-unimplemented \
......@@ -21,7 +21,7 @@
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target arm32 \
; RUN: -i %s --args -Om1 --skip-unimplemented \
; RUN: | %if --need=target_ARM32 --need=allow_dump \
; RUN: --command FileCheck --check-prefix ARM32 %s
; RUN: --command FileCheck --check-prefix ARM32 --check-prefix ARM32-OPTM1 %s
;
; RUN: %if --need=target_MIPS32 --need=allow_dump \
; RUN: --command %p2i --filetype=asm --assemble --disassemble --target mips32\
......@@ -117,8 +117,11 @@ entry:
; CHECK-LABEL: MulImm
; CHECK: imul e{{.*}},e{{.*}},0x63
; ARM32-LABEL: MulImm
; ARM32: movw {{.*}}, #99
; ARM32: mul r{{.*}}, r{{.*}}, r{{.*}}
; ARM32-OPTM1: movw {{.*}}, #99
; ARM32-OPTM1: mul r{{.*}}, r{{.*}}, r{{.*}}
; ARM32-OPT2: rsb [[T:r[0-9]+]], [[S:r[0-9]+]], [[S]], lsl #2
; ARM32-OPT2-DAG: add [[T]], [[T]], [[S]], lsl #7
; ARM32-OPT2-DAG: sub [[T]], [[T]], [[S]], lsl #5
; MIPS32-LABEL: MulImm
; MIPS32: mul
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment