Commit 8c980d0d by Jim Stichnoth

Subzero: Add fabs intrinsic support.

The intrinsic is lowered using the standard technique of masking off the FP sign bit, which is the high-order bit. To construct this mask, we use the existing trick of loading a vector register with all "1" bits, then logical-shift-right by one bit. In the future, we should add 128-bit vector values to the constant pool and force them to memory, and this could be used for the other routines that synthesize a vector constant. BUG= https://code.google.com/p/nativeclient/issues/detail?id=4097 R=jvoung@chromium.org Review URL: https://codereview.chromium.org/1022573004
parent f644a4b3
......@@ -8,7 +8,7 @@ test: mem_intrin.cpp
[test_arith]
driver: test_arith_main.cpp
test: test_arith.cpp test_arith_frem.ll test_arith_sqrt.ll
test: test_arith.cpp test_arith_frem.ll test_arith_sqrt.ll test_arith_fabs.ll
[test_bitmanip]
driver: test_bitmanip_main.cpp
......
......@@ -55,3 +55,7 @@ FPOP_TABLE
float mySqrt(float a);
double mySqrt(double a);
// mySqrt for v4f32 is currently unsupported.
float myFabs(float a);
double myFabs(double a);
v4f32 myFabs(v4f32 a);
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
; myFabs(float), under its Itanium-mangled name: returns the result of the
; llvm.fabs.f32 intrinsic applied to %a.
define float @_Z6myFabsf(float %a) {
%x = call float @llvm.fabs.f32(float %a)
ret float %x
}
; myFabs(double), under its Itanium-mangled name: returns the result of the
; llvm.fabs.f64 intrinsic applied to %a.
define double @_Z6myFabsd(double %a) {
%x = call double @llvm.fabs.f64(double %a)
ret double %x
}
; myFabs(v4f32), under its Itanium-mangled name: returns the result of the
; llvm.fabs.v4f32 intrinsic applied lane-wise to the <4 x float> argument %a.
define <4 x float> @_Z6myFabsDv4_f(<4 x float> %a) {
%x = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
ret <4 x float> %x
}
......@@ -287,6 +287,18 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
<< Value << "): sz=" << ResultSz << " llc=" << ResultLlc
<< "\n";
}
++TotalTests;
ResultSz = Subzero_::myFabs(Value);
ResultLlc = myFabs(Value);
// Compare results using memcmp() in case they are both NaN.
if (!memcmp(&ResultSz, &ResultLlc, sizeof(Type))) {
++Passes;
} else {
++Failures;
std::cout << std::fixed << "test_fabs" << (CHAR_BIT * sizeof(Type)) << "("
<< Value << "): sz=" << ResultSz << " llc=" << ResultLlc
<< "\n";
}
}
}
......@@ -334,6 +346,19 @@ void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
<< "): sz=" << vectAsString<v4f32>(ResultSz) << " llc"
<< vectAsString<v4f32>(ResultLlc) << "\n";
}
// Special case for unary fabs operation. Use Value1, ignore Value2.
ResultSz = Subzero_::myFabs(Value1);
ResultLlc = myFabs(Value1);
++TotalTests;
if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
++Passes;
} else {
++Failures;
std::cout << "test_fabs_v4f32"
<< "(" << vectAsString<v4f32>(Value1)
<< "): sz=" << vectAsString<v4f32>(ResultSz) << " llc"
<< vectAsString<v4f32>(ResultLlc) << "\n";
}
}
}
}
......
......@@ -931,6 +931,7 @@ template <> const char *InstX8632Psll::Opcode = "psll";
template <> const char *InstX8632Shr::Opcode = "shr";
template <> const char *InstX8632Sar::Opcode = "sar";
template <> const char *InstX8632Psra::Opcode = "psra";
template <> const char *InstX8632Psrl::Opcode = "psrl";
template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
template <> const char *InstX8632MovssRegs::Opcode = "movss";
......@@ -1078,6 +1079,10 @@ template <>
const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psra::Emitter = {
&x86::AssemblerX86::psra, &x86::AssemblerX86::psra,
&x86::AssemblerX86::psra};
// Integrated-assembler dispatch table for psrl. All three slots name
// AssemblerX86::psrl; the overload taken by each slot (XmmRegister, Address or
// Immediate source) is presumably selected by the member types of
// XmmEmitterShiftOp -- NOTE(review): confirm the slot order against that
// struct's declaration.
template <>
const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psrl::Emitter = {
&x86::AssemblerX86::psrl, &x86::AssemblerX86::psrl,
&x86::AssemblerX86::psrl};
template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
......@@ -2667,6 +2672,15 @@ template <> void InstX8632Psra::emit(const Cfg *Func) const {
emitTwoAddress(buf, this, Func);
}
// Textual emission for psrl. The mnemonic is "psrl" followed by the pack
// suffix that TypeX8632Attributes records for the destination's type, and the
// instruction is then printed in two-address form. Does nothing when dumping
// is compiled out.
template <> void InstX8632Psrl::emit(const Cfg *Func) const {
  if (ALLOW_DUMP) {
    const char *PackSuffix =
        TypeX8632Attributes[getDest()->getType()].PackString;
    char Mnemonic[30];
    snprintf(Mnemonic, llvm::array_lengthof(Mnemonic), "psrl%s", PackSuffix);
    emitTwoAddress(Mnemonic, this, Func);
  }
}
void InstX8632Ret::emit(const Cfg *Func) const {
if (!ALLOW_DUMP)
return;
......
......@@ -143,15 +143,15 @@
X(IceType_i16, IceType_void, "si", "" , "" , "w", "") \
X(IceType_i32, IceType_void, "si", "" , "" , "l", "") \
X(IceType_i64, IceType_void, "si", "" , "" , "q", "") \
X(IceType_f32, IceType_void, "ss", "ss", "" , "", "s") \
X(IceType_f64, IceType_void, "sd", "sd", "" , "", "l") \
X(IceType_f32, IceType_void, "ss", "ss", "d", "", "s") \
X(IceType_f64, IceType_void, "sd", "sd", "q", "", "l") \
X(IceType_v4i1, IceType_i32 , "?" , "" , "d", "", "") \
X(IceType_v8i1, IceType_i16 , "?" , "" , "w", "", "") \
X(IceType_v16i1, IceType_i8 , "?" , "" , "b", "", "") \
X(IceType_v16i8, IceType_i8 , "?" , "" , "b", "", "") \
X(IceType_v8i16, IceType_i16 , "?" , "" , "w", "", "") \
X(IceType_v4i32, IceType_i32 , "dq", "" , "d", "", "") \
X(IceType_v4f32, IceType_f32 , "ps", "" , "" , "", "") \
X(IceType_v4f32, IceType_f32 , "ps", "" , "d", "", "") \
//#define X(tag, elementty, cvt, sdss, pack, width, fld)
#endif // SUBZERO_SRC_ICEINSTX8632_DEF
......@@ -234,6 +234,7 @@ public:
Pshufd,
Psll,
Psra,
Psrl,
Psub,
Push,
Pxor,
......@@ -786,7 +787,7 @@ void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
const Operand *Src,
const x86::AssemblerX86::XmmEmitterShiftOp &Emitter);
template <InstX8632::InstKindX8632 K>
template <InstX8632::InstKindX8632 K, bool AllowAllTypes = false>
class InstX8632BinopXmmShift : public InstX8632 {
InstX8632BinopXmmShift() = delete;
InstX8632BinopXmmShift(const InstX8632BinopXmmShift &) = delete;
......@@ -807,8 +808,7 @@ public:
}
void emitIAS(const Cfg *Func) const override {
Type Ty = getDest()->getType();
assert(Ty == IceType_v8i16 || Ty == IceType_v8i1 || Ty == IceType_v4i32 ||
Ty == IceType_v4i1);
assert(AllowAllTypes || isVectorType(Ty));
Type ElementTy = typeElementType(Ty);
assert(getSrcSize() == 2);
emitIASXmmShift(Func, ElementTy, getDest(), getSrc(1), Emitter);
......@@ -1013,6 +1013,7 @@ typedef InstX8632BinopXmm<InstX8632::Divss, false> InstX8632Divss;
typedef InstX8632BinopGPRShift<InstX8632::Rol> InstX8632Rol;
typedef InstX8632BinopGPRShift<InstX8632::Shl> InstX8632Shl;
typedef InstX8632BinopXmmShift<InstX8632::Psll> InstX8632Psll;
typedef InstX8632BinopXmmShift<InstX8632::Psrl, true> InstX8632Psrl;
typedef InstX8632BinopGPRShift<InstX8632::Shr> InstX8632Shr;
typedef InstX8632BinopGPRShift<InstX8632::Sar> InstX8632Sar;
typedef InstX8632BinopXmmShift<InstX8632::Psra> InstX8632Psra;
......@@ -1632,6 +1633,7 @@ template <> void InstX8632Pmull::emit(const Cfg *Func) const;
template <> void InstX8632Pmuludq::emit(const Cfg *Func) const;
template <> void InstX8632Psll::emit(const Cfg *Func) const;
template <> void InstX8632Psra::emit(const Cfg *Func) const;
template <> void InstX8632Psrl::emit(const Cfg *Func) const;
template <> void InstX8632Psub::emit(const Cfg *Func) const;
template <> void InstX8632Sqrtss::emit(const Cfg *Func) const;
template <> void InstX8632Subss::emit(const Cfg *Func) const;
......
......@@ -159,6 +159,16 @@ const struct IceIntrinsicsEntry_ {
CttzInit(IceType_i64, "i64"),
#undef CttzInit
// Table entries for the fabs intrinsic, one per supported overload (f32, f64,
// v4f32). Each entry is marked as having no side effects and not returning
// twice, carries a two-element type signature in which both elements are the
// overload type, and is named "fabs." followed by the overload suffix.
// NOTE(review): the signature is presumably {return type, argument type} --
// confirm against the other entries in this table.
#define FabsInit(Overload, NameSuffix) \
{ \
{ INTRIN(Fabs, SideEffects_F, ReturnsTwice_F), {Overload, Overload}, 2 } \
, "fabs." NameSuffix \
}
FabsInit(IceType_f32, "f32"),
FabsInit(IceType_f64, "f64"),
FabsInit(IceType_v4f32, "v4f32"),
#undef FabsInit
{{INTRIN(Longjmp, SideEffects_T, ReturnsTwice_F),
{IceType_void, IceType_i32, IceType_i32},
3},
......
......@@ -48,6 +48,7 @@ public:
Ctlz,
Ctpop,
Cttz,
Fabs,
Longjmp,
Memcpy,
Memmove,
......
......@@ -3089,6 +3089,18 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
SecondVal);
return;
}
case Intrinsics::Fabs: {
  // Lower fabs by and'ing the operand with a mask that has every bit set
  // except the sign bit of each element; the result is then copied to Dest.
  Operand *Src = legalize(Instr->getArg(0));
  Type Ty = Src->getType();
  Variable *Dest = Instr->getDest();
  Variable *T = makeVectorOfFabsMask(Ty);
  // The memory form of pand always reads a full, 16-byte-aligned m128.  A
  // scalar f32/f64 that is not in a register (stack slot or constant-pool
  // entry) is narrower than that and not necessarily aligned, so it must be
  // brought into a register first.  A vector memory operand is likewise
  // loaded into a register so that no alignment assumption is made about it.
  if (!isVectorType(Ty) || llvm::isa<OperandX8632Mem>(Src))
    Src = legalizeToVar(Src);
  _pand(T, Src);
  if (isVectorType(Ty))
    _movp(Dest, T);
  else
    _mov(Dest, T);
  return;
}
case Intrinsics::Longjmp: {
InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
Call->addArg(Instr->getArg(0));
......@@ -4362,6 +4374,18 @@ Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
}
}
// Builds, in a register, the mask used to clear the sign bit of a
// floating-point value via a bitwise and.  The mask is
// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
// for f64.  It is synthesized by starting from a vector of all-ones
// and logically shifting each element right by one bit.
// TODO(stichnot): Fix the wala TODO above, to represent vector
// constants in memory.
Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, int32_t RegNum) {
  Variable *Mask = makeVectorOfMinusOnes(Ty, RegNum);
  Operand *OneBit = Ctx->getConstantInt8(1);
  _psrl(Mask, OneBit);
  return Mask;
}
OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
Variable *Slot,
uint32_t Offset) {
......
......@@ -174,6 +174,8 @@ protected:
int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfHighOrderBits(Type Ty,
int32_t RegNum = Variable::NoRegister);
Variable *makeVectorOfFabsMask(Type Ty,
int32_t RegNum = Variable::NoRegister);
// Return a memory operand corresponding to a stack allocated Variable.
OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
......@@ -394,6 +396,9 @@ protected:
void _psra(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Psra::create(Func, Dest, Src0));
}
// Creates a psrl instruction with destination Dest and source operand Src0
// and inserts it through the current lowering context.
void _psrl(Variable *Dest, Operand *Src0) {
  auto *ShiftInst = InstX8632Psrl::create(Func, Dest, Src0);
  Context.insert(ShiftInst);
}
void _psub(Variable *Dest, Operand *Src0) {
Context.insert(InstX8632Psub::create(Func, Dest, Src0));
}
......
......@@ -715,6 +715,53 @@ void AssemblerX86::psra(Type Ty, XmmRegister dst, const Immediate &imm) {
EmitUint8(imm.value() & 0xFF);
}
// Encodes psrl with an XMM register source: the 0x66 0x0F prefix bytes, an
// opcode byte chosen by element type (0xD1 for i16, 0xD3 for f64, 0xD2 for
// i32/f32/v4f32), then the register-register operand.
void AssemblerX86::psrl(Type Ty, XmmRegister dst, XmmRegister src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0x66);
  EmitUint8(0x0F);
  switch (Ty) {
  case IceType_i16:
    EmitUint8(0xD1);
    break;
  case IceType_f64:
    EmitUint8(0xD3);
    break;
  default:
    // Every remaining legal type uses the 32-bit-element encoding.
    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
    EmitUint8(0xD2);
    break;
  }
  EmitXmmRegisterOperand(dst, src);
}
// Encodes psrl with a memory source: the 0x66 0x0F prefix bytes, an opcode
// byte chosen by element type (0xD1 for i16, 0xD3 for f64, 0xD2 for
// i32/f32/v4f32), then the register/address operand.
void AssemblerX86::psrl(Type Ty, XmmRegister dst, const Address &src) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  EmitUint8(0x66);
  EmitUint8(0x0F);
  switch (Ty) {
  case IceType_i16:
    EmitUint8(0xD1);
    break;
  case IceType_f64:
    EmitUint8(0xD3);
    break;
  default:
    // Every remaining legal type uses the 32-bit-element encoding.
    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
    EmitUint8(0xD2);
    break;
  }
  EmitOperand(dst, src);
}
// Encodes psrl with an immediate shift count: the 0x66 0x0F prefix bytes, an
// opcode byte chosen by element type (0x71 for i16, 0x73 for f64, 0x72 for
// i32/f32/v4f32), a register operand built with the constant 2 and dst, and
// finally the low byte of the immediate.  The immediate must fit in int8.
void AssemblerX86::psrl(Type Ty, XmmRegister dst, const Immediate &imm) {
  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
  assert(imm.is_int8());
  EmitUint8(0x66);
  EmitUint8(0x0F);
  switch (Ty) {
  case IceType_i16:
    EmitUint8(0x71);
    break;
  case IceType_f64:
    EmitUint8(0x73);
    break;
  default:
    // Every remaining legal type uses the 32-bit-element encoding.
    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
    EmitUint8(0x72);
    break;
  }
  EmitRegisterOperand(2, dst);
  EmitUint8(imm.value() & 0xFF);
}
// {add,sub,mul,div}ps are given a Ty parameter for consistency with
// {add,sub,mul,div}ss. In the future, when the PNaCl ABI allows
// addpd, etc., we can use the Ty parameter to decide on adding
......
......@@ -559,6 +559,9 @@ public:
void psra(Type Ty, XmmRegister dst, XmmRegister src);
void psra(Type Ty, XmmRegister dst, const Address &src);
void psra(Type Ty, XmmRegister dst, const Immediate &src);
void psrl(Type Ty, XmmRegister dst, XmmRegister src);
void psrl(Type Ty, XmmRegister dst, const Address &src);
void psrl(Type Ty, XmmRegister dst, const Immediate &src);
void addps(Type Ty, XmmRegister dst, XmmRegister src);
void addps(Type Ty, XmmRegister dst, const Address &src);
......
......@@ -25,6 +25,9 @@ declare void @llvm.nacl.longjmp(i8*, i32)
declare i32 @llvm.nacl.setjmp(i8*)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare void @llvm.trap()
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
......@@ -268,6 +271,63 @@ entry:
; CHECKO2REM-NOT: sqrtss
; CHECKO2REM-NOT: sqrtsd
; Calls llvm.fabs.f32 three times: on the argument, on the previous fabs
; result, and on the constant -0.0.  The last two results are added and
; returned.  The expectations that follow look for one pcmpeqd / psrld / pand
; sequence per call.
define float @test_fabs_float(float %x) {
entry:
%r = call float @llvm.fabs.f32(float %x)
%r2 = call float @llvm.fabs.f32(float %r)
%r3 = call float @llvm.fabs.f32(float -0.0)
%r4 = fadd float %r2, %r3
ret float %r4
}
; CHECK-LABEL: test_fabs_float
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
; Calls llvm.fabs.f64 three times: on the argument, on the previous fabs
; result, and on the constant -0.0.  The last two results are added and
; returned.  The expectations that follow look for one pcmpeqd / psrlq / pand
; sequence per call.
define double @test_fabs_double(double %x) {
entry:
%r = call double @llvm.fabs.f64(double %x)
%r2 = call double @llvm.fabs.f64(double %r)
%r3 = call double @llvm.fabs.f64(double -0.0)
%r4 = fadd double %r2, %r3
ret double %r4
}
; CHECK-LABEL: test_fabs_double
; CHECK: pcmpeqd
; CHECK: psrlq
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrlq
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrlq
; CHECK: pand
; Calls llvm.fabs.v4f32 three times: on the argument, on the previous fabs
; result, and on an undef vector.  The last two results are added and
; returned.  The expectations that follow look for one pcmpeqd / psrld / pand
; sequence per call.
define <4 x float> @test_fabs_v4f32(<4 x float> %x) {
entry:
%r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
%r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r)
%r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
%r4 = fadd <4 x float> %r2, %r3
ret <4 x float> %r4
}
; CHECK-LABEL: test_fabs_v4f32
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
; CHECK: pcmpeqd
; CHECK: psrld
; CHECK: pand
define i32 @test_trap(i32 %br) {
entry:
%r1 = icmp eq i32 %br, 0
......
......@@ -14,6 +14,9 @@ declare void @llvm.nacl.longjmp(i8*, i32)
declare i32 @llvm.nacl.setjmp(i8*)
declare float @llvm.sqrt.f32(float)
declare double @llvm.sqrt.f64(double)
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare void @llvm.trap()
declare i16 @llvm.bswap.i16(i16)
declare i32 @llvm.bswap.i32(i32)
......@@ -149,6 +152,60 @@ entry:
; CHECK-NEXT: ret double %r4
; CHECK-NEXT: }
; Round-trip input for the f32 fabs intrinsic: three calls (argument, prior
; result, constant -0.0) followed by an fadd.  The expected output that
; follows repeats this function line for line, with the constant printed as
; -0.000000e+00.
define float @test_fabs_float(float %x) {
entry:
%r = call float @llvm.fabs.f32(float %x)
%r2 = call float @llvm.fabs.f32(float %r)
%r3 = call float @llvm.fabs.f32(float -0.0)
%r4 = fadd float %r2, %r3
ret float %r4
}
; CHECK-NEXT: define float @test_fabs_float(float %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %r = call float @llvm.fabs.f32(float %x)
; CHECK-NEXT: %r2 = call float @llvm.fabs.f32(float %r)
; CHECK-NEXT: %r3 = call float @llvm.fabs.f32(float -0.000000e+00)
; CHECK-NEXT: %r4 = fadd float %r2, %r3
; CHECK-NEXT: ret float %r4
; CHECK-NEXT: }
; Round-trip input for the f64 fabs intrinsic: three calls (argument, prior
; result, constant -0.0) followed by an fadd.  The expected output that
; follows repeats this function line for line, with the constant printed as
; -0.000000e+00.
define double @test_fabs_double(double %x) {
entry:
%r = call double @llvm.fabs.f64(double %x)
%r2 = call double @llvm.fabs.f64(double %r)
%r3 = call double @llvm.fabs.f64(double -0.0)
%r4 = fadd double %r2, %r3
ret double %r4
}
; CHECK-NEXT: define double @test_fabs_double(double %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %r = call double @llvm.fabs.f64(double %x)
; CHECK-NEXT: %r2 = call double @llvm.fabs.f64(double %r)
; CHECK-NEXT: %r3 = call double @llvm.fabs.f64(double -0.000000e+00)
; CHECK-NEXT: %r4 = fadd double %r2, %r3
; CHECK-NEXT: ret double %r4
; CHECK-NEXT: }
; Round-trip input for the v4f32 fabs intrinsic: three calls (argument, prior
; result, undef vector) followed by an fadd.  The expected output that
; follows repeats this function line for line.
define <4 x float> @test_fabs_v4f32(<4 x float> %x) {
entry:
%r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
%r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r)
%r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
%r4 = fadd <4 x float> %r2, %r3
ret <4 x float> %r4
}
; CHECK-NEXT: define <4 x float> @test_fabs_v4f32(<4 x float> %x) {
; CHECK-NEXT: entry:
; CHECK-NEXT: %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
; CHECK-NEXT: %r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r)
; CHECK-NEXT: %r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
; CHECK-NEXT: %r4 = fadd <4 x float> %r2, %r3
; CHECK-NEXT: ret <4 x float> %r4
; CHECK-NEXT: }
define i32 @test_trap(i32 %br) {
entry:
%r1 = icmp eq i32 %br, 0
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment