Subzero: Add fabs intrinsic support.

The intrinsic is lowered using the standard technique of masking off the FP sign bit, which is the high-order bit. To construct this mask, we use the existing trick of loading a vector register with all "1" bits and then logically shifting each element right by one bit. In the future, we should add 128-bit vector values to the constant pool and force them to memory; that approach could also be used by the other routines that synthesize a vector constant.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4097
R=jvoung@chromium.org
Review URL: https://codereview.chromium.org/1022573004

Subzero: Add fabs intrinsic support.
8c980d0d · Jim Stichnoth · f644a4b3 · 8c980d0d · 8c980d0d · 8c980d0d
Commit 8c980d0d authored Mar 19, 2015 by Jim Stichnoth
15 changed files
--- a/crosstest/crosstest.cfg
+++ b/crosstest/crosstest.cfg
@@ -8,7 +8,7 @@ test: mem_intrin.cpp

 [test_arith]
 driver: test_arith_main.cpp
-test: test_arith.cpp test_arith_frem.ll test_arith_sqrt.ll
+test: test_arith.cpp test_arith_frem.ll test_arith_sqrt.ll test_arith_fabs.ll

 [test_bitmanip]
 driver: test_bitmanip_main.cpp

--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -55,3 +55,7 @@ FPOP_TABLE
 float mySqrt(float a);
 double mySqrt(double a);
 // mySqrt for v4f32 is currently unsupported.
+
+float myFabs(float a);
+double myFabs(double a);
+v4f32 myFabs(v4f32 a);
--- a/crosstest/test_arith_fabs.ll
+++ b/crosstest/test_arith_fabs.ll
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+
+define float @_Z6myFabsf(float %a) { ; Itanium-mangled myFabs(float) from test_arith.h
+  %x = call float @llvm.fabs.f32(float %a) ; thin wrapper: forward to the intrinsic
+  ret float %x
+}
+
+define double @_Z6myFabsd(double %a) { ; Itanium-mangled myFabs(double)
+  %x = call double @llvm.fabs.f64(double %a)
+  ret double %x
+}
+
+define <4 x float> @_Z6myFabsDv4_f(<4 x float> %a) { ; myFabs on a 4 x float vector
+  %x = call <4 x float> @llvm.fabs.v4f32(<4 x float> %a)
+  ret <4 x float> %x
+}
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
@@ -287,6 +287,18 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
                << Value << "): sz=" << ResultSz << " llc=" << ResultLlc
                << "\n";
    }
+    ++TotalTests;
+    ResultSz = Subzero_::myFabs(Value);
+    ResultLlc = myFabs(Value);
+    // Compare results using memcmp() in case they are both NaN.
+    if (!memcmp(&ResultSz, &ResultLlc, sizeof(Type))) {
+      ++Passes;
+    } else {
+      ++Failures;
+      std::cout << std::fixed << "test_fabs" << (CHAR_BIT * sizeof(Type)) << "("
+                << Value << "): sz=" << ResultSz << " llc=" << ResultLlc
+                << "\n";
+    }
  }
 }

@@ -334,6 +346,19 @@ void testsVecFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
                  << "): sz=" << vectAsString<v4f32>(ResultSz) << " llc"
                  << vectAsString<v4f32>(ResultLlc) << "\n";
      }
+      // Special case for unary fabs operation.  Use Value1, ignore Value2.
+      ResultSz = Subzero_::myFabs(Value1);
+      ResultLlc = myFabs(Value1);
+      ++TotalTests;
+      if (!memcmp(&ResultSz, &ResultLlc, sizeof(ResultSz))) {
+        ++Passes;
+      } else {
+        ++Failures;
+        std::cout << "test_fabs_v4f32"
+                  << "(" << vectAsString<v4f32>(Value1)
+                  << "): sz=" << vectAsString<v4f32>(ResultSz) << " llc"
+                  << vectAsString<v4f32>(ResultLlc) << "\n";
+      }
    }
  }
 }

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -931,6 +931,7 @@ template <> const char *InstX8632Psll::Opcode = "psll";
 template <> const char *InstX8632Shr::Opcode = "shr";
 template <> const char *InstX8632Sar::Opcode = "sar";
 template <> const char *InstX8632Psra::Opcode = "psra";
+template <> const char *InstX8632Psrl::Opcode = "psrl";
 template <> const char *InstX8632Pcmpeq::Opcode = "pcmpeq";
 template <> const char *InstX8632Pcmpgt::Opcode = "pcmpgt";
 template <> const char *InstX8632MovssRegs::Opcode = "movss";
@@ -1078,6 +1079,10 @@ template <>
 const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psra::Emitter = {
    &x86::AssemblerX86::psra, &x86::AssemblerX86::psra,
    &x86::AssemblerX86::psra};
+template <> // Assembler entry points for Psrl: one pointer per psrl overload.
+const x86::AssemblerX86::XmmEmitterShiftOp InstX8632Psrl::Emitter = {
+    &x86::AssemblerX86::psrl, &x86::AssemblerX86::psrl,
+    &x86::AssemblerX86::psrl}; // slot order is fixed by XmmEmitterShiftOp (not shown here)

 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const {
  if (!ALLOW_DUMP)
@@ -2667,6 +2672,15 @@ template <> void InstX8632Psra::emit(const Cfg *Func) const {
  emitTwoAddress(buf, this, Func);
 }

+template <> void InstX8632Psrl::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP) // nothing is emitted when ALLOW_DUMP is false
+    return;
+  char buf[30]; // holds "psrl" plus the type's pack suffix
+  snprintf(buf, llvm::array_lengthof(buf), "psrl%s",
+           TypeX8632Attributes[getDest()->getType()].PackString); // suffix chosen by dest type
+  emitTwoAddress(buf, this, Func); // hands the assembled mnemonic to the shared helper
+}
+
 void InstX8632Ret::emit(const Cfg *Func) const {
  if (!ALLOW_DUMP)
    return;

--- a/src/IceInstX8632.def
+++ b/src/IceInstX8632.def
@@ -143,15 +143,15 @@
  X(IceType_i16,   IceType_void, "si", ""  , "" ,  "w", "")      \
  X(IceType_i32,   IceType_void, "si", ""  , "" ,  "l", "")      \
  X(IceType_i64,   IceType_void, "si", ""  , "" ,  "q", "")      \
-  X(IceType_f32,   IceType_void, "ss", "ss", "" ,  "",  "s")     \
-  X(IceType_f64,   IceType_void, "sd", "sd", "" ,  "",  "l")     \
+  X(IceType_f32,   IceType_void, "ss", "ss", "d",  "",  "s")     \
+  X(IceType_f64,   IceType_void, "sd", "sd", "q",  "",  "l")     \
  X(IceType_v4i1,  IceType_i32 , "?" , ""  , "d",  "",  "")      \
  X(IceType_v8i1,  IceType_i16 , "?" , ""  , "w",  "",  "")      \
  X(IceType_v16i1, IceType_i8  , "?" , ""  , "b",  "",  "")      \
  X(IceType_v16i8, IceType_i8  , "?" , ""  , "b",  "",  "")      \
  X(IceType_v8i16, IceType_i16 , "?" , ""  , "w",  "",  "")      \
  X(IceType_v4i32, IceType_i32 , "dq", ""  , "d",  "",  "")      \
-  X(IceType_v4f32, IceType_f32 , "ps", ""  , "" ,  "",  "")      \
+  X(IceType_v4f32, IceType_f32 , "ps", ""  , "d",  "",  "")      \
 //#define X(tag, elementty, cvt, sdss, pack, width, fld)

 #endif // SUBZERO_SRC_ICEINSTX8632_DEF
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -234,6 +234,7 @@ public:
    Pshufd,
    Psll,
    Psra,
+    Psrl,
    Psub,
    Push,
    Pxor,
@@ -786,7 +787,7 @@ void emitIASXmmShift(const Cfg *Func, Type Ty, const Variable *Var,
                     const Operand *Src,
                     const x86::AssemblerX86::XmmEmitterShiftOp &Emitter);

-template <InstX8632::InstKindX8632 K>
+template <InstX8632::InstKindX8632 K, bool AllowAllTypes = false>
 class InstX8632BinopXmmShift : public InstX8632 {
  InstX8632BinopXmmShift() = delete;
  InstX8632BinopXmmShift(const InstX8632BinopXmmShift &) = delete;
@@ -807,8 +808,7 @@ public:
  }
  void emitIAS(const Cfg *Func) const override {
    Type Ty = getDest()->getType();
-    assert(Ty == IceType_v8i16 || Ty == IceType_v8i1 || Ty == IceType_v4i32 ||
-           Ty == IceType_v4i1);
+    assert(AllowAllTypes || isVectorType(Ty));
    Type ElementTy = typeElementType(Ty);
    assert(getSrcSize() == 2);
    emitIASXmmShift(Func, ElementTy, getDest(), getSrc(1), Emitter);
@@ -1013,6 +1013,7 @@ typedef InstX8632BinopXmm<InstX8632::Divss, false> InstX8632Divss;
 typedef InstX8632BinopGPRShift<InstX8632::Rol> InstX8632Rol;
 typedef InstX8632BinopGPRShift<InstX8632::Shl> InstX8632Shl;
 typedef InstX8632BinopXmmShift<InstX8632::Psll> InstX8632Psll;
+typedef InstX8632BinopXmmShift<InstX8632::Psrl, true> InstX8632Psrl;
 typedef InstX8632BinopGPRShift<InstX8632::Shr> InstX8632Shr;
 typedef InstX8632BinopGPRShift<InstX8632::Sar> InstX8632Sar;
 typedef InstX8632BinopXmmShift<InstX8632::Psra> InstX8632Psra;
@@ -1632,6 +1633,7 @@ template <> void InstX8632Pmull::emit(const Cfg *Func) const;
 template <> void InstX8632Pmuludq::emit(const Cfg *Func) const;
 template <> void InstX8632Psll::emit(const Cfg *Func) const;
 template <> void InstX8632Psra::emit(const Cfg *Func) const;
+template <> void InstX8632Psrl::emit(const Cfg *Func) const;
 template <> void InstX8632Psub::emit(const Cfg *Func) const;
 template <> void InstX8632Sqrtss::emit(const Cfg *Func) const;
 template <> void InstX8632Subss::emit(const Cfg *Func) const;

--- a/src/IceIntrinsics.cpp
+++ b/src/IceIntrinsics.cpp
@@ -159,6 +159,16 @@ const struct IceIntrinsicsEntry_ {
    CttzInit(IceType_i64, "i64"),
 #undef CttzInit

+#define FabsInit(Overload, NameSuffix)                                         \
+  {                                                                            \
+    { INTRIN(Fabs, SideEffects_F, ReturnsTwice_F), {Overload, Overload}, 2 }   \
+    , "fabs." NameSuffix                                                       \
+  }
+    FabsInit(IceType_f32, "f32"),
+    FabsInit(IceType_f64, "f64"),
+    FabsInit(IceType_v4f32, "v4f32"),
+#undef FabsInit
+
    {{INTRIN(Longjmp, SideEffects_T, ReturnsTwice_F),
      {IceType_void, IceType_i32, IceType_i32},
      3},

--- a/src/IceIntrinsics.h
+++ b/src/IceIntrinsics.h
@@ -48,6 +48,7 @@ public:
    Ctlz,
    Ctpop,
    Cttz,
+    Fabs,
    Longjmp,
    Memcpy,
    Memmove,

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -3089,6 +3089,18 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
                    SecondVal);
    return;
  }
+  case Intrinsics::Fabs: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Type Ty = Src->getType();
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeVectorOfFabsMask(Ty);
+    _pand(T, Src);
+    if (isVectorType(Ty))
+      _movp(Dest, T);
+    else
+      _mov(Dest, T);
+    return;
+  }
  case Intrinsics::Longjmp: {
    InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
    Call->addArg(Instr->getArg(0));
@@ -4362,6 +4374,18 @@ Variable *TargetX8632::makeVectorOfHighOrderBits(Type Ty, int32_t RegNum) {
  }
 }

+// Construct a mask in a register that can be and'ed with a
+// floating-point value to mask off its sign bit.  The value will be
+// <4 x 0x7fffffff> for f32 and v4f32, and <2 x 0x7fffffffffffffff>
+// for f64.  Construct it as a vector of all ones, logically right
+// shifted by one bit.  TODO(stichnot): Fix the wala TODO above, to
+// represent vector constants in memory.
+Variable *TargetX8632::makeVectorOfFabsMask(Type Ty, int32_t RegNum) {
+  Variable *Reg = makeVectorOfMinusOnes(Ty, RegNum); // start from all-ones
+  _psrl(Reg, Ctx->getConstantInt8(1)); // shift right by 1 to clear the top bit
+  return Reg; // caller and's this with the value (see the Fabs lowering)
+}
+
 OperandX8632Mem *TargetX8632::getMemoryOperandForStackSlot(Type Ty,
                                                           Variable *Slot,
                                                           uint32_t Offset) {

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -174,6 +174,8 @@ protected:
                                  int32_t RegNum = Variable::NoRegister);
  Variable *makeVectorOfHighOrderBits(Type Ty,
                                      int32_t RegNum = Variable::NoRegister);
+  Variable *makeVectorOfFabsMask(Type Ty,
+                                 int32_t RegNum = Variable::NoRegister);

  // Return a memory operand corresponding to a stack allocated Variable.
  OperandX8632Mem *getMemoryOperandForStackSlot(Type Ty, Variable *Slot,
@@ -394,6 +396,9 @@ protected:
  void _psra(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Psra::create(Func, Dest, Src0));
  }
+  void _psrl(Variable *Dest, Operand *Src0) { // insert a Psrl instruction
+    Context.insert(InstX8632Psrl::create(Func, Dest, Src0));
+  }
  void _psub(Variable *Dest, Operand *Src0) {
    Context.insert(InstX8632Psub::create(Func, Dest, Src0));
  }

--- a/src/assembler_ia32.cpp
+++ b/src/assembler_ia32.cpp
@@ -715,6 +715,53 @@ void AssemblerX86::psra(Type Ty, XmmRegister dst, const Immediate &imm) {
  EmitUint8(imm.value() & 0xFF);
 }

+void AssemblerX86::psrl(Type Ty, XmmRegister dst, XmmRegister src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66); // encoding: 66 0F <opcode> <operand byte(s)>
+  EmitUint8(0x0F);
+  if (Ty == IceType_i16) {
+    EmitUint8(0xD1); // psrlw (16-bit elements)
+  } else if (Ty == IceType_f64) {
+    EmitUint8(0xD3); // psrlq (64-bit elements)
+  } else {
+    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
+    EmitUint8(0xD2); // psrld (32-bit elements)
+  }
+  EmitXmmRegisterOperand(dst, src); // shift count comes from an xmm register
+}
+
+void AssemblerX86::psrl(Type Ty, XmmRegister dst, const Address &src) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  EmitUint8(0x66); // encoding: 66 0F <opcode> <operand byte(s)>
+  EmitUint8(0x0F);
+  if (Ty == IceType_i16) {
+    EmitUint8(0xD1); // psrlw (16-bit elements)
+  } else if (Ty == IceType_f64) {
+    EmitUint8(0xD3); // psrlq (64-bit elements)
+  } else {
+    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
+    EmitUint8(0xD2); // psrld (32-bit elements)
+  }
+  EmitOperand(dst, src); // shift count comes from a memory operand
+}
+
+void AssemblerX86::psrl(Type Ty, XmmRegister dst, const Immediate &imm) {
+  AssemblerBuffer::EnsureCapacity ensured(&buffer_);
+  assert(imm.is_int8()); // only one immediate byte is emitted below
+  EmitUint8(0x66); // encoding: 66 0F <opcode> <reg operand> <imm8>
+  EmitUint8(0x0F);
+  if (Ty == IceType_i16) {
+    EmitUint8(0x71); // psrlw by immediate (16-bit elements)
+  } else if (Ty == IceType_f64) {
+    EmitUint8(0x73); // psrlq by immediate (64-bit elements)
+  } else {
+    assert(Ty == IceType_i32 || Ty == IceType_f32 || Ty == IceType_v4f32);
+    EmitUint8(0x72); // psrld by immediate (32-bit elements)
+  }
+  EmitRegisterOperand(2, dst); // 2 = opcode extension (/2) for these forms, per Intel SDM
+  EmitUint8(imm.value() & 0xFF); // shift count as the trailing immediate byte
+}
+
 // {add,sub,mul,div}ps are given a Ty parameter for consistency with
 // {add,sub,mul,div}ss. In the future, when the PNaCl ABI allows
 // addpd, etc., we can use the Ty parameter to decide on adding

--- a/src/assembler_ia32.h
+++ b/src/assembler_ia32.h
@@ -559,6 +559,9 @@ public:
  void psra(Type Ty, XmmRegister dst, XmmRegister src);
  void psra(Type Ty, XmmRegister dst, const Address &src);
  void psra(Type Ty, XmmRegister dst, const Immediate &src);
+  void psrl(Type Ty, XmmRegister dst, XmmRegister src);
+  void psrl(Type Ty, XmmRegister dst, const Address &src);
+  void psrl(Type Ty, XmmRegister dst, const Immediate &src);

  void addps(Type Ty, XmmRegister dst, XmmRegister src);
  void addps(Type Ty, XmmRegister dst, const Address &src);

--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -25,6 +25,9 @@ declare void @llvm.nacl.longjmp(i8*, i32)
 declare i32 @llvm.nacl.setjmp(i8*)
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 declare void @llvm.trap()
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -268,6 +271,63 @@ entry:
 ; CHECKO2REM-NOT: sqrtss
 ; CHECKO2REM-NOT: sqrtsd

+define float @test_fabs_float(float %x) { ; three f32 fabs calls, one per CHECK group below
+entry:
+  %r = call float @llvm.fabs.f32(float %x) ; fabs of the argument
+  %r2 = call float @llvm.fabs.f32(float %r) ; fabs of a previous fabs result
+  %r3 = call float @llvm.fabs.f32(float -0.0) ; fabs of a constant operand
+  %r4 = fadd float %r2, %r3 ; sum of the last two results is returned
+  ret float %r4
+}
+; CHECK-LABEL: test_fabs_float
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+
+define double @test_fabs_double(double %x) { ; three f64 fabs calls, one per CHECK group below
+entry:
+  %r = call double @llvm.fabs.f64(double %x) ; fabs of the argument
+  %r2 = call double @llvm.fabs.f64(double %r) ; fabs of a previous fabs result
+  %r3 = call double @llvm.fabs.f64(double -0.0) ; fabs of a constant operand
+  %r4 = fadd double %r2, %r3 ; sum of the last two results is returned
+  ret double %r4
+}
+; CHECK-LABEL: test_fabs_double
+; CHECK: pcmpeqd
+; CHECK: psrlq
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrlq
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrlq
+; CHECK: pand
+
+define <4 x float> @test_fabs_v4f32(<4 x float> %x) { ; three vector fabs calls, one per CHECK group
+entry:
+  %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) ; fabs of the argument
+  %r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r) ; fabs of a previous fabs result
+  %r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) ; fabs of an undef operand
+  %r4 = fadd <4 x float> %r2, %r3 ; sum of the last two results is returned
+  ret <4 x float> %r4
+}
+; CHECK-LABEL: test_fabs_v4f32
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+; CHECK: pcmpeqd
+; CHECK: psrld
+; CHECK: pand
+
 define i32 @test_trap(i32 %br) {
 entry:
  %r1 = icmp eq i32 %br, 0

--- a/tests_lit/reader_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/reader_tests/nacl-other-intrinsics.ll
@@ -14,6 +14,9 @@ declare void @llvm.nacl.longjmp(i8*, i32)
 declare i32 @llvm.nacl.setjmp(i8*)
 declare float @llvm.sqrt.f32(float)
 declare double @llvm.sqrt.f64(double)
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
 declare void @llvm.trap()
 declare i16 @llvm.bswap.i16(i16)
 declare i32 @llvm.bswap.i32(i32)
@@ -149,6 +152,60 @@ entry:
 ; CHECK-NEXT:   ret double %r4
 ; CHECK-NEXT: }

+define float @test_fabs_float(float %x) { ; CHECK-NEXT lines below expect this body echoed back
+entry:
+  %r = call float @llvm.fabs.f32(float %x) ; fabs of the argument
+  %r2 = call float @llvm.fabs.f32(float %r) ; fabs of a previous fabs result
+  %r3 = call float @llvm.fabs.f32(float -0.0) ; constant; expected output spells it -0.000000e+00
+  %r4 = fadd float %r2, %r3
+  ret float %r4
+}
+
+; CHECK-NEXT: define float @test_fabs_float(float %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %r = call float @llvm.fabs.f32(float %x)
+; CHECK-NEXT:   %r2 = call float @llvm.fabs.f32(float %r)
+; CHECK-NEXT:   %r3 = call float @llvm.fabs.f32(float -0.000000e+00)
+; CHECK-NEXT:   %r4 = fadd float %r2, %r3
+; CHECK-NEXT:   ret float %r4
+; CHECK-NEXT: }
+
+define double @test_fabs_double(double %x) { ; CHECK-NEXT lines below expect this body echoed back
+entry:
+  %r = call double @llvm.fabs.f64(double %x) ; fabs of the argument
+  %r2 = call double @llvm.fabs.f64(double %r) ; fabs of a previous fabs result
+  %r3 = call double @llvm.fabs.f64(double -0.0) ; constant; expected output spells it -0.000000e+00
+  %r4 = fadd double %r2, %r3
+  ret double %r4
+}
+
+; CHECK-NEXT: define double @test_fabs_double(double %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %r = call double @llvm.fabs.f64(double %x)
+; CHECK-NEXT:   %r2 = call double @llvm.fabs.f64(double %r)
+; CHECK-NEXT:   %r3 = call double @llvm.fabs.f64(double -0.000000e+00)
+; CHECK-NEXT:   %r4 = fadd double %r2, %r3
+; CHECK-NEXT:   ret double %r4
+; CHECK-NEXT: }
+
+define <4 x float> @test_fabs_v4f32(<4 x float> %x) { ; CHECK-NEXT lines below expect this body echoed back
+entry:
+  %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x) ; fabs of the argument
+  %r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r) ; fabs of a previous fabs result
+  %r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef) ; fabs of an undef operand
+  %r4 = fadd <4 x float> %r2, %r3
+  ret <4 x float> %r4
+}
+
+; CHECK-NEXT: define <4 x float> @test_fabs_v4f32(<4 x float> %x) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT:   %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
+; CHECK-NEXT:   %r2 = call <4 x float> @llvm.fabs.v4f32(<4 x float> %r)
+; CHECK-NEXT:   %r3 = call <4 x float> @llvm.fabs.v4f32(<4 x float> undef)
+; CHECK-NEXT:   %r4 = fadd <4 x float> %r2, %r3
+; CHECK-NEXT:   ret <4 x float> %r4
+; CHECK-NEXT: }
+
 define i32 @test_trap(i32 %br) {
 entry:
  %r1 = icmp eq i32 %br, 0