Add scalar lowering for sqrt intrinsic.

Re-used test_arith_main.cpp, mostly to share the set of interesting floating-point constants.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=3882
R=stichnot@chromium.org, wala@chromium.org

Review URL: https://codereview.chromium.org/384443003

Add scalar lowering for sqrt intrinsic.
f37fbbe9 · Jan Voung · 9559899d · f37fbbe9 · f37fbbe9 · f37fbbe9
Commit f37fbbe9 authored Jul 09, 2014 by Jan Voung
10 changed files
--- a/crosstest/runtests.sh
+++ b/crosstest/runtests.sh
@@ -30,7 +30,9 @@ for optlevel in ${OPTLEVELS} ; do
    ./crosstest.py -O${optlevel} --prefix=Subzero_ --target=x8632 \
        --dir="${OUTDIR}" \
        --llvm-bin-path="${LLVM_BIN_PATH}" \
-        --test=test_arith.cpp --test=test_arith_frem.ll \
+        --test=test_arith.cpp \
+        --test=test_arith_frem.ll \
+        --test=test_arith_sqrt.ll \
        --driver=test_arith_main.cpp \
        --output=test_arith_O${optlevel}

--- a/crosstest/test_arith.h
+++ b/crosstest/test_arith.h
@@ -27,3 +27,6 @@ double myFrem(double a, double b);
  double test##inst(double a, double b);
 FPOP_TABLE
 #undef X
+float mySqrt(float a);
+double mySqrt(double a);
--- a/crosstest/test_arith_main.cpp
+++ b/crosstest/test_arith_main.cpp
 /* crosstest.py --test=test_arith.cpp --test=test_arith_frem.ll \
-   --driver=test_arith_main.cpp --prefix=Subzero_ --output=test_arith */
+   --test=test_arith_sqrt.ll --driver=test_arith_main.cpp \
+   --prefix=Subzero_ --output=test_arith */
 #include <stdint.h>
@@ -123,6 +124,7 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
  static const Type NegInf = -1.0 / 0.0;
  static const Type PosInf = 1.0 / 0.0;
  static const Type Nan = 0.0 / 0.0;
+  static const Type NegNan = -0.0 / 0.0;
  volatile Type Values[] = {
    0,                    1,                    0x7e,
    0x7f,                 0x80,                 0x81,
@@ -134,7 +136,8 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
    0x100000001ll,        0x7ffffffffffffffell, 0x7fffffffffffffffll,
    0x8000000000000000ll, 0x8000000000000001ll, 0xfffffffffffffffell,
    0xffffffffffffffffll, NegInf,               PosInf,
-    Nan,                  FLT_MIN,              FLT_MAX,
+    Nan,                  NegNan,               -0.0,
+    FLT_MIN,              FLT_MAX,
    DBL_MIN,              DBL_MAX
  };
  const static size_t NumValues = sizeof(Values) / sizeof(*Values);
@@ -173,6 +176,22 @@ void testsFp(size_t &TotalTests, size_t &Passes, size_t &Failures) {
      }
    }
  }
+  for (size_t i = 0; i < NumValues; ++i) {
+    Type Value = Values[i];
+    ++TotalTests;
+    Type ResultSz = Subzero_::mySqrt(Value);
+    Type ResultLlc = mySqrt(Value);
+    // Compare results using memcmp() in case they are both NaN.
+    if (!memcmp(&ResultSz, &ResultLlc, sizeof(Type))) {
+      ++Passes;
+    } else {
+      ++Failures;
+      std::cout << std::fixed << "test_sqrt"
+                << (8 * sizeof(Type)) << "(" << Value
+                << "): sz=" << ResultSz << " llc=" << ResultLlc
+                << std::endl;
+    }
+  }
 }
 int main(int argc, char **argv) {

--- a/crosstest/test_arith_sqrt.ll
+++ b/crosstest/test_arith_sqrt.ll
+target triple = "i686-pc-linux-gnu"
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+define float @_Z6mySqrtf(float %a) {
+  %x = call float @llvm.sqrt.f32(float %a)
+  ret float %x
+}
+define double @_Z6mySqrtd(double %a) {
+  %x = call double @llvm.sqrt.f64(double %a)
+  ret double %x
+}
--- a/crosstest/test_fcmp_main.cpp
+++ b/crosstest/test_fcmp_main.cpp
@@ -22,15 +22,18 @@ int main(int argc, char **argv) {
  static const double Ten = 10.0;
  static const double PosInf = 1.0 / 0.0;
  static const double Nan = 0.0 / 0.0;
+  static const double NegNan = -0.0 / 0.0;
  assert(std::fpclassify(NegInf) == FP_INFINITE);
  assert(std::fpclassify(PosInf) == FP_INFINITE);
  assert(std::fpclassify(Nan) == FP_NAN);
+  assert(std::fpclassify(NegNan) == FP_NAN);
  assert(NegInf < Zero);
  assert(NegInf < PosInf);
  assert(Zero < PosInf);
-  volatile double Values[] = { NegInf,  Zero,    DBL_MIN, FLT_MIN, Ten,
+  volatile double Values[] = { NegInf, -Zero,   Zero,    DBL_MIN, FLT_MIN,
-                               FLT_MAX, DBL_MAX, PosInf,  Nan, };
+                               Ten,    FLT_MAX, DBL_MAX, PosInf,  Nan,
+                               NegNan };
  const static size_t NumValues = sizeof(Values) / sizeof(*Values);
  typedef bool (*FuncTypeFloat)(float, float);

--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -271,6 +271,11 @@ bool InstX8632Movq::isRedundantAssign() const {
  return false;
 }
+InstX8632Sqrtss::InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source)
+    : InstX8632(Func, InstX8632::Sqrtss, 1, Dest) {
+  addSource(Source);
+}
 InstX8632Ret::InstX8632Ret(Cfg *Func, Variable *Source)
    : InstX8632(Func, InstX8632::Ret, Source ? 1 : 0, NULL) {
  if (Source)
@@ -919,6 +924,25 @@ void InstX8632Ret::dump(const Cfg *Func) const {
  dumpSources(Func);
 }
+void InstX8632Sqrtss::emit(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Type Ty = getSrc(0)->getType();
+  assert(Ty == IceType_f32 || Ty == IceType_f64);
+  Str << "\tsqrt" << TypeX8632Attributes[Ty].SdSsString << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+  Str << "\n";
+}
+void InstX8632Sqrtss::dump(const Cfg *Func) const {
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << " = sqrt." << getDest()->getType() << " ";
+  dumpSources(Func);
+}
 void InstX8632Xadd::emit(const Cfg *Func) const {
  Ostream &Str = Func->getContext()->getStrEmit();
  if (Locked) {

--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -170,6 +170,7 @@ public:
    Shld,
    Shr,
    Shrd,
+    Sqrtss,
    Store,
    StoreQ,
    Sub,
@@ -827,6 +828,24 @@ private:
  virtual ~InstX8632Ret() {}
 };
+// Sqrtss - Scalar sqrt of a float or double.
+class InstX8632Sqrtss : public InstX8632 {
+public:
+  static InstX8632Sqrtss *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstX8632Sqrtss>())
+        InstX8632Sqrtss(Func, Dest, Source);
+  }
+  virtual void emit(const Cfg *Func) const;
+  virtual void dump(const Cfg *Func) const;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Sqrtss); }
+private:
+  InstX8632Sqrtss(Cfg *Func, Variable *Dest, Operand *Source);
+  InstX8632Sqrtss(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
+  InstX8632Sqrtss &operator=(const InstX8632Sqrtss &) LLVM_DELETED_FUNCTION;
+  virtual ~InstX8632Sqrtss() {}
+};
 // Exchanging Add instruction.  Exchanges the first operand (destination
 // operand) with the second operand (source operand), then loads the sum
 // of the two values into the destination operand. The destination may be

--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -2018,7 +2018,14 @@ void TargetX8632::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
    lowerCall(Call);
    return;
  }
-  case Intrinsics::Sqrt:
+  case Intrinsics::Sqrt: {
+    Operand *Src = legalize(Instr->getArg(0));
+    Variable *Dest = Instr->getDest();
+    Variable *T = makeReg(Dest->getType());
+    _sqrtss(T, Src);
+    _mov(Dest, T);
+    return;
+  }
  case Intrinsics::Stacksave:
  case Intrinsics::Stackrestore:
    // TODO(jvoung): fill it in.

--- a/src/IceTargetLoweringX8632.h
+++ b/src/IceTargetLoweringX8632.h
@@ -252,6 +252,9 @@ protected:
  void _shrd(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert(InstX8632Shrd::create(Func, Dest, Src0, Src1));
  }
+  void _sqrtss(Variable *Dest, Operand *Src0) {
+    Context.insert(InstX8632Sqrtss::create(Func, Dest, Src0));
+  }
  void _store(Operand *Value, OperandX8632 *Mem) {
    Context.insert(InstX8632Store::create(Func, Value, Mem));
  }

--- a/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
+++ b/tests_lit/llvm2ice_tests/nacl-other-intrinsics.ll
@@ -15,6 +15,8 @@ declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1)
 declare void @llvm.memset.p0i8.i32(i8*, i8, i32, i32, i1)
 declare void @llvm.nacl.longjmp(i8*, i32)
 declare i32 @llvm.nacl.setjmp(i8*)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
 declare void @llvm.trap()
 define i32 @test_nacl_read_tp() {
@@ -160,6 +162,63 @@ entry:
 ; CHECKO2REM-LABEL: test_setjmp_unused
 ; CHECKO2REM: call setjmp
+define float @test_sqrt_float(float %x, i32 %iptr) {
+entry:
+  %r = call float @llvm.sqrt.f32(float %x)
+  %r2 = call float @llvm.sqrt.f32(float %r)
+  %r3 = call float @llvm.sqrt.f32(float -0.0)
+  %r4 = fadd float %r2, %r3
+  br label %next
+next:
+  %__6 = inttoptr i32 %iptr to float*
+  %y = load float* %__6, align 4
+  %r5 = call float @llvm.sqrt.f32(float %y)
+  %r6 = fadd float %r4, %r5
+  ret float %r6
+}
+; CHECK-LABEL: test_sqrt_float
+; CHECK: sqrtss xmm{{.*}}
+; CHECK: sqrtss xmm{{.*}}
+; CHECK: sqrtss xmm{{.*}}, dword ptr
+; CHECK-LABEL: .L{{.*}}next
+; We could fold the load and the sqrt into one operation, but the
+; current folding only handles load + arithmetic op. The sqrt inst
+; is considered an intrinsic call and not an arithmetic op.
+; CHECK: sqrtss xmm{{.*}}
+define double @test_sqrt_double(double %x, i32 %iptr) {
+entry:
+  %r = call double @llvm.sqrt.f64(double %x)
+  %r2 = call double @llvm.sqrt.f64(double %r)
+  %r3 = call double @llvm.sqrt.f64(double -0.0)
+  %r4 = fadd double %r2, %r3
+  br label %next
+next:
+  %__6 = inttoptr i32 %iptr to double*
+  %y = load double* %__6, align 8
+  %r5 = call double @llvm.sqrt.f64(double %y)
+  %r6 = fadd double %r4, %r5
+  ret double %r6
+}
+; CHECK-LABEL: test_sqrt_double
+; CHECK: sqrtsd xmm{{.*}}
+; CHECK: sqrtsd xmm{{.*}}
+; CHECK: sqrtsd xmm{{.*}}, qword ptr
+; CHECK-LABEL: .L{{.*}}next
+; CHECK: sqrtsd xmm{{.*}}
+define float @test_sqrt_ignored(float %x, double %y) {
+entry:
+  %ignored1 = call float @llvm.sqrt.f32(float %x)
+  %ignored2 = call double @llvm.sqrt.f64(double %y)
+  ret float 0.0
+}
+; CHECKO2REM-LABEL: test_sqrt_ignored
+; CHECKO2REM-NOT: sqrtss
+; CHECKO2REM-NOT: sqrtsd
 define i32 @test_trap(i32 %br) {
 entry:
  %r1 = icmp eq i32 %br, 0