Optimize reciprocal sqrt operation

This change deprecates rr::RcpSqrt_pp in favor of rr::RcpSqrt. As with Rcp, RcpSqrt computes the result using Newton-Raphson if it's faster and the initial approximation intrinsic is available on the current target. Currently, only LLVM on Intel will use NR for RelaxedPrecision. Note that passing in Precision::Relaxed will produce a faster, but less precise reciprocal sqrt. Also made it so that the SpirvShader instruction GLSLstd450InverseSqrt now invokes RcpSqrt(x, Precision::Full) instead of performing 1/sqrt(x). Note that the Vulkan spec states that inversesqrt()'s precision is 2 ULP, and sqrt()'s precision is inherited from 1.0 / inversesqrt(); however, our rr::Sqrt is implemented in terms of x86's sqrt intrinsic on x86, or as calls to sqrt from Math.h. Bug: b/169760262 Change-Id: I65ba9a64d1db934c523dda11c1a2c186059d220b Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51268 Commit-Queue: Antonio Maiorano <amaiorano@google.com> Kokoro-Result: kokoro <noreply+kokoro@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Tested-by: Antonio Maiorano <amaiorano@google.com>
parent d1561871
...@@ -236,25 +236,7 @@ Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp) ...@@ -236,25 +236,7 @@ Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
abs = Abs(abs); abs = Abs(abs);
} }
Float4 rsq; return Rcp(abs, pp ? Precision::Relaxed : Precision::Full);
if(!pp)
{
rsq = Float4(1.0f) / Sqrt(abs);
}
else
{
rsq = RcpSqrt_pp(abs);
if(!pp)
{
rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
}
rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
}
return rsq;
} }
Float4 modulo(RValue<Float4> x, RValue<Float4> y) Float4 modulo(RValue<Float4> x, RValue<Float4> y)
......
...@@ -750,19 +750,10 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt ...@@ -750,19 +750,10 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt
auto val = Operand(this, state, insn.word(5)); auto val = Operand(this, state, insn.word(5));
Decorations d; Decorations d;
ApplyDecorationsForId(&d, insn.word(5)); ApplyDecorationsForId(&d, insn.word(5));
if(d.RelaxedPrecision)
{ for(auto i = 0u; i < type.componentCount; i++)
for(auto i = 0u; i < type.componentCount; i++)
{
dst.move(i, RcpSqrt_pp(val.Float(i)));
}
}
else
{ {
for(auto i = 0u; i < type.componentCount; i++) dst.move(i, RcpSqrt(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
{
dst.move(i, SIMD::Float(1.0f) / Sqrt(val.Float(i)));
}
} }
break; break;
} }
......
...@@ -2906,6 +2906,35 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2) ...@@ -2906,6 +2906,35 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
#endif #endif
} }
// Reports whether RcpSqrtApprox() can be used with this backend: true when
// compiling for 32-bit or 64-bit x86, false for every other target.
bool HasRcpSqrtApprox()
{
#if defined(__i386__) || defined(__x86_64__)
	constexpr bool available = true;
#else
	constexpr bool available = false;
#endif

	return available;
}
// Vector approximate reciprocal square root.
// x86 builds forward to the rsqrtps wrapper; any other target reaches
// UNREACHABLE, matching the targets for which HasRcpSqrtApprox() is false.
RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
{
#if !defined(__i386__) && !defined(__x86_64__)
	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");
	return { 0.0f };
#else
	return x86::rsqrtps(x);
#endif
}
// Scalar approximate reciprocal square root.
// x86 builds forward to the rsqrtss wrapper. Other targets have no
// implementation and reach UNREACHABLE; HasRcpSqrtApprox() is false for
// exactly those targets.
RValue<Float> RcpSqrtApprox(RValue<Float> x)
{
#if defined(__i386__) || defined(__x86_64__)
	return x86::rsqrtss(x);
#else
	// The diagnostic names this (scalar) overload so a failure can be told
	// apart from the Float4 overload.
	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");
	return { 0.0f };
#endif
}
RValue<Float> Sqrt(RValue<Float> x) RValue<Float> Sqrt(RValue<Float> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
......
...@@ -4712,4 +4712,69 @@ RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2) ...@@ -4712,4 +4712,69 @@ RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)
return DoRcp(x, p, finite, exactAtPow2); return DoRcp(x, p, finite, exactAtPow2);
} }
// Functions implemented by backends

// Returns true when the backend provides a usable RcpSqrtApprox() for the
// current target; when it returns false, calling RcpSqrtApprox() reaches
// UNREACHABLE in the backend.
bool HasRcpSqrtApprox();
// Approximate reciprocal square root of a Float4 value.
RValue<Float4> RcpSqrtApprox(RValue<Float4> x);
// Approximate reciprocal square root of a scalar Float value.
RValue<Float> RcpSqrtApprox(RValue<Float> x);
// Type trait mapping a Reactor floating-point type to the integer type that
// DoRcpSqrt() reinterprets it as (via As<>) for bitwise masking.
// Only the Float and Float4 specializations below are defined.
template<typename T>
struct CastToIntType;

// Scalar: Float <-> Int
template<>
struct CastToIntType<Float>
{
	typedef Int type;
};

// Vector: Float4 <-> Int4
template<>
struct CastToIntType<Float4>
{
	typedef Int4 type;
};
// Scalar Int "not equal" comparison that yields a mask rather than a Bool:
// all bits set (~0) when x differs from y, otherwise 0.
// TODO: move to Reactor.hpp?
RValue<Int> CmpNEQ(RValue<Int> x, RValue<Int> y)
{
	auto differs = (x != y);

	return IfThenElse(differs, Int(~0), Int(0));
}
// Shared implementation of RcpSqrt() for Float and Float4.
//
// x: value to take the reciprocal square root of.
// p: requested precision. Precision::Full on the approximation path adds one
//    Newton-Raphson refinement step; any other precision returns the raw
//    approximation.
//
// When the backend has no reciprocal-square-root approximation (or, on x86,
// when full precision is requested), the result is computed as 1 / Sqrt(x).
template<typename T>
static RValue<T> DoRcpSqrt(RValue<T> x, Precision p)
{
	// The approximation path calls RcpSqrtApprox(), so it must be gated on
	// HasRcpSqrtApprox() (not on the availability of the plain reciprocal
	// approximation), otherwise a backend lacking it would hit UNREACHABLE.
#if defined(__i386__) || defined(__x86_64__)
	// On x86, 1/sqrt(x) is fast enough, except for lower precision
	bool approx = HasRcpSqrtApprox() && (p != Precision::Full);
#else
	bool approx = HasRcpSqrtApprox();
#endif

	if(!approx)
	{
		return T(1.0f) / Sqrt(x);
	}

	using IntType = typename CastToIntType<T>::type;

	T rsq = RcpSqrtApprox(x);

	if(p == Precision::Full)
	{
		// One Newton-Raphson iteration: r' = r * (3 - r * r * x) / 2
		rsq = rsq * (T(3.0f) - rsq * rsq * x) * T(0.5f);

		// 0x7F800000 is the bit pattern of +infinity. For that input the
		// result bits are masked to zero instead of keeping whatever the
		// refinement step produced.
		// NOTE(review): x == 0 is not masked here; confirm whether the
		// refinement step needs similar handling for that input.
		rsq = As<T>(CmpNEQ(As<IntType>(x), IntType(0x7F800000)) & As<IntType>(rsq));
	}

	return rsq;
}
// Reciprocal square root of a Float4 at the requested precision.
// Thin public entry point over the shared DoRcpSqrt() template.
RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p)
{
	return DoRcpSqrt<Float4>(x, p);
}
// Reciprocal square root of a scalar Float at the requested precision.
// Thin public entry point over the shared DoRcpSqrt() template.
RValue<Float> RcpSqrt(RValue<Float> x, Precision p)
{
	return DoRcpSqrt<Float>(x, p);
}
} // namespace rr } // namespace rr
...@@ -2168,8 +2168,11 @@ RValue<Float> Min(RValue<Float> x, RValue<Float> y); ...@@ -2168,8 +2168,11 @@ RValue<Float> Min(RValue<Float> x, RValue<Float> y);
// Deprecated: use Rcp // Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed // TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false); RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
// Deprecated: use RcpSqrt
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> RcpSqrt_pp(RValue<Float> val); RValue<Float> RcpSqrt_pp(RValue<Float> val);
RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false); RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float> RcpSqrt(RValue<Float> x, Precision p = Precision::Full);
RValue<Float> Sqrt(RValue<Float> x); RValue<Float> Sqrt(RValue<Float> x);
// RValue<Int4> IsInf(RValue<Float> x); // RValue<Int4> IsInf(RValue<Float> x);
...@@ -2336,8 +2339,11 @@ RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y); ...@@ -2336,8 +2339,11 @@ RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
// Deprecated: use Rcp // Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed // TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false); RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
// Deprecated: use RcpSqrt
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> RcpSqrt_pp(RValue<Float4> val); RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false); RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p = Precision::Full);
RValue<Float4> Sqrt(RValue<Float4> x); RValue<Float4> Sqrt(RValue<Float4> x);
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i); RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i); RValue<Float> Extract(RValue<Float4> x, int i);
......
...@@ -3952,6 +3952,25 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2) ...@@ -3952,6 +3952,25 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
return { 0.0f }; return { 0.0f };
} }
// This backend has no reciprocal-square-root approximation: both
// RcpSqrtApprox() overloads here are UNREACHABLE stubs, so always report false.
bool HasRcpSqrtApprox()
{
	constexpr bool available = false;

	return available;
}
// Float4 approximate reciprocal square root: not implemented by this backend.
// HasRcpSqrtApprox() returns false here, so this is not expected to be called;
// reaching it triggers UNREACHABLE. The return statement after it only gives
// the function a value to return.
RValue<Float4> RcpSqrtApprox(RValue<Float4> x)
{
// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
UNREACHABLE("RValue<Float4> RcpSqrtApprox()");
return { 0.0f };
}
// Scalar approximate reciprocal square root: not implemented by this backend.
// HasRcpSqrtApprox() returns false here, so this is not expected to be called;
// reaching it triggers UNREACHABLE. The return statement after it only gives
// the function a value to return.
RValue<Float> RcpSqrtApprox(RValue<Float> x)
{
// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
UNREACHABLE("RValue<Float> RcpSqrtApprox()");
return { 0.0f };
}
RValue<Float4> Sqrt(RValue<Float4> x) RValue<Float4> Sqrt(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment