Optimize reciprocal operation

This change deprecates rr::Rcp_pp in favor of rr::Rcp, which makes sure to correctly compute the reciprocal, using Newton-Raphson refinement only if the current target supports the required intrinsic and otherwise using 1 / x. Currently, only LLVM on Intel will use NR. Note that passing in Precision::Relaxed will produce a faster, but less precise, reciprocal. Also removed PixelProgram::linearToSRGB as it is unused.
Bug: b/169760262
Bug: b/149574741
Change-Id: I4a2f943aa60116c4397d7a8ae18583a260824788
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/50648
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
Commit-Queue: Antonio Maiorano <amaiorano@google.com>
parent 1ca6504e
...@@ -373,12 +373,4 @@ void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS]) ...@@ -373,12 +373,4 @@ void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
} }
} }
// Approximates x^(1.0/2.2) as 1.14 * sqrt(x) - 0.14 * x, clamped to [0, 1].
Float4 PixelProgram::linearToSRGB(const Float4 &x)
{
	// The square root is formed as the reciprocal of the reciprocal square root.
	Float4 root = Rcp_pp(RcpSqrt_pp(x));
	Float4 encoded = root * Float4(1.14f) - x * Float4(0.14f);

	// Clamp the result to the [0, 1] range.
	return Min(Max(encoded, Float4(0.0f)), Float4(1.0f));
}
} // namespace sw } // namespace sw
...@@ -48,7 +48,6 @@ private: ...@@ -48,7 +48,6 @@ private:
Int4 maskAny(Int cMask[4]) const; Int4 maskAny(Int cMask[4]) const;
Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const; Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
Float4 linearToSRGB(const Float4 &x);
}; };
} // namespace sw } // namespace sw
......
...@@ -133,7 +133,7 @@ void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBu ...@@ -133,7 +133,7 @@ void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBu
WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]); WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
} }
WWWW = Rcp_pp(WWWW); WWWW = Rcp(WWWW, Precision::Relaxed);
XXXX *= WWWW; XXXX *= WWWW;
YYYY *= WWWW; YYYY *= WWWW;
......
...@@ -1226,10 +1226,10 @@ void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisot ...@@ -1226,10 +1226,10 @@ void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisot
uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask))); uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask))); vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
anisotropy = lod * Rcp_pp(det); anisotropy = lod * Rcp(det, Precision::Relaxed);
anisotropy = Min(anisotropy, state.maxAnisotropy); anisotropy = Min(anisotropy, state.maxAnisotropy);
lod *= Rcp_pp(anisotropy * anisotropy); lod *= Rcp(anisotropy * anisotropy, Precision::Relaxed);
} }
lod = log2sqrt(lod); // log2(sqrt(lod)) lod = log2sqrt(lod); // log2(sqrt(lod))
......
...@@ -224,22 +224,7 @@ Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp) ...@@ -224,22 +224,7 @@ Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2) Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
{ {
//return Float4(1.0f) / x; return Rcp(x, pp ? Precision::Relaxed : Precision::Full, finite, exactAtPow2);
Float4 rcp = Rcp_pp(x, exactAtPow2);
if(!pp)
{
rcp = (rcp + rcp) - (x * rcp * rcp);
}
if(finite)
{
int big = 0x7F7FFFFF;
rcp = Min(rcp, Float4((float &)big));
}
return rcp;
} }
Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp) Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
......
...@@ -2865,6 +2865,47 @@ RValue<Float> RcpSqrt_pp(RValue<Float> x) ...@@ -2865,6 +2865,47 @@ RValue<Float> RcpSqrt_pp(RValue<Float> x)
#endif #endif
} }
// Reports whether this backend provides RcpApprox().
// True only when compiling for 32-bit or 64-bit x86.
bool HasRcpApprox()
{
#if !defined(__i386__) && !defined(__x86_64__)
	return false;
#else
	return true;
#endif
}
// Approximate reciprocal of a Float4 through x86::rcpps.
// When exactAtPow2 is set, the approximation is rescaled by the inverse of
// the host's approximate reciprocal of 1.0f.
// Must not be called on non-x86 targets (see HasRcpApprox()).
RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
{
#if defined(__i386__) || defined(__x86_64__)
	if(!exactAtPow2)
	{
		return x86::rcpps(x);
	}

	// rcpps uses a piecewise-linear approximation which minimizes the relative error
	// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
	const float rectify = 1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f)));
	return x86::rcpps(x) * Float4(rectify);
#else
	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
	return { 0.0f };
#endif
}
// Approximate reciprocal of a scalar Float through x86::rcpss.
// When exactAtPow2 is set, the approximation is rescaled by the inverse of
// the host's approximate reciprocal of 1.0f.
// Must not be called on non-x86 targets (see HasRcpApprox()).
RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
{
#if defined(__i386__) || defined(__x86_64__)
	if(exactAtPow2)
	{
		// rcpss uses a piecewise-linear approximation which minimizes the relative error
		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
	}
	return x86::rcpss(x);
#else
	// The message names this (scalar) overload so a failure is not
	// mistaken for one in the Float4 overload.
	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
	return { 0.0f };
#endif
}
RValue<Float> Sqrt(RValue<Float> x) RValue<Float> Sqrt(RValue<Float> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include "Reactor.hpp" #include "Reactor.hpp"
#include "CPUID.hpp"
#include "Debug.hpp" #include "Debug.hpp"
#include "Print.hpp" #include "Print.hpp"
...@@ -4659,4 +4660,56 @@ int DebugPrintf(const char *format, ...) ...@@ -4659,4 +4660,56 @@ int DebugPrintf(const char *format, ...)
#endif // ENABLE_RR_PRINT #endif // ENABLE_RR_PRINT
// Functions implemented by backends
// HasRcpApprox() reports whether the RcpApprox() overloads may be called;
// DoRcp() below consults it before using RcpApprox().
bool HasRcpApprox();
// Approximate reciprocal of x; exactAtPow2 defaults to false.
RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2 = false);
RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2 = false);
// Shared implementation of Rcp() for the Float and Float4 overloads.
//
// x           - value whose reciprocal is computed.
// p           - requested precision. With Precision::Full, an approximate
//               reciprocal (when one is used) is refined by one
//               Newton-Raphson step; otherwise it is returned unrefined.
// finite      - when true, the result is limited from above to the largest
//               finite single-precision value.
// exactAtPow2 - forwarded to RcpApprox() when the approximation is used.
//
// Falls back to a plain 1 / x division when the backend has no approximate
// reciprocal, or on x86 when full precision is requested.
template<typename T>
static RValue<T> DoRcp(RValue<T> x, Precision p, bool finite, bool exactAtPow2)
{
#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/x is fast enough, except for lower precision
	const bool approx = HasRcpApprox() && (p != Precision::Full);
#else
	const bool approx = HasRcpApprox();
#endif

	T rcp;

	if(approx)
	{
		rcp = RcpApprox(x, exactAtPow2);

		if(p == Precision::Full)
		{
			// Perform one more iteration of Newton-Raphson division to increase precision
			rcp = (rcp + rcp) - (x * rcp * rcp);
		}
	}
	else
	{
		rcp = T(1.0f) / x;
	}

	if(finite)
	{
		// Largest finite single-precision value (bit pattern 0x7F7FFFFF, i.e. FLT_MAX).
		// Spelled as a float literal rather than reinterpreting an int through a
		// reference cast, which would violate strict aliasing.
		constexpr float maxFinite = 3.402823466e+38f;
		rcp = Min(rcp, T(maxFinite));
	}

	return rcp;
}
// Reciprocal of a Float4 at the requested precision.
// All arguments are forwarded unchanged to DoRcp(), which holds the logic.
RValue<Float4> Rcp(RValue<Float4> x, Precision p, bool finite, bool exactAtPow2)
{
RR_DEBUG_INFO_UPDATE_LOC();
return DoRcp(x, p, finite, exactAtPow2);
}
// Reciprocal of a scalar Float at the requested precision.
// All arguments are forwarded unchanged to DoRcp(), which holds the logic.
RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)
{
RR_DEBUG_INFO_UPDATE_LOC();
return DoRcp(x, p, finite, exactAtPow2);
}
} // namespace rr } // namespace rr
...@@ -66,6 +66,16 @@ int DebugPrintf(const char *format, ...); ...@@ -66,6 +66,16 @@ int DebugPrintf(const char *format, ...);
namespace rr { namespace rr {
// Precision level requested from a math operation (e.g. the `p` argument of Rcp()).
// Only Full and Relaxed are enabled; Exact and Half are commented-out placeholders.
// These generally map to the precision types as specified by the Vulkan specification.
// See https://www.khronos.org/registry/vulkan/specs/1.2/html/chap37.html#spirvenv-precision-operation
enum class Precision
{
/*Exact,*/ // 0 ULP with correct rounding (i.e. Math.h)
Full, // Single precision, but not relaxed
Relaxed, // Single precision, relaxed
/*Half,*/ // Half precision
};
std::string BackendName(); std::string BackendName();
struct Capabilities struct Capabilities
...@@ -2155,8 +2165,11 @@ RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs); ...@@ -2155,8 +2165,11 @@ RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
RValue<Float> Abs(RValue<Float> x); RValue<Float> Abs(RValue<Float> x);
RValue<Float> Max(RValue<Float> x, RValue<Float> y); RValue<Float> Max(RValue<Float> x, RValue<Float> y);
RValue<Float> Min(RValue<Float> x, RValue<Float> y); RValue<Float> Min(RValue<Float> x, RValue<Float> y);
// Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false); RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
RValue<Float> RcpSqrt_pp(RValue<Float> val); RValue<Float> RcpSqrt_pp(RValue<Float> val);
RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float> Sqrt(RValue<Float> x); RValue<Float> Sqrt(RValue<Float> x);
// RValue<Int4> IsInf(RValue<Float> x); // RValue<Int4> IsInf(RValue<Float> x);
...@@ -2319,8 +2332,12 @@ RValue<Float4> operator-(RValue<Float4> val); ...@@ -2319,8 +2332,12 @@ RValue<Float4> operator-(RValue<Float4> val);
RValue<Float4> Abs(RValue<Float4> x); RValue<Float4> Abs(RValue<Float4> x);
RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y); RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y); RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
// Deprecated: use Rcp
// TODO(b/147516027): Remove when GLES frontend is removed
RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false); RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
RValue<Float4> RcpSqrt_pp(RValue<Float4> val); RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
RValue<Float4> Sqrt(RValue<Float4> x); RValue<Float4> Sqrt(RValue<Float4> x);
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i); RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i); RValue<Float> Extract(RValue<Float4> x, int i);
...@@ -2372,13 +2389,6 @@ RValue<Float4> Frac(RValue<Float4> x); ...@@ -2372,13 +2389,6 @@ RValue<Float4> Frac(RValue<Float4> x);
RValue<Float4> Floor(RValue<Float4> x); RValue<Float4> Floor(RValue<Float4> x);
RValue<Float4> Ceil(RValue<Float4> x); RValue<Float4> Ceil(RValue<Float4> x);
// Selects the precision of an operation: Full or Relaxed.
// A Half option is present only as a commented-out placeholder.
enum class Precision
{
Full,
Relaxed,
//Half,
};
// Trigonometric functions // Trigonometric functions
// TODO: Currently unimplemented for Subzero. // TODO: Currently unimplemented for Subzero.
RValue<Float4> Sin(RValue<Float4> x); RValue<Float4> Sin(RValue<Float4> x);
......
...@@ -3932,6 +3932,26 @@ RValue<Float4> RcpSqrt_pp(RValue<Float4> x) ...@@ -3932,6 +3932,26 @@ RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
return Rcp_pp(Sqrt(x)); return Rcp_pp(Sqrt(x));
} }
// Reports whether this backend provides RcpApprox(); it currently never does.
bool HasRcpApprox()
{
	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
	constexpr bool hasApproxIntrinsic = false;
	return hasApproxIntrinsic;
}
// Placeholder for the Float4 approximate reciprocal in this backend.
// HasRcpApprox() returns false here, so this is not expected to be called;
// both parameters are ignored.
RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)
{
// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
UNREACHABLE("RValue<Float4> RcpApprox()");
// NOTE(review): presumably this return only satisfies the signature — confirm UNREACHABLE's behavior.
return { 0.0f };
}
// Placeholder for the scalar Float approximate reciprocal in this backend.
// HasRcpApprox() returns false here, so this is not expected to be called;
// both parameters are ignored.
RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
{
// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
UNREACHABLE("RValue<Float> RcpApprox()");
// NOTE(review): presumably this return only satisfies the signature — confirm UNREACHABLE's behavior.
return { 0.0f };
}
RValue<Float4> Sqrt(RValue<Float4> x) RValue<Float4> Sqrt(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment