Optimize reciprocal operation

This change deprecates rr::Rcp_pp in favor of rr::Rcp, which makes sure to correctly compute the reciprocal using the Newton-Raphson refinement only if the current target supports the required intrinsic, otherwise using 1 / x. Currently, only LLVM on Intel will use NR. Note that passing in Precision::Relaxed will produce a faster, but less precise reciprocal. Also removed PixelProgram::linearToSRGB as it's unused. Bug: b/169760262 Bug: b/149574741 Change-Id: I4a2f943aa60116c4397d7a8ae18583a260824788 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/50648 Reviewed-by: Alexis Hétu <sugoi@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Tested-by: Antonio Maiorano <amaiorano@google.com> Commit-Queue: Antonio Maiorano <amaiorano@google.com>

Optimize reciprocal operation
d1561871 · Antonio Maiorano · swiftshader-scoped@luci-project-accounts.iam.gserviceaccount.com · 1ca6504e · d1561871 · d1561871
Commit d1561871 authored Dec 14, 2020 by Antonio Maiorano Committed by swiftshader-scoped@luci-project-accounts.iam.gserviceaccount.com Dec 17, 2020
9 changed files
--- a/src/Pipeline/PixelProgram.cpp
+++ b/src/Pipeline/PixelProgram.cpp
@@ -373,12 +373,4 @@ void PixelProgram::clampColor(Vector4f oC[RENDERTARGETS])
 	}
 }
-Float4 PixelProgram::linearToSRGB(const Float4 &x)  // Approximates x^(1.0/2.2)
-{
-	Float4 sqrtx = Rcp_pp(RcpSqrt_pp(x));
-	Float4 sRGB = sqrtx * Float4(1.14f) - x * Float4(0.14f);
-	return Min(Max(sRGB, Float4(0.0f)), Float4(1.0f));
-}
 }  // namespace sw
--- a/src/Pipeline/PixelProgram.hpp
+++ b/src/Pipeline/PixelProgram.hpp
@@ -48,7 +48,6 @@ private:
 	Int4 maskAny(Int cMask[4]) const;
 	Int4 maskAny(Int cMask[4], Int sMask[4], Int zMask[4]) const;
-	Float4 linearToSRGB(const Float4 &x);
 };
 }  // namespace sw

--- a/src/Pipeline/PixelRoutine.cpp
+++ b/src/Pipeline/PixelRoutine.cpp
@@ -133,7 +133,7 @@ void PixelRoutine::quad(Pointer<Byte> cBuffer[RENDERTARGETS], Pointer<Byte> &zBu
 				WWWW += *Pointer<Float4>(constants + OFFSET(Constants, weight) + 16 * cMask[q]);
 			}
-			WWWW = Rcp_pp(WWWW);
+			WWWW = Rcp(WWWW, Precision::Relaxed);
 			XXXX *= WWWW;
 			YYYY *= WWWW;

--- a/src/Pipeline/SamplerCore.cpp
+++ b/src/Pipeline/SamplerCore.cpp
@@ -1226,10 +1226,10 @@ void SamplerCore::computeLod2D(Pointer<Byte> &texture, Float &lod, Float &anisot
 		uDelta = As<Float4>((As<Int4>(dudx) & mask) | ((As<Int4>(dudy) & ~mask)));
 		vDelta = As<Float4>((As<Int4>(dvdx) & mask) | ((As<Int4>(dvdy) & ~mask)));
-		anisotropy = lod * Rcp_pp(det);
+		anisotropy = lod * Rcp(det, Precision::Relaxed);
 		anisotropy = Min(anisotropy, state.maxAnisotropy);
-		lod *= Rcp_pp(anisotropy * anisotropy);
+		lod *= Rcp(anisotropy * anisotropy, Precision::Relaxed);
 	}
 	lod = log2sqrt(lod);  // log2(sqrt(lod))

--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -224,22 +224,7 @@ Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp)
 Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
 {
-	//return Float4(1.0f) / x;
+	return Rcp(x, pp ? Precision::Relaxed : Precision::Full, finite, exactAtPow2);
-	Float4 rcp = Rcp_pp(x, exactAtPow2);
-	if(!pp)
-	{
-		rcp = (rcp + rcp) - (x * rcp * rcp);
-	}
-	if(finite)
-	{
-		int big = 0x7F7FFFFF;
-		rcp = Min(rcp, Float4((float &)big));
-	}
-	return rcp;
 }
 Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)

--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2865,6 +2865,47 @@ RValue<Float> RcpSqrt_pp(RValue<Float> x)
 #endif
 }
+bool HasRcpApprox()  // LLVM backend: reports whether RcpApprox() is usable in this build
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return true;  // x86 / x86-64 builds use the rcpps / rcpss paths in RcpApprox()
+#else
+	return false;  // On other targets RcpApprox() is UNREACHABLE
+#endif
+}
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)  // LLVM backend: approximate reciprocal of a Float4 via x86::rcpps
+{
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpps uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse
+		// of the approximation of 1.0f, which is evaluated on the host when this code runs.
+		return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpps(x);  // Raw approximation, no correction factor
+#else
+	UNREACHABLE("RValue<Float4> RcpApprox() not available on this platform");
+	return { 0.0f };  // Value returned after the UNREACHABLE report on non-x86 builds
+#endif
+}
+// LLVM backend: approximate reciprocal of a scalar Float using the x86 rcpss intrinsic.
+// x           - value whose reciprocal is approximated.
+// exactAtPow2 - when true, the result is rescaled by 1 / rcp(1.0f) so that the
+//               approximation of 1.0f yields exactly 1.0f.
+// Only available on x86 / x86-64 builds; other targets report UNREACHABLE.
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	if(exactAtPow2)
+	{
+		// rcpss uses a piecewise-linear approximation which minimizes the relative error
+		// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
+		return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
+	}
+	return x86::rcpss(x);
+#else
+	// Name the scalar overload here so the diagnostic identifies the right function.
+	UNREACHABLE("RValue<Float> RcpApprox() not available on this platform");
+	return { 0.0f };
+#endif
+}
 RValue<Float> Sqrt(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -14,6 +14,7 @@
 #include "Reactor.hpp"
+#include "CPUID.hpp"
 #include "Debug.hpp"
 #include "Print.hpp"
@@ -4659,4 +4660,56 @@ int DebugPrintf(const char *format, ...)
 #endif  // ENABLE_RR_PRINT
+// Functions implemented by backends
+bool HasRcpApprox();
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2 = false);
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2 = false);
+template<typename T>
+static RValue<T> DoRcp(RValue<T> x, Precision p, bool finite, bool exactAtPow2)  // Shared implementation behind both Rcp() overloads
+{
+#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/x is fast enough, except for lower precision
+	bool approx = HasRcpApprox() && (p != Precision::Full);
+#else
+	bool approx = HasRcpApprox();  // Elsewhere, use the approximation whenever the backend provides one
+#endif
+	T rcp;
+	if(approx)
+	{
+		rcp = RcpApprox(x, exactAtPow2);
+		if(p == Precision::Full)  // Never true on x86 builds, where approx already implies p != Precision::Full
+		{
+			// Perform one more iteration of Newton-Raphson division to increase precision
+			rcp = (rcp + rcp) - (x * rcp * rcp);
+		}
+	}
+	else
+	{
+		rcp = T(1.0f) / x;  // Plain division fallback
+	}
+	if(finite)  // Caller asked for the result to be capped at the largest finite float
+	{
+		constexpr int big = 0x7F7FFFFF;  // Bit pattern of the largest finite single-precision float
+		rcp = Min(rcp, T((float &)big));  // Reinterprets the integer bits as a float constant
+	}
+	return rcp;
+}
+RValue<Float4> Rcp(RValue<Float4> x, Precision p, bool finite, bool exactAtPow2)  // Float4 reciprocal; forwards to DoRcp
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return DoRcp(x, p, finite, exactAtPow2);  // All precision / clamping logic lives in DoRcp
+}
+RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)  // Scalar Float reciprocal; forwards to DoRcp
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	return DoRcp(x, p, finite, exactAtPow2);  // All precision / clamping logic lives in DoRcp
+}
 }  // namespace rr
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -66,6 +66,16 @@ int DebugPrintf(const char *format, ...);
 namespace rr {
+// These generally map to the precision types as specified by the Vulkan specification.
+// See https://www.khronos.org/registry/vulkan/specs/1.2/html/chap37.html#spirvenv-precision-operation
+enum class Precision  // Accuracy level requested from math routines such as Rcp()
+{
+	/*Exact,*/  // 0 ULP with correct rounding (i.e. Math.h); commented out, not currently selectable
+	Full,       // Single precision, but not relaxed
+	Relaxed,    // Single precision, relaxed
+	/*Half,*/   // Half precision; commented out, not currently selectable
+};
 std::string BackendName();
 struct Capabilities
@@ -2155,8 +2165,11 @@ RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs);
 RValue<Float> Abs(RValue<Float> x);
 RValue<Float> Max(RValue<Float> x, RValue<Float> y);
 RValue<Float> Min(RValue<Float> x, RValue<Float> y);
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
 RValue<Float> RcpSqrt_pp(RValue<Float> val);
+RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
 RValue<Float> Sqrt(RValue<Float> x);
 //	RValue<Int4> IsInf(RValue<Float> x);
@@ -2319,8 +2332,12 @@ RValue<Float4> operator-(RValue<Float4> val);
 RValue<Float4> Abs(RValue<Float4> x);
 RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
 RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
+// Deprecated: use Rcp
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
 RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
+RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
 RValue<Float4> Sqrt(RValue<Float4> x);
 RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
 RValue<Float> Extract(RValue<Float4> x, int i);
@@ -2372,13 +2389,6 @@ RValue<Float4> Frac(RValue<Float4> x);
 RValue<Float4> Floor(RValue<Float4> x);
 RValue<Float4> Ceil(RValue<Float4> x);
-enum class Precision
-{
-	Full,
-	Relaxed,
-	//Half,
-};
 // Trigonometric functions
 // TODO: Currently unimplemented for Subzero.
 RValue<Float4> Sin(RValue<Float4> x);

--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3932,6 +3932,26 @@ RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
 	return Rcp_pp(Sqrt(x));
 }
+bool HasRcpApprox()  // Subzero backend: no approximate reciprocal is available yet
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	return false;
+}
+RValue<Float4> RcpApprox(RValue<Float4> x, bool exactAtPow2)  // Subzero backend stub: both arguments are ignored
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float4> RcpApprox()");
+	return { 0.0f };  // Value returned after the UNREACHABLE report
+}
+RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)  // Subzero backend stub: both arguments are ignored
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float> RcpApprox()");
+	return { 0.0f };  // Value returned after the UNREACHABLE report
+}
 RValue<Float4> Sqrt(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();