Optimize reciprocal sqrt operation

This change deprecates rr::RcpSqrt_pp in favor of rr::RcpSqrt. As with Rcp, RcpSqrt computes the result using Newton-Raphson if it's faster and the initial approximation intrinsic is available on the current target. Currently, only LLVM on Intel will use NR for RelaxedPrecision. Note that passing in Precision::Relaxed will produce a faster, but less precise reciprocal sqrt. Also made it so that the SpirvShader instruction GLSLstd450InverseSqrt now invokes RcpSqrt(x, Precision::Full) instead of performing 1/sqrt(x). Note that the Vulkan spec states that inversesqrt()'s precision is 2 ULP, and sqrt()'s precision is inherited from 1.0 / inversesqrt(); however, our rr::Sqrt is implemented in terms of x86's sqrt intrinsic on x86, or as calls to sqrt from math.h. Bug: b/169760262 Change-Id: I65ba9a64d1db934c523dda11c1a2c186059d220b Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51268 Commit-Queue: Antonio Maiorano <amaiorano@google.com> Kokoro-Result: kokoro <noreply+kokoro@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Tested-by: Antonio Maiorano <amaiorano@google.com>

Optimize reciprocal sqrt operation
1cc5b335 · Antonio Maiorano · swiftshader-scoped@luci-project-accounts.iam.gserviceaccount.com · d1561871 · 1cc5b335 · 1cc5b335
Commit 1cc5b335 authored Dec 14, 2020 by Antonio Maiorano Committed by swiftshader-scoped@luci-project-accounts.iam.gserviceaccount.com Dec 17, 2020
6 changed files
--- a/src/Pipeline/ShaderCore.cpp
+++ b/src/Pipeline/ShaderCore.cpp
@@ -236,25 +236,7 @@ Float4 reciprocalSquareRoot(RValue<Float4> x, bool absolute, bool pp)
 		abs = Abs(abs);
 	}
-	Float4 rsq;
+	return RcpSqrt(abs, pp ? Precision::Relaxed : Precision::Full);  // 1/sqrt(abs), not 1/abs; pp selects the faster, less precise path
-	if(!pp)
-	{
-		rsq = Float4(1.0f) / Sqrt(abs);
-	}
-	else
-	{
-		rsq = RcpSqrt_pp(abs);
-		if(!pp)
-		{
-			rsq = rsq * (Float4(3.0f) - rsq * rsq * abs) * Float4(0.5f);
-		}
-		rsq = As<Float4>(CmpNEQ(As<Int4>(abs), Int4(0x7F800000)) & As<Int4>(rsq));
-	}
-	return rsq;
 }
 Float4 modulo(RValue<Float4> x, RValue<Float4> y)

--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -750,19 +750,10 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt
 			auto val = Operand(this, state, insn.word(5));
 			Decorations d;
 			ApplyDecorationsForId(&d, insn.word(5));
-			if(d.RelaxedPrecision)
-			{
+			for(auto i = 0u; i < type.componentCount; i++)
-				for(auto i = 0u; i < type.componentCount; i++)
-				{
-					dst.move(i, RcpSqrt_pp(val.Float(i)));
-				}
-			}
-			else
 			{
-				for(auto i = 0u; i < type.componentCount; i++)
+				dst.move(i, RcpSqrt(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
-				{
-					dst.move(i, SIMD::Float(1.0f) / Sqrt(val.Float(i)));
-				}
 			}
 			break;
 		}

--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2906,6 +2906,35 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
 #endif
 }
+bool HasRcpSqrtApprox()  // Reports whether this backend provides RcpSqrtApprox() for the build target
+{
+#if defined(__i386__) || defined(__x86_64__)  // x86 builds: RcpSqrtApprox() forwards to x86::rsqrtps / x86::rsqrtss
+	return true;
+#else  // every other target: RcpSqrtApprox() is UNREACHABLE
+	return false;
+#endif
+}
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)  // Float4 reciprocal-sqrt approximation; implemented for x86 builds only
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtps(x);  // forwards to the x86 rsqrtps wrapper
+#else
+	UNREACHABLE("RValue<Float4> RcpSqrtApprox() not available on this platform");  // callers are expected to check HasRcpSqrtApprox() first
+	return { 0.0f };  // placeholder value after UNREACHABLE
+#endif
+}
+RValue<Float> RcpSqrtApprox(RValue<Float> x)  // Scalar reciprocal-sqrt approximation; implemented for x86 builds only
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return x86::rsqrtss(x);  // forwards to the x86 rsqrtss wrapper
+#else
+	UNREACHABLE("RValue<Float> RcpSqrtApprox() not available on this platform");  // message names the scalar overload
+	return { 0.0f };  // placeholder value after UNREACHABLE
+#endif
+}
 RValue<Float> Sqrt(RValue<Float> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

--- a/src/Reactor/Reactor.cpp
+++ b/src/Reactor/Reactor.cpp
@@ -4712,4 +4712,69 @@ RValue<Float> Rcp(RValue<Float> x, Precision p, bool finite, bool exactAtPow2)
 	return DoRcp(x, p, finite, exactAtPow2);
 }
+// Functions implemented by backends
+bool HasRcpSqrtApprox();
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x);
+RValue<Float> RcpSqrtApprox(RValue<Float> x);
+template<typename T>
+struct CastToIntType;  // Maps a Reactor float type to the integer type used when reinterpreting it with As<>; only the specializations below are defined
+template<>
+struct CastToIntType<Float4>
+{
+	using type = Int4;  // Float4 is reinterpreted as Int4
+};
+template<>
+struct CastToIntType<Float>
+{
+	using type = Int;  // Float is reinterpreted as Int
+};
+// TODO: move to Reactor.hpp?
+RValue<Int> CmpNEQ(RValue<Int> x, RValue<Int> y)  // Scalar Int overload so the DoRcpSqrt template can build a bit mask for Float as well as Float4
+{
+	return IfThenElse(x != y, Int(~0), Int(0));  // all bits set when x != y, zero otherwise
+}
+// Shared implementation of RcpSqrt for Float and Float4.
+// Uses the backend's reciprocal-sqrt approximation when HasRcpSqrtApprox() reports it is
+// available and it is selected for precision p; otherwise computes 1 / Sqrt(x).
+template<typename T>
+static RValue<T> DoRcpSqrt(RValue<T> x, Precision p)
+{
+#if defined(__i386__) || defined(__x86_64__)  // On x86, 1/sqrt(x) is fast enough, except for lower precision
+	bool approx = HasRcpSqrtApprox() && (p != Precision::Full);
+#else
+	bool approx = HasRcpSqrtApprox();
+#endif
+	if(approx)
+	{
+		using IntType = typename CastToIntType<T>::type;
+		T rsq = RcpSqrtApprox(x);
+		if(p == Precision::Full)
+		{
+			// One Newton-Raphson refinement step: r' = r * (3 - r * r * x) / 2
+			rsq = rsq * (T(3.0f) - rsq * rsq * x) * T(0.5f);
+			// Force the result to zero where x has the bit pattern 0x7F800000 (IEEE-754 +infinity);
+			// presumably the refinement step would otherwise produce NaN there.
+			rsq = As<T>(CmpNEQ(As<IntType>(x), IntType(0x7F800000)) & As<IntType>(rsq));
+		}
+		return rsq;
+	}
+	else
+	{
+		return T(1.0f) / Sqrt(x);
+	}
+}
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p)  // Float4 entry point for the reciprocal square root at precision p
+{
+	return DoRcpSqrt(x, p);  // forwards to the shared template implementation
+}
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p)  // Scalar entry point for the reciprocal square root at precision p
+{
+	return DoRcpSqrt(x, p);  // forwards to the shared template implementation
+}
 }  // namespace rr
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2168,8 +2168,11 @@ RValue<Float> Min(RValue<Float> x, RValue<Float> y);
 // Deprecated: use Rcp
 // TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float> RcpSqrt_pp(RValue<Float> val);
 RValue<Float> Rcp(RValue<Float> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+RValue<Float> RcpSqrt(RValue<Float> x, Precision p = Precision::Full);
 RValue<Float> Sqrt(RValue<Float> x);
 //	RValue<Int4> IsInf(RValue<Float> x);
@@ -2336,8 +2339,11 @@ RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
 // Deprecated: use Rcp
 // TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
+// Deprecated: use RcpSqrt
+// TODO(b/147516027): Remove when GLES frontend is removed
 RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
 RValue<Float4> Rcp(RValue<Float4> x, Precision p = Precision::Full, bool finite = false, bool exactAtPow2 = false);
+RValue<Float4> RcpSqrt(RValue<Float4> x, Precision p = Precision::Full);
 RValue<Float4> Sqrt(RValue<Float4> x);
 RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
 RValue<Float> Extract(RValue<Float4> x, int i);

--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3952,6 +3952,25 @@ RValue<Float> RcpApprox(RValue<Float> x, bool exactAtPow2)
 	return { 0.0f };
 }
+bool HasRcpSqrtApprox()  // Subzero backend: no reciprocal-sqrt approximation is provided
+{
+	return false;  // RcpSqrtApprox() in this backend is UNREACHABLE
+}
+RValue<Float4> RcpSqrtApprox(RValue<Float4> x)  // Float4 overload; not implemented in this backend
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float4> RcpSqrtApprox()");  // HasRcpSqrtApprox() returns false here, so callers should not reach this
+	return { 0.0f };  // placeholder value after UNREACHABLE
+}
+RValue<Float> RcpSqrtApprox(RValue<Float> x)  // Scalar overload; not implemented in this backend
+{
+	// TODO(b/175612820): Update once we implement x86 SSE rcp_ss and rsqrt_ss intrinsics in Subzero
+	UNREACHABLE("RValue<Float> RcpSqrtApprox()");  // HasRcpSqrtApprox() returns false here, so callers should not reach this
+	return { 0.0f };  // placeholder value after UNREACHABLE
+}
 RValue<Float4> Sqrt(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();