Optimize transcendentals for Subzero

With this change, we can now select implementations of most transcendentals from either the "emulated" or "optimal" namespaces. The emulated versions generally call the math.h standard function on each component for vector types, while the optimal versions typically implement some approximation in Reactor to produce vectorized code.

Most of the optimal versions were taken directly from ShaderCore.cpp, except for Asin, for which I implemented an 8-term approximation.

The new versions are faster, and pass all deqp precision tests. Here's a table of benchmarks that show the performance improvements that were made. Note that Asin and Acos now take a Precision parameter for Full and Relaxed precision:

                     Before     After
rr_Sin               48.6 ns    10.6 ns
rr_Cos               67.1 ns    9.62 ns
rr_Tan               75.5 ns    19.4 ns
rr_Asin_fullp        24.2 ns    23.0 ns
rr_Asin_relaxedp     N/A        9.31 ns
rr_Acos_fullp        14.3 ns    6.35 ns
rr_Acos_relaxedp     N/A        4.56 ns
rr_Atan              66.8 ns    12.9 ns
rr_Sinh              79.7 ns    11.5 ns
rr_Cosh              80.1 ns    11.5 ns
rr_Tanh              62.9 ns    12.1 ns
rr_Asinh             104 ns     9.44 ns
rr_Acosh             14.4 ns    10.2 ns
rr_Atanh             170 ns     9.81 ns
rr_Atan2             73.5 ns    22.8 ns
rr_Pow               87.9 ns    16.3 ns
rr_Exp               40.2 ns    5.72 ns
rr_Log               44.0 ns    7.35 ns
rr_Exp2              101 ns     5.38 ns
rr_Log2              106 ns     9.24 ns

Bug: b/147818976
Change-Id: I791893bd9f005dbbae4770fb474de338a04845be
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48588
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>

Optimize transcendentals for Subzero
9c14bda0 · Antonio Maiorano · b042f4e7 · 9c14bda0 · 9c14bda0 · 9c14bda0
Commit 9c14bda0 authored Sep 18, 2020 by Antonio Maiorano
12 changed files
--- a/src/Android.bp
+++ b/src/Android.bp
@@ -192,10 +192,11 @@ cc_defaults {
    srcs: [
        "Reactor/CPUID.cpp",
        "Reactor/Debug.cpp",
-        "Reactor/EmulatedReactor.cpp",
+        "Reactor/EmulatedIntrinsics.cpp",
        "Reactor/ExecutableMemory.cpp",
        "Reactor/LLVMJIT.cpp",
        "Reactor/LLVMReactor.cpp",
+        "Reactor/OptimalIntrinsics.cpp",
        "Reactor/Reactor.cpp",
    ],
@@ -223,10 +224,11 @@ cc_defaults {
    srcs: [
        "Reactor/CPUID.cpp",
        "Reactor/Debug.cpp",
-        "Reactor/EmulatedReactor.cpp",
+        "Reactor/EmulatedIntrinsics.cpp",
        "Reactor/ExecutableMemory.cpp",
        "Reactor/LLVMJIT.cpp",
        "Reactor/LLVMReactor.cpp",
+        "Reactor/OptimalIntrinsics.cpp",
        "Reactor/Reactor.cpp",
    ],

--- a/src/Pipeline/SpirvShaderGLSLstd450.cpp
+++ b/src/Pipeline/SpirvShaderGLSLstd450.cpp
@@ -598,18 +598,22 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt
 		case GLSLstd450Asin:
 		{
 			auto val = Operand(this, state, insn.word(5));
+			Decorations d;
+			ApplyDecorationsForId(&d, insn.word(5));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, Asin(val.Float(i)));
+				dst.move(i, Asin(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
 			}
 			break;
 		}
 		case GLSLstd450Acos:
 		{
 			auto val = Operand(this, state, insn.word(5));
+			Decorations d;
+			ApplyDecorationsForId(&d, insn.word(5));
 			for(auto i = 0u; i < type.componentCount; i++)
 			{
-				dst.move(i, Acos(val.Float(i)));
+				dst.move(i, Acos(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
 			}
 			break;
 		}

--- a/src/Reactor/BUILD.gn
+++ b/src/Reactor/BUILD.gn
@@ -47,8 +47,9 @@ config("swiftshader_reactor_private_config") {
 swiftshader_source_set("swiftshader_reactor_base") {
  sources = [
    "Debug.cpp",
-    "EmulatedReactor.cpp",
+    "EmulatedIntrinsics.cpp",
    "ExecutableMemory.cpp",
+    "OptimalIntrinsics.cpp",
    "Reactor.cpp",
  ]
 }

--- a/src/Reactor/CMakeLists.txt
+++ b/src/Reactor/CMakeLists.txt
@@ -20,10 +20,13 @@ set(ROOT_PROJECT_COMPILE_OPTIONS
 set(REACTOR_SRC_FILES
    Debug.cpp
    Debug.hpp
-    EmulatedReactor.cpp
+    EmulatedIntrinsics.cpp
+    EmulatedIntrinsics.hpp
    ExecutableMemory.cpp
    ExecutableMemory.hpp
    Nucleus.hpp
+    OptimalIntrinsics.cpp
+    OptimalIntrinsics.hpp
    Print.hpp
    Reactor.cpp
    Reactor.hpp

--- a/src/Reactor/EmulatedReactor.cpp
+++ b/src/Reactor/EmulatedReactor.cpp
@@ -12,7 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "EmulatedReactor.hpp"
+#include "EmulatedIntrinsics.hpp"
 #include <algorithm>
 #include <cmath>

--- a/src/Reactor/EmulatedReactor.hpp
+++ b/src/Reactor/EmulatedReactor.hpp
@@ -14,7 +14,7 @@
 #include "Reactor.hpp"
-// Implementation of Reactor functions that are "emulated" - that is,
+// Implementation of intrinsics that are "emulated" - that is,
 // implemented either in terms of Reactor code, or make use of
 // rr::Call to C functions. These are typically slower than implementing
 // in terms of direct calls to the JIT backend; however, provide a good

--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -16,7 +16,7 @@
 #include "CPUID.hpp"
 #include "Debug.hpp"
-#include "EmulatedReactor.hpp"
+#include "EmulatedIntrinsics.hpp"
 #include "LLVMReactorDebugInfo.hpp"
 #include "Print.hpp"
 #include "Reactor.hpp"
@@ -3220,13 +3220,13 @@ static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *na
 	return RValue<Float4>(V(out));
 }
-RValue<Float4> Asin(RValue<Float4> v)
+RValue<Float4> Asin(RValue<Float4> v, Precision p)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
 	return TransformFloat4PerElement(v, "asinf");
 }
-RValue<Float4> Acos(RValue<Float4> v)
+RValue<Float4> Acos(RValue<Float4> v, Precision p)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
 	return TransformFloat4PerElement(v, "acosf");

--- a/src/Reactor/OptimalIntrinsics.cpp
+++ b/src/Reactor/OptimalIntrinsics.cpp
+// Copyright 2020 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "OptimalIntrinsics.hpp"
+namespace rr {
+namespace {
+// Approximate per-lane reciprocal of x.
+//   pp          - when false, the initial Rcp_pp estimate is refined with one Newton-Raphson step.
+//   finite      - when true, the result is clamped to the largest finite float.
+//   exactAtPow2 - forwarded unchanged to Rcp_pp (semantics defined there).
+Float4 Reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false)
+{
+	Float4 result = Rcp_pp(x, exactAtPow2);
+	if(!pp)
+	{
+		// Newton-Raphson refinement: r' = 2r - x * r * r
+		result = (result + result) - (x * result * result);
+	}
+	if(finite)
+	{
+		// 0x7F7FFFFF is the bit pattern of the largest finite single-precision value.
+		int maxFiniteBits = 0x7F7FFFFF;
+		result = Min(result, Float4((float &)maxFiniteBits));
+	}
+	return result;
+}
+Float4 SinOrCos(RValue<Float4> x, bool sin)  // Shared sine/cosine approximation: returns the sine expression when 'sin' is true, the cosine expression otherwise.
+{
+	// Reduce to [-0.5, 0.5] range: scale x by 1/(2*pi), then subtract Round(y) (presumably the nearest integer - confirm against Round).
+	Float4 y = x * Float4(1.59154943e-1f);  // 1/2pi
+	y = y - Round(y);
+	// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
+	// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations:
+	// !pp : 17 mul, 7 add, 1 sub, 1 reciprocal
+	//  pp : 4 mul, 2 add, 2 abs
+	Float4 y2 = y * y;
+	Float4 c1 = y2 * (y2 * (y2 * Float4(-0.0204391631f) + Float4(0.2536086171f)) + Float4(-1.2336977925f)) + Float4(1.0f);  // Even polynomial in y (cosine-like term).
+	Float4 s1 = y * (y2 * (y2 * (y2 * Float4(-0.0046075748f) + Float4(0.0796819754f)) + Float4(-0.645963615f)) + Float4(1.5707963235f));  // Odd polynomial in y (sine-like term).
+	Float4 c2 = (c1 * c1) - (s1 * s1);  // Double-angle form: c^2 - s^2
+	Float4 s2 = Float4(2.0f) * s1 * c1;  // Double-angle form: 2 * s * c
+	Float4 r = Reciprocal(s2 * s2 + c2 * c2, false, true, false);  // 1 / (s2^2 + c2^2): refined (pp = false) and clamped finite (finite = true).
+	if(sin)
+	{
+		return Float4(2.0f) * s2 * c2 * r;  // Second doubling of the sine term, normalized by r.
+	}
+	else
+	{
+		return ((c2 * c2) - (s2 * s2)) * r;  // Second doubling of the cosine term, normalized by r.
+	}
+}
+// Polynomial approximation of atan(x), intended for x in [0..1].
+Float4 Atan_01(Float4 x)
+{
+	// Coefficients from 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+	const Float4 c2(-0.3333314528f);
+	const Float4 c4(0.1999355085f);
+	const Float4 c6(-0.1420889944f);
+	const Float4 c8(0.1065626393f);
+	const Float4 c10(-0.0752896400f);
+	const Float4 c12(0.0429096138f);
+	const Float4 c14(-0.0161657367f);
+	const Float4 c16(0.0028662257f);
+	Float4 xx = x * x;
+	// Horner evaluation in xx, innermost term first.
+	Float4 poly = c14 + xx * c16;
+	poly = c12 + xx * poly;
+	poly = c10 + xx * poly;
+	poly = c8 + xx * poly;
+	poly = c6 + xx * poly;
+	poly = c4 + xx * poly;
+	poly = c2 + xx * poly;
+	// atan(x) ~= x + x * (xx * poly)
+	return x + x * (xx * poly);
+}
+}  // namespace
+namespace optimal {
+// Vectorized sine approximation; selects the sine branch of SinOrCos.
+Float4 Sin(RValue<Float4> x)
+{
+	const bool wantSine = true;
+	return SinOrCos(x, wantSine);
+}
+// Vectorized cosine approximation; selects the cosine branch of SinOrCos.
+Float4 Cos(RValue<Float4> x)
+{
+	const bool wantSine = false;
+	return SinOrCos(x, wantSine);
+}
+// Vectorized tangent approximation, computed as sine divided by cosine.
+Float4 Tan(RValue<Float4> x)
+{
+	Float4 sine = SinOrCos(x, true);
+	Float4 cosine = SinOrCos(x, false);
+	return sine / cosine;
+}
+Float4 Asin_4_terms(RValue<Float4> x)  // Arcsine via a 4-coefficient polynomial; fewer terms than Asin_8_terms.
+{
+	// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+	// |e(x)| <= 5e-5
+	const Float4 half_pi(1.57079632f);
+	const Float4 a0(1.5707288f);
+	const Float4 a1(-0.2121144f);
+	const Float4 a2(0.0742610f);
+	const Float4 a3(-0.0187293f);
+	Float4 absx = Abs(x);
+	return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^  // pi/2 - sqrt(1 - |x|) * P(|x|), evaluated on |x| ...
+	                  (As<Int4>(x) & Int4(0x80000000)));  // ... then XOR with the sign bit of x to restore the input's sign.
+}
+// Arcsine via the 8-coefficient polynomial approximation
+//   asin(|x|) ~= pi/2 - sqrt(1 - |x|) * (a0 + a1*|x| + ... + a7*|x|^7)
+// The polynomial is evaluated on |x| and the sign bit of x is XORed back
+// into the result, so the function is odd in x.
+Float4 Asin_8_terms(RValue<Float4> x)
+{
+	// From 4.4.46, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
+	// |e(x)| <= 2e-8
+	const Float4 half_pi(1.5707963268f);
+	const Float4 a0(1.5707963050f);
+	const Float4 a1(-0.2145988016f);
+	const Float4 a2(0.0889789874f);
+	const Float4 a3(-0.0501743046f);
+	const Float4 a4(0.0308918810f);
+	const Float4 a5(-0.0170881256f);
+	const Float4 a6(0.0066700901f);  // Tabulated value is .00667 00901; a dropped digit here costs roughly 1e-6 to 1e-5 of accuracy.
+	const Float4 a7(-0.0012624911f);
+	Float4 absx = Abs(x);
+	return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * (a3 + absx * (a4 + absx * (a5 + absx * (a6 + absx * a7)))))))) ^
+	                  (As<Int4>(x) & Int4(0x80000000)));
+}
+// Arccosine derived from the 4-term arcsine approximation.
+Float4 Acos_4_terms(RValue<Float4> x)
+{
+	// acos(x) = pi/2 - asin(x)
+	const Float4 half_pi(1.57079632e+0f);
+	Float4 asin_x = Asin_4_terms(x);
+	return half_pi - asin_x;
+}
+// Arccosine derived from the 8-term arcsine approximation.
+Float4 Acos_8_terms(RValue<Float4> x)
+{
+	// acos(x) = pi/2 - asin(x)
+	const Float4 half_pi(1.57079632e+0f);
+	Float4 asin_x = Asin_8_terms(x);
+	return half_pi - asin_x;
+}
+Float4 Atan(RValue<Float4> x)  // Arctangent built on Atan_01: uses the reciprocal for |x| >= 1 and restores the sign of x at the end.
+{
+	Float4 absx = Abs(x);
+	Int4 O = CmpNLT(absx, Float4(1.0f));  // Per-lane mask selecting the |x| >= 1 path (assumes CmpNLT means "not less than" - confirm).
+	Float4 y = As<Float4>((O & As<Int4>(Float4(1.0f) / absx)) | (~O & As<Int4>(absx)));  // FIXME: Vector select. y = O ? 1/|x| : |x|, the argument handed to Atan_01.
+	const Float4 half_pi(1.57079632f);
+	Float4 theta = Atan_01(y);
+	return As<Float4>(((O & As<Int4>(half_pi - theta)) | (~O & As<Int4>(theta))) ^  // FIXME: Vector select. O ? pi/2 - theta : theta
+	                  (As<Int4>(x) & Int4(0x80000000)));  // XOR with the sign bit of x to restore the input's sign.
+}
+Float4 Atan2(RValue<Float4> y, RValue<Float4> x)  // Two-argument arctangent: folds (x, y) into the first octant, applies Atan_01, and accumulates the rotation offsets in theta.
+{
+	const Float4 pi(3.14159265f);             // pi (not referenced below)
+	const Float4 minus_pi(-3.14159265f);      // -pi
+	const Float4 half_pi(1.57079632f);        // pi/2
+	const Float4 quarter_pi(7.85398163e-1f);  // pi/4
+	// Rotate to upper semicircle when in lower semicircle
+	Int4 S = CmpLT(y, Float4(0.0f));
+	Float4 theta = As<Float4>(S & As<Int4>(minus_pi));  // theta starts at -pi where y < 0, otherwise at 0 (all-zero bits).
+	Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));  // x with its sign flipped wherever y's sign bit is set.
+	Float4 y0 = Abs(y);
+	// Rotate to right quadrant when in left quadrant
+	Int4 Q = CmpLT(x0, Float4(0.0f));
+	theta += As<Float4>(Q & As<Int4>(half_pi));  // Add pi/2 where x0 < 0.
+	Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0)));   // FIXME: Vector select
+	Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0)));  // FIXME: Vector select
+	// Mirror to first octant when in second octant
+	Int4 O = CmpNLT(y1, x1);
+	Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1)));  // FIXME: Vector select
+	Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1)));  // FIXME: Vector select
+	// Approximation of atan in [0..1]
+	Int4 zero_x = CmpEQ(x2, Float4(0.0f));
+	Int4 inf_y = IsInf(y2);  // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
+	Float4 atan2_theta = Atan_01(y2 / x2);
+	theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) |  // FIXME: Vector select. Contributes nothing where x2 == 0.
+	                    (inf_y & As<Int4>(quarter_pi)));
+	// Recover loss of precision for tiny theta angles
+	// This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
+	Int4 precision_loss = S & Q & O & ~inf_y;
+	return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta)));  // FIXME: Vector select
+}
+Float4 Exp2(RValue<Float4> x)  // Approximates 2^x per lane as an exponent-field power of two times a polynomial in the fractional part.
+{
+	// This implementation is based on 2^(i + f) = 2^i * 2^f,
+	// where i is the integer part of x and f is the fraction.
+	// For 2^i we can put the integer part directly in the exponent of
+	// the IEEE-754 floating-point number. Clamp to prevent overflow
+	// past the representation of infinity.
+	Float4 x0 = x;
+	x0 = Min(x0, As<Float4>(Int4(0x43010000)));  // 129.00000e+0f
+	x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF)));  // -126.99999e+0f
+	Int4 i = RoundInt(x0 - Float4(0.5f));  // Integer part; subtracting 0.5 before rounding presumably yields floor(x0) - confirm RoundInt's rounding mode.
+	Float4 ii = As<Float4>((i + Int4(127)) << 23);  // Add single-precision bias, and shift into exponent.
+	// For the fractional part use a polynomial
+	// which approximates 2^f in the 0 to 1 range.
+	Float4 f = x0 - Float4(i);
+	Float4 ff = As<Float4>(Int4(0x3AF61905));    // 1.8775767e-3f
+	ff = ff * f + As<Float4>(Int4(0x3C134806));  // 8.9893397e-3f
+	ff = ff * f + As<Float4>(Int4(0x3D64AA23));  // 5.5826318e-2f
+	ff = ff * f + As<Float4>(Int4(0x3E75EAD4));  // 2.4015361e-1f
+	ff = ff * f + As<Float4>(Int4(0x3F31727B));  // 6.9315308e-1f
+	ff = ff * f + Float4(1.0f);
+	return ii * ff;  // 2^i * 2^f
+}
+Float4 Log2(RValue<Float4> x)  // Approximates log2(x) per lane from the exponent bits plus a rational function of the mantissa; +infinity inputs are returned unchanged.
+{
+	Float4 x0;
+	Float4 x1;
+	Float4 x2;
+	Float4 x3;
+	x0 = x;
+	x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));  // Keep only the exponent field.
+	x1 = As<Float4>(As<UInt4>(x1) >> 8);  // Shift the exponent bits down into the upper mantissa bits.
+	x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));  // OR in the bit pattern of 1.0f so the shifted exponent reads as a fraction in [1, 2).
+	x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f);  // FIXME: (x1 - 1.4960938f) * 256.0f;
+	x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));  // Mantissa bits of x combined with the exponent of 1.0f, i.e. a value in [1, 2).
+	x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);  // Numerator polynomial of the rational term.
+	x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);  // Denominator polynomial of the rational term.
+	x2 /= x3;
+	x1 += (x0 - Float4(1.0f)) * x2;  // Add the mantissa contribution to the exponent-derived term.
+	Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));  // Lanes whose bit pattern is exactly +infinity.
+	return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));  // Pass +infinity through; otherwise return the computed value.
+}
+// Natural exponential, expressed through the base-2 exponential: e^x = 2^(x / ln(2)).
+Float4 Exp(RValue<Float4> x)
+{
+	// TODO: Propagate the constant
+	const Float4 inv_ln2(1.44269504f);  // 1/ln(2)
+	return optimal::Exp2(inv_ln2 * x);
+}
+// Natural logarithm, expressed through the base-2 logarithm: ln(x) = ln(2) * log2(x).
+Float4 Log(RValue<Float4> x)
+{
+	// TODO: Propagate the constant
+	Float4 log2_x = optimal::Log2(x);
+	return Float4(6.93147181e-1f) * log2_x;  // ln(2)
+}
+// Power function computed as x^y = 2^(y * log2(x)).
+Float4 Pow(RValue<Float4> x, RValue<Float4> y)
+{
+	Float4 exponent = optimal::Log2(x);
+	exponent = exponent * y;
+	return optimal::Exp2(exponent);
+}
+// Hyperbolic sine: (e^x - e^-x) / 2.
+Float4 Sinh(RValue<Float4> x)
+{
+	Float4 e_pos = optimal::Exp(x);
+	Float4 e_neg = optimal::Exp(-x);
+	return (e_pos - e_neg) * Float4(0.5f);
+}
+// Hyperbolic cosine: (e^x + e^-x) / 2.
+Float4 Cosh(RValue<Float4> x)
+{
+	Float4 e_pos = optimal::Exp(x);
+	Float4 e_neg = optimal::Exp(-x);
+	return (e_pos + e_neg) * Float4(0.5f);
+}
+// Hyperbolic tangent: (e^x - e^-x) / (e^x + e^-x).
+Float4 Tanh(RValue<Float4> x)
+{
+	Float4 e_pos = optimal::Exp(x);
+	Float4 e_neg = optimal::Exp(-x);
+	Float4 numerator = e_pos - e_neg;
+	Float4 denominator = e_pos + e_neg;
+	return numerator / denominator;
+}
+// Inverse hyperbolic sine: ln(x + sqrt(x^2 + 1)).
+Float4 Asinh(RValue<Float4> x)
+{
+	Float4 root = Sqrt(x * x + Float4(1.0f));
+	return optimal::Log(x + root);
+}
+// Inverse hyperbolic cosine: ln(x + sqrt(x + 1) * sqrt(x - 1)).
+Float4 Acosh(RValue<Float4> x)
+{
+	Float4 root_plus = Sqrt(x + Float4(1.0f));
+	Float4 root_minus = Sqrt(x - Float4(1.0f));
+	return optimal::Log(x + root_plus * root_minus);
+}
+// Inverse hyperbolic tangent: 0.5 * ln((1 + x) / (1 - x)).
+Float4 Atanh(RValue<Float4> x)
+{
+	Float4 ratio = (Float4(1.0f) + x) / (Float4(1.0f) - x);
+	return optimal::Log(ratio) * Float4(0.5f);
+}
+}  // namespace optimal
+}  // namespace rr
\ No newline at end of file
--- a/src/Reactor/OptimalIntrinsics.hpp
+++ b/src/Reactor/OptimalIntrinsics.hpp
+// Copyright 2020 The SwiftShader Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef rr_OptimalIntrinsics_hpp
+#define rr_OptimalIntrinsics_hpp
+#include "Reactor.hpp"
+// Implementation of intrinsic functions that purport to be as optimal as
+// possible, in contrast to the rr::emulated versions, typically by
+// implementing approximations of the same math functions.
+namespace rr {
+namespace optimal {
+// Trigonometric functions.
+Float4 Sin(RValue<Float4> x);
+Float4 Cos(RValue<Float4> x);
+Float4 Tan(RValue<Float4> x);
+// Inverse sine/cosine, available as 4-term and 8-term polynomial
+// approximations; the 8-term variants use more coefficients for higher
+// accuracy. The Acos variants are computed as pi/2 - Asin.
+Float4 Asin_4_terms(RValue<Float4> x);
+Float4 Asin_8_terms(RValue<Float4> x);
+Float4 Acos_4_terms(RValue<Float4> x);
+Float4 Acos_8_terms(RValue<Float4> x);
+Float4 Atan(RValue<Float4> x);
+// Note the argument order: y first, then x.
+Float4 Atan2(RValue<Float4> y, RValue<Float4> x);
+// Exponentials and logarithms. Exp, Log and Pow are expressed in terms of
+// Exp2 and Log2.
+Float4 Exp2(RValue<Float4> x);
+Float4 Log2(RValue<Float4> x);
+Float4 Exp(RValue<Float4> x);
+Float4 Log(RValue<Float4> x);
+Float4 Pow(RValue<Float4> x, RValue<Float4> y);
+// Hyperbolic functions and their inverses, expressed in terms of Exp and Log.
+Float4 Sinh(RValue<Float4> x);
+Float4 Cosh(RValue<Float4> x);
+Float4 Tanh(RValue<Float4> x);
+Float4 Asinh(RValue<Float4> x);
+Float4 Acosh(RValue<Float4> x);
+Float4 Atanh(RValue<Float4> x);
+}  // namespace optimal
+}  // namespace rr
+#endif  // rr_OptimalIntrinsics_hpp
--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2345,13 +2345,20 @@ RValue<Float4> Frac(RValue<Float4> x);
 RValue<Float4> Floor(RValue<Float4> x);
 RValue<Float4> Ceil(RValue<Float4> x);
+enum class Precision  // Accuracy level requested by callers of intrinsics that accept it (Asin and Acos in this header).
+{
+	Full,  // Full-precision result requested.
+	Relaxed,  // Reduced precision is acceptable; the SPIR-V GLSL.std.450 path passes this when the operand carries the RelaxedPrecision decoration.
+	//Half,
+};
 // Trigonometric functions
 // TODO: Currently unimplemented for Subzero.
 RValue<Float4> Sin(RValue<Float4> x);
 RValue<Float4> Cos(RValue<Float4> x);
 RValue<Float4> Tan(RValue<Float4> x);
-RValue<Float4> Asin(RValue<Float4> x);
+RValue<Float4> Asin(RValue<Float4> x, Precision p);
-RValue<Float4> Acos(RValue<Float4> x);
+RValue<Float4> Acos(RValue<Float4> x, Precision p);
 RValue<Float4> Atan(RValue<Float4> x);
 RValue<Float4> Sinh(RValue<Float4> x);
 RValue<Float4> Cosh(RValue<Float4> x);

--- a/src/Reactor/ReactorUnitTests.cpp
+++ b/src/Reactor/ReactorUnitTests.cpp
@@ -2050,6 +2050,11 @@ using IntrinsicTestParams_Float = IntrinsicTestParams<RValue<Float>(RValue<Float
 using IntrinsicTestParams_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>), float(float), float>;
 using IntrinsicTestParams_Float4_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>, RValue<Float4>), float(float, float), std::pair<float, float>>;
+// TODO(b/147818976): Each function has its own precision requirements for Vulkan, sometimes broken down
+// by input range. These are currently validated by deqp, but we can improve our own tests as well.
+// See https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#spirvenv-precision-operation
+constexpr double INTRINSIC_PRECISION = 1e-4;
 struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_Float>
 {
 	void test()
@@ -2064,7 +2069,7 @@ struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_F
 		for(auto &&v : GetParam().testValues)
 		{
 			SCOPED_TRACE(v);
-			EXPECT_FLOAT_EQ(routine(v), GetParam().refFunc(v));
+			EXPECT_NEAR(routine(v), GetParam().refFunc(v), INTRINSIC_PRECISION);
 		}
 	}
 };
@@ -2145,10 +2150,10 @@ struct IntrinsicTest_Float4 : public testing::TestWithParam<IntrinsicTestParams_
 			SCOPED_TRACE(v);
 			float4_value result = invokeRoutine(routine, float4_value{ v });
 			float4_value expected = float4_value{ GetParam().refFunc(v) };
-			EXPECT_FLOAT_EQ(result.v[0], expected.v[0]);
+			EXPECT_NEAR(result.v[0], expected.v[0], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[1], expected.v[1]);
+			EXPECT_NEAR(result.v[1], expected.v[1], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[2], expected.v[2]);
+			EXPECT_NEAR(result.v[2], expected.v[2], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[3], expected.v[3]);
+			EXPECT_NEAR(result.v[3], expected.v[3], INTRINSIC_PRECISION);
 		}
 	}
 };
@@ -2172,19 +2177,19 @@ struct IntrinsicTest_Float4_Float4 : public testing::TestWithParam<IntrinsicTest
 			SCOPED_TRACE(v);
 			float4_value result = invokeRoutine(routine, float4_value{ v.first }, float4_value{ v.second });
 			float4_value expected = float4_value{ GetParam().refFunc(v.first, v.second) };
-			EXPECT_FLOAT_EQ(result.v[0], expected.v[0]);
+			EXPECT_NEAR(result.v[0], expected.v[0], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[1], expected.v[1]);
+			EXPECT_NEAR(result.v[1], expected.v[1], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[2], expected.v[2]);
+			EXPECT_NEAR(result.v[2], expected.v[2], INTRINSIC_PRECISION);
-			EXPECT_FLOAT_EQ(result.v[3], expected.v[3]);
+			EXPECT_NEAR(result.v[3], expected.v[3], INTRINSIC_PRECISION);
 		}
 	}
 };
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float, IntrinsicTest_Float, testing::Values(
-	IntrinsicTestParams_Float{ [](Float v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, 12345.f} },
+	IntrinsicTestParams_Float{ [](Float v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, 123.f} },
-	IntrinsicTestParams_Float{ [](Float v) { return rr::Log2(v); }, log2f, {0.f, 1.f, 12345.f} },
+	IntrinsicTestParams_Float{ [](Float v) { return rr::Log2(v); }, log2f, {1.f, 123.f} },
-	IntrinsicTestParams_Float{ [](Float v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, 12345.f} }
+	IntrinsicTestParams_Float{ [](Float v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, 123.f} }
 ));
 // clang-format on
@@ -2201,30 +2206,30 @@ float vulkan_coshf(float a)
 // clang-format off
 constexpr float PI = 3.141592653589793f;
 INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4, IntrinsicTest_Float4, testing::Values(
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sin(v); },   sinf,   {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sin(v); },                    sinf,          {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cos(v); },   cosf,   {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cos(v); },                    cosf,          {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tan(v); },   tanf,   {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tan(v); },                    tanf,          {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asin(v); },  asinf,  {0.f, 1.f, -1.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asin(v, Precision::Full); },  asinf,         {0.f, 1.f, -1.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acos(v); },  acosf,  {0.f, 1.f, -1.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acos(v, Precision::Full); },  acosf,         {0.f, 1.f, -1.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atan(v); },  atanf,  {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atan(v); },                   atanf,         {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sinh(v); },  vulkan_sinhf,  {0.f, 1.f, PI, 12345.f, 0x1.65a84ep6}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sinh(v); },                   vulkan_sinhf,  {0.f, 1.f, PI}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cosh(v); },  vulkan_coshf,  {0.f, 1.f, PI, 12345.f, 0x1.65a84ep6} },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cosh(v); },                   vulkan_coshf,  {0.f, 1.f, PI} },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tanh(v); },  tanhf,  {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tanh(v); },                   tanhf,         {0.f, 1.f, PI}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asinh(v); }, asinhf, {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asinh(v); },                  asinhf,        {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acosh(v); }, acoshf, {     1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acosh(v); },                  acoshf,        {     1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atanh(v); }, atanhf, {0.f, 1.f, -1.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atanh(v); },                  atanhf,        {0.f, 0.9999f, -0.9999f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp(v); },   expf,   {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp(v); },                    expf,          {0.f, 1.f, PI}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log(v); },   logf,   {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log(v); },                    logf,          {1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp2(v); },  exp2f,  {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp2(v); },                   exp2f,         {0.f, 1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log2(v); },  log2f,  {0.f, 1.f, PI, 12345.f}  },
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log2(v); },                   log2f,         {1.f, PI, 123.f}  },
-	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sqrt(v); },  sqrtf,  {0.f, 1.f, PI, 12345.f}  }
+	IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sqrt(v); },                   sqrtf,         {0.f, 1.f, PI, 123.f}  }
 ));
 // clang-format on
 // clang-format off
 INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4_Float4, IntrinsicTest_Float4_Float4, testing::Values(
-	IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Atan2(v1, v2); }, atan2f, { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {12345.f, 12345.f} } },
+	IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Atan2(v1, v2); }, atan2f, { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {123.f, 123.f} } },
-	IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Pow(v1, v2); },   powf,   { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {12345.f, 12345.f} } }
+	IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Pow(v1, v2); },   powf,   { {1.f, 0.f}, {1.f, -1.f}, {-1.f, 0.f} } }
 ));
 // clang-format on

--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -13,7 +13,8 @@
 // limitations under the License.
 #include "Debug.hpp"
-#include "EmulatedReactor.hpp"
+#include "EmulatedIntrinsics.hpp"
+#include "OptimalIntrinsics.hpp"
 #include "Print.hpp"
 #include "Reactor.hpp"
 #include "ReactorDebugInfo.hpp"
@@ -4258,109 +4259,115 @@ RValue<Float> Log2(RValue<Float> x)
 RValue<Float4> Sin(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Sin(x);
+	return optimal::Sin(x);
 }
 RValue<Float4> Cos(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Cos(x);
+	return optimal::Cos(x);
 }
 RValue<Float4> Tan(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Tan(x);
+	return optimal::Tan(x);
 }
-RValue<Float4> Asin(RValue<Float4> x)
+RValue<Float4> Asin(RValue<Float4> x, Precision p)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Asin(x);
+	if(p == Precision::Full)
+	{
+		return emulated::Asin(x);
+	}
+	return optimal::Asin_8_terms(x);
 }
-RValue<Float4> Acos(RValue<Float4> x)
+RValue<Float4> Acos(RValue<Float4> x, Precision p)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Acos(x);
+	// Surprisingly, deqp-vk's precision.acos.highp/mediump tests pass when using the 4-term polynomial approximation
+	// version of acos, unlike for Asin, which requires higher precision algorithms.
+	return optimal::Acos_4_terms(x);
 }
 RValue<Float4> Atan(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atan(x);
+	return optimal::Atan(x);
 }
 RValue<Float4> Sinh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Sinh(x);
+	return optimal::Sinh(x);
 }
 RValue<Float4> Cosh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Cosh(x);
+	return optimal::Cosh(x);
 }
 RValue<Float4> Tanh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Tanh(x);
+	return optimal::Tanh(x);
 }
 RValue<Float4> Asinh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Asinh(x);
+	return optimal::Asinh(x);
 }
 RValue<Float4> Acosh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Acosh(x);
+	return optimal::Acosh(x);
 }
 RValue<Float4> Atanh(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atanh(x);
+	return optimal::Atanh(x);
 }
 RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Atan2(x, y);
+	return optimal::Atan2(x, y);
 }
 RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Pow(x, y);
+	return optimal::Pow(x, y);
 }
 RValue<Float4> Exp(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Exp(x);
+	return optimal::Exp(x);
 }
 RValue<Float4> Log(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Log(x);
+	return optimal::Log(x);
 }
 RValue<Float4> Exp2(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Exp2(x);
+	return optimal::Exp2(x);
 }
 RValue<Float4> Log2(RValue<Float4> x)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();
-	return emulated::Log2(x);
+	return optimal::Log2(x);
 }
 RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)