Commit 9c14bda0 by Antonio Maiorano

Optimize transcendentals for Subzero

With this change, we can now select implementations of most transcendentals from either the "emulated" or "optimal" namespaces. The emulated versions generally call the math.h standard function on each component for vector types, while the optimal versions typically implement some approximation in Reactor to produce vectorized code. Most of the optimal versions were taken directly from ShaderCore.cpp, except for ASin, for which I implemented an 8-term approximation.

The new versions are faster, and pass all deqp precision tests. Here's a table of benchmarks that show the performance improvements that were made. Note that Asin and Acos now take a Precision parameter for Full and Relaxed precision:

                    Before     After
rr_Sin              48.6 ns    10.6 ns
rr_Cos              67.1 ns    9.62 ns
rr_Tan              75.5 ns    19.4 ns
rr_Asin_fullp       24.2 ns    23.0 ns
rr_Asin_relaxedp    N/A        9.31 ns
rr_Acos_fullp       14.3 ns    6.35 ns
rr_Acos_relaxedp    N/A        4.56 ns
rr_Atan             66.8 ns    12.9 ns
rr_Sinh             79.7 ns    11.5 ns
rr_Cosh             80.1 ns    11.5 ns
rr_Tanh             62.9 ns    12.1 ns
rr_Asinh            104 ns     9.44 ns
rr_Acosh            14.4 ns    10.2 ns
rr_Atanh            170 ns     9.81 ns
rr_Atan2            73.5 ns    22.8 ns
rr_Pow              87.9 ns    16.3 ns
rr_Exp              40.2 ns    5.72 ns
rr_Log              44.0 ns    7.35 ns
rr_Exp2             101 ns     5.38 ns
rr_Log2             106 ns     9.24 ns

Bug: b/147818976
Change-Id: I791893bd9f005dbbae4770fb474de338a04845be
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48588
Kokoro-Result: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Antonio Maiorano <amaiorano@google.com>
parent b042f4e7
...@@ -192,10 +192,11 @@ cc_defaults { ...@@ -192,10 +192,11 @@ cc_defaults {
srcs: [ srcs: [
"Reactor/CPUID.cpp", "Reactor/CPUID.cpp",
"Reactor/Debug.cpp", "Reactor/Debug.cpp",
"Reactor/EmulatedReactor.cpp", "Reactor/EmulatedIntrinsics.cpp",
"Reactor/ExecutableMemory.cpp", "Reactor/ExecutableMemory.cpp",
"Reactor/LLVMJIT.cpp", "Reactor/LLVMJIT.cpp",
"Reactor/LLVMReactor.cpp", "Reactor/LLVMReactor.cpp",
"Reactor/OptimalIntrinsics.cpp",
"Reactor/Reactor.cpp", "Reactor/Reactor.cpp",
], ],
...@@ -223,10 +224,11 @@ cc_defaults { ...@@ -223,10 +224,11 @@ cc_defaults {
srcs: [ srcs: [
"Reactor/CPUID.cpp", "Reactor/CPUID.cpp",
"Reactor/Debug.cpp", "Reactor/Debug.cpp",
"Reactor/EmulatedReactor.cpp", "Reactor/EmulatedIntrinsics.cpp",
"Reactor/ExecutableMemory.cpp", "Reactor/ExecutableMemory.cpp",
"Reactor/LLVMJIT.cpp", "Reactor/LLVMJIT.cpp",
"Reactor/LLVMReactor.cpp", "Reactor/LLVMReactor.cpp",
"Reactor/OptimalIntrinsics.cpp",
"Reactor/Reactor.cpp", "Reactor/Reactor.cpp",
], ],
......
...@@ -598,18 +598,22 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt ...@@ -598,18 +598,22 @@ SpirvShader::EmitResult SpirvShader::EmitExtGLSLstd450(InsnIterator insn, EmitSt
case GLSLstd450Asin: case GLSLstd450Asin:
{ {
auto val = Operand(this, state, insn.word(5)); auto val = Operand(this, state, insn.word(5));
Decorations d;
ApplyDecorationsForId(&d, insn.word(5));
for(auto i = 0u; i < type.componentCount; i++) for(auto i = 0u; i < type.componentCount; i++)
{ {
dst.move(i, Asin(val.Float(i))); dst.move(i, Asin(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
} }
break; break;
} }
case GLSLstd450Acos: case GLSLstd450Acos:
{ {
auto val = Operand(this, state, insn.word(5)); auto val = Operand(this, state, insn.word(5));
Decorations d;
ApplyDecorationsForId(&d, insn.word(5));
for(auto i = 0u; i < type.componentCount; i++) for(auto i = 0u; i < type.componentCount; i++)
{ {
dst.move(i, Acos(val.Float(i))); dst.move(i, Acos(val.Float(i), d.RelaxedPrecision ? Precision::Relaxed : Precision::Full));
} }
break; break;
} }
......
...@@ -47,8 +47,9 @@ config("swiftshader_reactor_private_config") { ...@@ -47,8 +47,9 @@ config("swiftshader_reactor_private_config") {
swiftshader_source_set("swiftshader_reactor_base") { swiftshader_source_set("swiftshader_reactor_base") {
sources = [ sources = [
"Debug.cpp", "Debug.cpp",
"EmulatedReactor.cpp", "EmulatedIntrinsics.cpp",
"ExecutableMemory.cpp", "ExecutableMemory.cpp",
"OptimalIntrinsics.cpp",
"Reactor.cpp", "Reactor.cpp",
] ]
} }
......
...@@ -20,10 +20,13 @@ set(ROOT_PROJECT_COMPILE_OPTIONS ...@@ -20,10 +20,13 @@ set(ROOT_PROJECT_COMPILE_OPTIONS
set(REACTOR_SRC_FILES set(REACTOR_SRC_FILES
Debug.cpp Debug.cpp
Debug.hpp Debug.hpp
EmulatedReactor.cpp EmulatedIntrinsics.cpp
EmulatedIntrinsics.hpp
ExecutableMemory.cpp ExecutableMemory.cpp
ExecutableMemory.hpp ExecutableMemory.hpp
Nucleus.hpp Nucleus.hpp
OptimalIntrinsics.cpp
OptimalIntrinsics.hpp
Print.hpp Print.hpp
Reactor.cpp Reactor.cpp
Reactor.hpp Reactor.hpp
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#include "EmulatedReactor.hpp" #include "EmulatedIntrinsics.hpp"
#include <algorithm> #include <algorithm>
#include <cmath> #include <cmath>
......
...@@ -14,7 +14,7 @@ ...@@ -14,7 +14,7 @@
#include "Reactor.hpp" #include "Reactor.hpp"
// Implementation of Reactor functions that are "emulated" - that is, // Implementation of intrinsics that are "emulated" - that is,
// implemented either in terms of Reactor code, or make use of // implemented either in terms of Reactor code, or make use of
// rr::Call to C functions. These are typically slower than implementing // rr::Call to C functions. These are typically slower than implementing
// in terms of direct calls to the JIT backend; however, provide a good // in terms of direct calls to the JIT backend; however, provide a good
......
...@@ -16,7 +16,7 @@ ...@@ -16,7 +16,7 @@
#include "CPUID.hpp" #include "CPUID.hpp"
#include "Debug.hpp" #include "Debug.hpp"
#include "EmulatedReactor.hpp" #include "EmulatedIntrinsics.hpp"
#include "LLVMReactorDebugInfo.hpp" #include "LLVMReactorDebugInfo.hpp"
#include "Print.hpp" #include "Print.hpp"
#include "Reactor.hpp" #include "Reactor.hpp"
...@@ -3220,13 +3220,13 @@ static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *na ...@@ -3220,13 +3220,13 @@ static RValue<Float4> TransformFloat4PerElement(RValue<Float4> v, const char *na
return RValue<Float4>(V(out)); return RValue<Float4>(V(out));
} }
RValue<Float4> Asin(RValue<Float4> v) RValue<Float4> Asin(RValue<Float4> v, Precision p)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return TransformFloat4PerElement(v, "asinf"); return TransformFloat4PerElement(v, "asinf");
} }
RValue<Float4> Acos(RValue<Float4> v) RValue<Float4> Acos(RValue<Float4> v, Precision p)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return TransformFloat4PerElement(v, "acosf"); return TransformFloat4PerElement(v, "acosf");
......
// Copyright 2020 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "OptimalIntrinsics.hpp"
namespace rr {
namespace {
// Computes an approximate per-lane reciprocal of x.
//   pp          - when true, the raw Rcp_pp() estimate is returned as-is; when false,
//                 the estimate is refined with one Newton-Raphson step.
//   finite      - when true, the result is capped at the float whose bit pattern is
//                 0x7F7FFFFF (the largest finite single-precision value).
//   exactAtPow2 - forwarded unchanged to Rcp_pp().
Float4 Reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false)
{
	Float4 rcp = Rcp_pp(x, exactAtPow2);

	if(!pp)
	{
		// One Newton-Raphson iteration: r' = r * (2 - x * r) = 2r - x * r^2
		rcp = (rcp + rcp) - (x * rcp * rcp);
	}

	if(finite)
	{
		// Build the constant from its bit pattern with As<>, the same way the other
		// bit-pattern constants in this file are built, rather than reading an int
		// through a float reference (which violates the strict aliasing rule).
		rcp = Min(rcp, As<Float4>(Int4(0x7F7FFFFF)));
	}

	return rcp;
}
// Shared kernel behind optimal::Sin, Cos and Tan.
// Returns sin(x) per lane when `sin` is true and cos(x) otherwise; x is in radians.
Float4 SinOrCos(RValue<Float4> x, bool sin)
{
	// Express the angle in turns and drop whole turns, leaving the [-0.5, 0.5] range
	Float4 turns = x * Float4(1.59154943e-1f);  // 1/2pi
	turns = turns - Round(turns);

	// From the paper: "A Fast, Vectorizable Algorithm for Producing Single-Precision Sine-Cosine Pairs"
	// This implementation passes OpenGL ES 3.0 precision requirements, at the cost of more operations.
	Float4 t2 = turns * turns;

	// Cosine polynomial in t2 (Horner form, innermost term first)
	Float4 cosPoly = t2 * Float4(-0.0204391631f) + Float4(0.2536086171f);
	cosPoly = t2 * cosPoly + Float4(-1.2336977925f);
	Float4 cosQuarter = t2 * cosPoly + Float4(1.0f);

	// Sine polynomial: odd in `turns`; the leading coefficient is ~pi/2, so both
	// polynomials work on a quarter of the full angle.
	Float4 sinPoly = t2 * Float4(-0.0046075748f) + Float4(0.0796819754f);
	sinPoly = t2 * sinPoly + Float4(-0.645963615f);
	sinPoly = t2 * sinPoly + Float4(1.5707963235f);
	Float4 sinQuarter = turns * sinPoly;

	// First angle doubling
	Float4 cosHalf = (cosQuarter * cosQuarter) - (sinQuarter * sinQuarter);
	Float4 sinHalf = Float4(2.0f) * sinQuarter * cosQuarter;

	// Normalization factor 1 / (sin^2 + cos^2), full precision and clamped to a finite value
	Float4 norm = Reciprocal(sinHalf * sinHalf + cosHalf * cosHalf, false, true, false);

	// Second angle doubling, selecting the requested function
	if(!sin)
	{
		return ((cosHalf * cosHalf) - (sinHalf * sinHalf)) * norm;
	}

	return Float4(2.0f) * sinHalf * cosHalf * norm;
}
// Polynomial approximation of atan(x), intended for x in [0..1]
Float4 Atan_01(Float4 x)
{
	// Coefficients from 4.4.49, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
	const Float4 a2(-0.3333314528f);
	const Float4 a4(0.1999355085f);
	const Float4 a6(-0.1420889944f);
	const Float4 a8(0.1065626393f);
	const Float4 a10(-0.0752896400f);
	const Float4 a12(0.0429096138f);
	const Float4 a14(-0.0161657367f);
	const Float4 a16(0.0028662257f);

	Float4 xSquared = x * x;

	// Horner evaluation of the even-power series, starting from the highest-order term
	Float4 series = a14 + xSquared * a16;
	series = a12 + xSquared * series;
	series = a10 + xSquared * series;
	series = a8 + xSquared * series;
	series = a6 + xSquared * series;
	series = a4 + xSquared * series;
	series = a2 + xSquared * series;

	// atan(x) ~= x * (1 + x^2 * series), written as x + x * (x^2 * series)
	return x + x * (xSquared * series);
}
} // namespace
namespace optimal {
// Vectorized sine approximation of x (radians); delegates to the shared SinOrCos kernel.
Float4 Sin(RValue<Float4> x)
{
	const bool wantSine = true;
	return SinOrCos(x, wantSine);
}
// Vectorized cosine approximation of x (radians); delegates to the shared SinOrCos kernel.
Float4 Cos(RValue<Float4> x)
{
	const bool wantSine = false;
	return SinOrCos(x, wantSine);
}
// Vectorized tangent approximation of x (radians), computed as sine divided by cosine.
// The SinOrCos kernel is evaluated once for each of the two.
Float4 Tan(RValue<Float4> x)
{
	Float4 sine = SinOrCos(x, true);
	Float4 cosine = SinOrCos(x, false);
	return sine / cosine;
}
// Approximates asin(x) per lane using a 4-term polynomial (the cheaper, lower-precision variant).
// The approximation is evaluated on |x|; the sign bit of x is then XORed back into the result.
Float4 Asin_4_terms(RValue<Float4> x)
{
// From 4.4.45, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
// |e(x)| <= 5e-5 as quoted by the handbook (not 5e-8). Note that with these constants
// the value produced at x = 0 is half_pi - a0, which is already about 6.8e-5.
const Float4 half_pi(1.57079632f);
const Float4 a0(1.5707288f);
const Float4 a1(-0.2121144f);
const Float4 a2(0.0742610f);
const Float4 a3(-0.0187293f);
Float4 absx = Abs(x);
// asin(|x|) ~= pi/2 - sqrt(1 - |x|) * (a0 + a1*|x| + a2*|x|^2 + a3*|x|^3), polynomial in Horner form
return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * a3)))) ^
(As<Int4>(x) & Int4(0x80000000)));
}
// Approximates asin(x) per lane using an 8-term polynomial (the higher-precision variant).
// The approximation is evaluated on |x|; the sign bit of x is then XORed back into the result.
Float4 Asin_8_terms(RValue<Float4> x)
{
	// From 4.4.46, page 81 of the Handbook of Mathematical Functions, by Milton Abramowitz and Irene Stegun
	// |e(x)| <= 2e-8
	const Float4 half_pi(1.5707963268f);
	const Float4 a0(1.5707963050f);
	const Float4 a1(-0.2145988016f);
	const Float4 a2(0.0889789874f);
	const Float4 a3(-0.0501743046f);
	const Float4 a4(0.0308918810f);
	const Float4 a5(-0.0170881256f);
	const Float4 a6(0.0066700901f);  // Handbook value; a dropped digit here (0.006700901) degrades the error to ~1e-7 and beyond
	const Float4 a7(-0.0012624911f);

	Float4 absx = Abs(x);

	// asin(|x|) ~= pi/2 - sqrt(1 - |x|) * (a0 + a1*|x| + ... + a7*|x|^7), polynomial in Horner form
	return As<Float4>(As<Int4>(half_pi - Sqrt(Float4(1.0f) - absx) * (a0 + absx * (a1 + absx * (a2 + absx * (a3 + absx * (a4 + absx * (a5 + absx * (a6 + absx * a7)))))))) ^
	                  (As<Int4>(x) & Int4(0x80000000)));
}
// Approximates acos(x) per lane from the 4-term arcsine approximation.
Float4 Acos_4_terms(RValue<Float4> x)
{
	// acos(x) = pi/2 - arcsin(x)
	const Float4 half_pi(1.57079632e+0f);
	return half_pi - Asin_4_terms(x);
}
// Approximates acos(x) per lane from the 8-term arcsine approximation.
Float4 Acos_8_terms(RValue<Float4> x)
{
	// acos(x) = pi/2 - arcsin(x)
	const Float4 half_pi(1.57079632e+0f);
	return half_pi - Asin_8_terms(x);
}
// Vectorized arctangent approximation.
// Works on |x|: lanes where |x| is not less than 1 are evaluated as pi/2 - atan(1/|x|) so that
// Atan_01 only sees arguments it is designed for; the sign bit of x is XORed back in at the end.
Float4 Atan(RValue<Float4> x)
{
	Float4 magnitude = Abs(x);

	// Per-lane mask: set where |x| is not less than 1
	Int4 useReciprocal = CmpNLT(magnitude, Float4(1.0f));
	Float4 reduced = As<Float4>((useReciprocal & As<Int4>(Float4(1.0f) / magnitude)) | (~useReciprocal & As<Int4>(magnitude)));  // FIXME: Vector select

	const Float4 half_pi(1.57079632f);
	Float4 angle = Atan_01(reduced);

	// Undo the reciprocal reduction where it was applied
	Int4 unsignedBits = (useReciprocal & As<Int4>(half_pi - angle)) | (~useReciprocal & As<Int4>(angle));  // FIXME: Vector select

	// Transfer the sign of the input onto the result
	Int4 signBit = As<Int4>(x) & Int4(0x80000000);
	return As<Float4>(unsignedBits ^ signBit);
}
// Vectorized two-argument arctangent approximation of (y, x).
// The input point is folded step by step (lower -> upper semicircle, left -> right quadrant,
// second -> first octant) while `theta` accumulates the angle offset of each fold, so that
// Atan_01 is only evaluated on a ratio in [0..1]. All selections are done with per-lane
// integer masks rather than branches.
Float4 Atan2(RValue<Float4> y, RValue<Float4> x)
{
// NOTE(review): `pi` is declared but not referenced anywhere in this function.
const Float4 pi(3.14159265f); // pi
const Float4 minus_pi(-3.14159265f); // -pi
const Float4 half_pi(1.57079632f); // pi/2
const Float4 quarter_pi(7.85398163e-1f); // pi/4
// Rotate to upper semicircle when in lower semicircle
// S: mask of lanes with y < 0; those lanes start from theta = -pi, others from 0
Int4 S = CmpLT(y, Float4(0.0f));
Float4 theta = As<Float4>(S & As<Int4>(minus_pi));
// x0 is x with its sign flipped wherever y has its sign bit set; y0 is |y|
Float4 x0 = As<Float4>((As<Int4>(y) & Int4(0x80000000)) ^ As<Int4>(x));
Float4 y0 = Abs(y);
// Rotate to right quadrant when in left quadrant
// Q: mask of lanes with x0 < 0; those lanes add pi/2 and use (x1, y1) = (y0, -x0)
Int4 Q = CmpLT(x0, Float4(0.0f));
theta += As<Float4>(Q & As<Int4>(half_pi));
Float4 x1 = As<Float4>((Q & As<Int4>(y0)) | (~Q & As<Int4>(x0))); // FIXME: Vector select
Float4 y1 = As<Float4>((Q & As<Int4>(-x0)) | (~Q & As<Int4>(y0))); // FIXME: Vector select
// Mirror to first octant when in second octant
// O: mask of lanes where y1 is not less than x1; those lanes swap x1 and y1
Int4 O = CmpNLT(y1, x1);
Float4 x2 = As<Float4>((O & As<Int4>(y1)) | (~O & As<Int4>(x1))); // FIXME: Vector select
Float4 y2 = As<Float4>((O & As<Int4>(x1)) | (~O & As<Int4>(y1))); // FIXME: Vector select
// Approximation of atan in [0..1]
Int4 zero_x = CmpEQ(x2, Float4(0.0f));
Int4 inf_y = IsInf(y2); // Since x2 >= y2, this means x2 == y2 == inf, so we use 45 degrees or pi/4
Float4 atan2_theta = Atan_01(y2 / x2);
// Add the octant angle: nothing when x2 == 0, pi/4 when y2 is infinite, otherwise
// atan2_theta (or pi/2 - atan2_theta for lanes that were mirrored by O)
theta += As<Float4>((~zero_x & ~inf_y & ((O & As<Int4>(half_pi - atan2_theta)) | (~O & (As<Int4>(atan2_theta))))) | // FIXME: Vector select
(inf_y & As<Int4>(quarter_pi)));
// Recover loss of precision for tiny theta angles
// This combination results in (-pi + half_pi + half_pi - atan2_theta) which is equivalent to -atan2_theta
Int4 precision_loss = S & Q & O & ~inf_y;
return As<Float4>((precision_loss & As<Int4>(-atan2_theta)) | (~precision_loss & As<Int4>(theta))); // FIXME: Vector select
}
// Vectorized approximation of 2^x per lane.
// The (clamped) input is split into an integer part, which is written directly into the
// exponent field of a float, and a remainder, for which a degree-5 polynomial is evaluated;
// the two factors are multiplied together.
Float4 Exp2(RValue<Float4> x)
{
// This implementation is based on 2^(i + f) = 2^i * 2^f,
// where i is the integer part of x and f is the fraction.
// For 2^i we can put the integer part directly in the exponent of
// the IEEE-754 floating-point number. Clamp to prevent overflow
// past the representation of infinity.
Float4 x0 = x;
x0 = Min(x0, As<Float4>(Int4(0x43010000))); // 129.00000e+0f
x0 = Max(x0, As<Float4>(Int4(0xC2FDFFFF))); // -126.99999e+0f
// Integer part, obtained by rounding (x0 - 0.5) to an integer
Int4 i = RoundInt(x0 - Float4(0.5f));
Float4 ii = As<Float4>((i + Int4(127)) << 23); // Add single-precision bias, and shift into exponent.
// For the fractional part use a polynomial
// which approximates 2^f in the 0 to 1 range.
Float4 f = x0 - Float4(i);
// Horner evaluation; coefficients are given as bit patterns with their decimal value alongside
Float4 ff = As<Float4>(Int4(0x3AF61905)); // 1.8775767e-3f
ff = ff * f + As<Float4>(Int4(0x3C134806)); // 8.9893397e-3f
ff = ff * f + As<Float4>(Int4(0x3D64AA23)); // 5.5826318e-2f
ff = ff * f + As<Float4>(Int4(0x3E75EAD4)); // 2.4015361e-1f
ff = ff * f + As<Float4>(Int4(0x3F31727B)); // 6.9315308e-1f
ff = ff * f + Float4(1.0f);
// 2^x = 2^i * 2^f
return ii * ff;
}
// Vectorized approximation of log2(x) per lane.
// The float is split into its exponent field (converted to a number) and its mantissa
// (rebuilt as a value in [1, 2)); a rational polynomial of the mantissa is added to the
// exponent. Both bit masks exclude the sign bit, so the sign of x does not take part in
// the computation. Lanes whose bits equal +infinity return x unchanged.
Float4 Log2(RValue<Float4> x)
{
Float4 x0;
Float4 x1;
Float4 x2;
Float4 x3;
x0 = x;
// Isolate the 8 exponent bits, shift them down into the top of the mantissa field and
// attach the exponent of 1.0f: x1 becomes 1 + biased_exponent / 256
x1 = As<Float4>(As<Int4>(x0) & Int4(0x7F800000));
x1 = As<Float4>(As<UInt4>(x1) >> 8);
x1 = As<Float4>(As<Int4>(x1) | As<Int4>(Float4(1.0f)));
// 1.4960938 = 1 + 127/256, so this yields (biased_exponent - 127), the unbiased exponent
x1 = (x1 - Float4(1.4960938f)) * Float4(256.0f); // FIXME: (x1 - 1.4960938f) * 256.0f;
// Keep the 23 mantissa bits and attach the exponent of 1.0f: x0 becomes a value in [1, 2)
x0 = As<Float4>((As<Int4>(x0) & Int4(0x007FFFFF)) | As<Int4>(Float4(1.0f)));
// Rational approximation: numerator (degree 2) over denominator (degree 3), both in x0
x2 = (Float4(9.5428179e-2f) * x0 + Float4(4.7779095e-1f)) * x0 + Float4(1.9782813e-1f);
x3 = ((Float4(1.6618466e-2f) * x0 + Float4(2.0350508e-1f)) * x0 + Float4(2.7382900e-1f)) * x0 + Float4(4.0496687e-2f);
x2 /= x3;
// exponent + (mantissa - 1) * rational(mantissa)
x1 += (x0 - Float4(1.0f)) * x2;
// Pass +infinity through unchanged
Int4 pos_inf_x = CmpEQ(As<Int4>(x), Int4(0x7F800000));
return As<Float4>((pos_inf_x & As<Int4>(x)) | (~pos_inf_x & As<Int4>(x1)));
}
// Vectorized approximation of e^x, computed as 2^(x / ln(2)) via optimal::Exp2.
Float4 Exp(RValue<Float4> x)
{
	// TODO: Propagate the constant
	const Float4 inv_ln2(1.44269504f);  // 1/ln(2)
	return optimal::Exp2(inv_ln2 * x);
}
// Vectorized approximation of the natural logarithm, computed as ln(2) * log2(x) via optimal::Log2.
Float4 Log(RValue<Float4> x)
{
	// TODO: Propagate the constant
	const Float4 ln2(6.93147181e-1f);  // ln(2)
	return ln2 * optimal::Log2(x);
}
// Vectorized approximation of x^y, computed as 2^(log2(x) * y) using the
// optimal::Log2 and optimal::Exp2 approximations.
Float4 Pow(RValue<Float4> x, RValue<Float4> y)
{
	Float4 scaledExponent = optimal::Log2(x) * y;
	return optimal::Exp2(scaledExponent);
}
// Vectorized hyperbolic sine approximation: (e^x - e^-x) / 2, built on optimal::Exp.
Float4 Sinh(RValue<Float4> x)
{
	Float4 expPos = optimal::Exp(x);
	Float4 expNeg = optimal::Exp(-x);
	return (expPos - expNeg) * Float4(0.5f);
}
// Vectorized hyperbolic cosine approximation: (e^x + e^-x) / 2, built on optimal::Exp.
Float4 Cosh(RValue<Float4> x)
{
	Float4 expPos = optimal::Exp(x);
	Float4 expNeg = optimal::Exp(-x);
	return (expPos + expNeg) * Float4(0.5f);
}
// Vectorized hyperbolic tangent approximation: (e^x - e^-x) / (e^x + e^-x), built on optimal::Exp.
Float4 Tanh(RValue<Float4> x)
{
	Float4 expPos = optimal::Exp(x);
	Float4 expNeg = optimal::Exp(-x);

	Float4 difference = expPos - expNeg;
	Float4 sum = expPos + expNeg;
	return difference / sum;
}
// Vectorized inverse hyperbolic sine approximation: ln(x + sqrt(x^2 + 1)), built on optimal::Log.
Float4 Asinh(RValue<Float4> x)
{
	Float4 root = Sqrt(x * x + Float4(1.0f));
	return optimal::Log(x + root);
}
// Vectorized inverse hyperbolic cosine approximation: ln(x + sqrt(x + 1) * sqrt(x - 1)),
// built on optimal::Log.
Float4 Acosh(RValue<Float4> x)
{
	Float4 rootProduct = Sqrt(x + Float4(1.0f)) * Sqrt(x - Float4(1.0f));
	return optimal::Log(x + rootProduct);
}
// Vectorized inverse hyperbolic tangent approximation: ln((1 + x) / (1 - x)) / 2,
// built on optimal::Log.
Float4 Atanh(RValue<Float4> x)
{
	Float4 ratio = (Float4(1.0f) + x) / (Float4(1.0f) - x);
	return optimal::Log(ratio) * Float4(0.5f);
}
} // namespace optimal
} // namespace rr
\ No newline at end of file
// Copyright 2020 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef rr_OptimalIntrinsics_hpp
#define rr_OptimalIntrinsics_hpp

#include "Reactor.hpp"

// Implementation of intrinsic functions that purport to be as optimal as
// possible, in contrast to the rr::emulated versions, typically by
// implementing approximations of the same math functions.
// All functions operate per lane on a Float4 and return a Float4.
namespace rr {
namespace optimal {

// Trigonometric functions (argument in radians)
Float4 Sin(RValue<Float4> x);
Float4 Cos(RValue<Float4> x);
Float4 Tan(RValue<Float4> x);

// Inverse sine/cosine, available as a 4-term (cheaper) and an 8-term
// (more precise) polynomial approximation
Float4 Asin_4_terms(RValue<Float4> x);
Float4 Asin_8_terms(RValue<Float4> x);
Float4 Acos_4_terms(RValue<Float4> x);
Float4 Acos_8_terms(RValue<Float4> x);

// Inverse tangent; Atan2 takes the y coordinate first, then x
Float4 Atan(RValue<Float4> x);
Float4 Atan2(RValue<Float4> y, RValue<Float4> x);

// Exponentials and logarithms; Exp, Log and Pow are built on Exp2 and Log2
Float4 Exp2(RValue<Float4> x);
Float4 Log2(RValue<Float4> x);
Float4 Exp(RValue<Float4> x);
Float4 Log(RValue<Float4> x);
Float4 Pow(RValue<Float4> x, RValue<Float4> y);

// Hyperbolic functions and their inverses, built on Exp and Log
Float4 Sinh(RValue<Float4> x);
Float4 Cosh(RValue<Float4> x);
Float4 Tanh(RValue<Float4> x);
Float4 Asinh(RValue<Float4> x);
Float4 Acosh(RValue<Float4> x);
Float4 Atanh(RValue<Float4> x);

}  // namespace optimal
}  // namespace rr

#endif  // rr_OptimalIntrinsics_hpp
...@@ -2345,13 +2345,20 @@ RValue<Float4> Frac(RValue<Float4> x); ...@@ -2345,13 +2345,20 @@ RValue<Float4> Frac(RValue<Float4> x);
RValue<Float4> Floor(RValue<Float4> x); RValue<Float4> Floor(RValue<Float4> x);
RValue<Float4> Ceil(RValue<Float4> x); RValue<Float4> Ceil(RValue<Float4> x);
enum class Precision
{
Full,
Relaxed,
//Half,
};
// Trigonometric functions // Trigonometric functions
// TODO: Currently unimplemented for Subzero. // TODO: Currently unimplemented for Subzero.
RValue<Float4> Sin(RValue<Float4> x); RValue<Float4> Sin(RValue<Float4> x);
RValue<Float4> Cos(RValue<Float4> x); RValue<Float4> Cos(RValue<Float4> x);
RValue<Float4> Tan(RValue<Float4> x); RValue<Float4> Tan(RValue<Float4> x);
RValue<Float4> Asin(RValue<Float4> x); RValue<Float4> Asin(RValue<Float4> x, Precision p);
RValue<Float4> Acos(RValue<Float4> x); RValue<Float4> Acos(RValue<Float4> x, Precision p);
RValue<Float4> Atan(RValue<Float4> x); RValue<Float4> Atan(RValue<Float4> x);
RValue<Float4> Sinh(RValue<Float4> x); RValue<Float4> Sinh(RValue<Float4> x);
RValue<Float4> Cosh(RValue<Float4> x); RValue<Float4> Cosh(RValue<Float4> x);
......
...@@ -2050,6 +2050,11 @@ using IntrinsicTestParams_Float = IntrinsicTestParams<RValue<Float>(RValue<Float ...@@ -2050,6 +2050,11 @@ using IntrinsicTestParams_Float = IntrinsicTestParams<RValue<Float>(RValue<Float
using IntrinsicTestParams_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>), float(float), float>; using IntrinsicTestParams_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>), float(float), float>;
using IntrinsicTestParams_Float4_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>, RValue<Float4>), float(float, float), std::pair<float, float>>; using IntrinsicTestParams_Float4_Float4 = IntrinsicTestParams<RValue<Float4>(RValue<Float4>, RValue<Float4>), float(float, float), std::pair<float, float>>;
// TODO(b/147818976): Each function has its own precision requirements for Vulkan, sometimes broken down
// by input range. These are currently validated by deqp, but we can improve our own tests as well.
// See https://www.khronos.org/registry/vulkan/specs/1.2-extensions/html/vkspec.html#spirvenv-precision-operation
constexpr double INTRINSIC_PRECISION = 1e-4;
struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_Float> struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_Float>
{ {
void test() void test()
...@@ -2064,7 +2069,7 @@ struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_F ...@@ -2064,7 +2069,7 @@ struct IntrinsicTest_Float : public testing::TestWithParam<IntrinsicTestParams_F
for(auto &&v : GetParam().testValues) for(auto &&v : GetParam().testValues)
{ {
SCOPED_TRACE(v); SCOPED_TRACE(v);
EXPECT_FLOAT_EQ(routine(v), GetParam().refFunc(v)); EXPECT_NEAR(routine(v), GetParam().refFunc(v), INTRINSIC_PRECISION);
} }
} }
}; };
...@@ -2145,10 +2150,10 @@ struct IntrinsicTest_Float4 : public testing::TestWithParam<IntrinsicTestParams_ ...@@ -2145,10 +2150,10 @@ struct IntrinsicTest_Float4 : public testing::TestWithParam<IntrinsicTestParams_
SCOPED_TRACE(v); SCOPED_TRACE(v);
float4_value result = invokeRoutine(routine, float4_value{ v }); float4_value result = invokeRoutine(routine, float4_value{ v });
float4_value expected = float4_value{ GetParam().refFunc(v) }; float4_value expected = float4_value{ GetParam().refFunc(v) };
EXPECT_FLOAT_EQ(result.v[0], expected.v[0]); EXPECT_NEAR(result.v[0], expected.v[0], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[1], expected.v[1]); EXPECT_NEAR(result.v[1], expected.v[1], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[2], expected.v[2]); EXPECT_NEAR(result.v[2], expected.v[2], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[3], expected.v[3]); EXPECT_NEAR(result.v[3], expected.v[3], INTRINSIC_PRECISION);
} }
} }
}; };
...@@ -2172,19 +2177,19 @@ struct IntrinsicTest_Float4_Float4 : public testing::TestWithParam<IntrinsicTest ...@@ -2172,19 +2177,19 @@ struct IntrinsicTest_Float4_Float4 : public testing::TestWithParam<IntrinsicTest
SCOPED_TRACE(v); SCOPED_TRACE(v);
float4_value result = invokeRoutine(routine, float4_value{ v.first }, float4_value{ v.second }); float4_value result = invokeRoutine(routine, float4_value{ v.first }, float4_value{ v.second });
float4_value expected = float4_value{ GetParam().refFunc(v.first, v.second) }; float4_value expected = float4_value{ GetParam().refFunc(v.first, v.second) };
EXPECT_FLOAT_EQ(result.v[0], expected.v[0]); EXPECT_NEAR(result.v[0], expected.v[0], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[1], expected.v[1]); EXPECT_NEAR(result.v[1], expected.v[1], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[2], expected.v[2]); EXPECT_NEAR(result.v[2], expected.v[2], INTRINSIC_PRECISION);
EXPECT_FLOAT_EQ(result.v[3], expected.v[3]); EXPECT_NEAR(result.v[3], expected.v[3], INTRINSIC_PRECISION);
} }
} }
}; };
// clang-format off // clang-format off
INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float, IntrinsicTest_Float, testing::Values( INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float, IntrinsicTest_Float, testing::Values(
IntrinsicTestParams_Float{ [](Float v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, 12345.f} }, IntrinsicTestParams_Float{ [](Float v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, 123.f} },
IntrinsicTestParams_Float{ [](Float v) { return rr::Log2(v); }, log2f, {0.f, 1.f, 12345.f} }, IntrinsicTestParams_Float{ [](Float v) { return rr::Log2(v); }, log2f, {1.f, 123.f} },
IntrinsicTestParams_Float{ [](Float v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, 12345.f} } IntrinsicTestParams_Float{ [](Float v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, 123.f} }
)); ));
// clang-format on // clang-format on
...@@ -2201,30 +2206,30 @@ float vulkan_coshf(float a) ...@@ -2201,30 +2206,30 @@ float vulkan_coshf(float a)
// clang-format off // clang-format off
constexpr float PI = 3.141592653589793f; constexpr float PI = 3.141592653589793f;
INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4, IntrinsicTest_Float4, testing::Values( INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4, IntrinsicTest_Float4, testing::Values(
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sin(v); }, sinf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sin(v); }, sinf, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cos(v); }, cosf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cos(v); }, cosf, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tan(v); }, tanf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tan(v); }, tanf, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asin(v); }, asinf, {0.f, 1.f, -1.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asin(v, Precision::Full); }, asinf, {0.f, 1.f, -1.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acos(v); }, acosf, {0.f, 1.f, -1.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acos(v, Precision::Full); }, acosf, {0.f, 1.f, -1.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atan(v); }, atanf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atan(v); }, atanf, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sinh(v); }, vulkan_sinhf, {0.f, 1.f, PI, 12345.f, 0x1.65a84ep6} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sinh(v); }, vulkan_sinhf, {0.f, 1.f, PI} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cosh(v); }, vulkan_coshf, {0.f, 1.f, PI, 12345.f, 0x1.65a84ep6} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Cosh(v); }, vulkan_coshf, {0.f, 1.f, PI} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tanh(v); }, tanhf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Tanh(v); }, tanhf, {0.f, 1.f, PI} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asinh(v); }, asinhf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Asinh(v); }, asinhf, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acosh(v); }, acoshf, { 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Acosh(v); }, acoshf, { 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atanh(v); }, atanhf, {0.f, 1.f, -1.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Atanh(v); }, atanhf, {0.f, 0.9999f, -0.9999f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp(v); }, expf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp(v); }, expf, {0.f, 1.f, PI} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log(v); }, logf, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log(v); }, logf, {1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Exp2(v); }, exp2f, {0.f, 1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log2(v); }, log2f, {0.f, 1.f, PI, 12345.f} }, IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Log2(v); }, log2f, {1.f, PI, 123.f} },
IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, PI, 12345.f} } IntrinsicTestParams_Float4{ [](RValue<Float4> v) { return rr::Sqrt(v); }, sqrtf, {0.f, 1.f, PI, 123.f} }
)); ));
// clang-format on // clang-format on
// clang-format off // clang-format off
INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4_Float4, IntrinsicTest_Float4_Float4, testing::Values( INSTANTIATE_TEST_SUITE_P(IntrinsicTestParams_Float4_Float4, IntrinsicTest_Float4_Float4, testing::Values(
IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Atan2(v1, v2); }, atan2f, { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {12345.f, 12345.f} } }, IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Atan2(v1, v2); }, atan2f, { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {123.f, 123.f} } },
IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Pow(v1, v2); }, powf, { {0.f, 0.f}, {0.f, -1.f}, {-1.f, 0.f}, {12345.f, 12345.f} } } IntrinsicTestParams_Float4_Float4{ [](RValue<Float4> v1, RValue<Float4> v2) { return Pow(v1, v2); }, powf, { {1.f, 0.f}, {1.f, -1.f}, {-1.f, 0.f} } }
)); ));
// clang-format on // clang-format on
......
...@@ -13,7 +13,8 @@ ...@@ -13,7 +13,8 @@
// limitations under the License. // limitations under the License.
#include "Debug.hpp" #include "Debug.hpp"
#include "EmulatedReactor.hpp" #include "EmulatedIntrinsics.hpp"
#include "OptimalIntrinsics.hpp"
#include "Print.hpp" #include "Print.hpp"
#include "Reactor.hpp" #include "Reactor.hpp"
#include "ReactorDebugInfo.hpp" #include "ReactorDebugInfo.hpp"
...@@ -4258,109 +4259,115 @@ RValue<Float> Log2(RValue<Float> x) ...@@ -4258,109 +4259,115 @@ RValue<Float> Log2(RValue<Float> x)
RValue<Float4> Sin(RValue<Float4> x) RValue<Float4> Sin(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Sin(x); return optimal::Sin(x);
} }
RValue<Float4> Cos(RValue<Float4> x) RValue<Float4> Cos(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Cos(x); return optimal::Cos(x);
} }
RValue<Float4> Tan(RValue<Float4> x) RValue<Float4> Tan(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Tan(x); return optimal::Tan(x);
} }
RValue<Float4> Asin(RValue<Float4> x) RValue<Float4> Asin(RValue<Float4> x, Precision p)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
if(p == Precision::Full)
{
return emulated::Asin(x); return emulated::Asin(x);
}
return optimal::Asin_8_terms(x);
} }
RValue<Float4> Acos(RValue<Float4> x) RValue<Float4> Acos(RValue<Float4> x, Precision p)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Acos(x); // Surprisingly, deqp-vk's precision.acos.highp/mediump tests pass when using the 4-term polynomial approximation
// version of acos, unlike for Asin, which requires higher precision algorithms.
return optimal::Acos_4_terms(x);
} }
RValue<Float4> Atan(RValue<Float4> x) RValue<Float4> Atan(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Atan(x); return optimal::Atan(x);
} }
RValue<Float4> Sinh(RValue<Float4> x) RValue<Float4> Sinh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Sinh(x); return optimal::Sinh(x);
} }
RValue<Float4> Cosh(RValue<Float4> x) RValue<Float4> Cosh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Cosh(x); return optimal::Cosh(x);
} }
RValue<Float4> Tanh(RValue<Float4> x) RValue<Float4> Tanh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Tanh(x); return optimal::Tanh(x);
} }
RValue<Float4> Asinh(RValue<Float4> x) RValue<Float4> Asinh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Asinh(x); return optimal::Asinh(x);
} }
RValue<Float4> Acosh(RValue<Float4> x) RValue<Float4> Acosh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Acosh(x); return optimal::Acosh(x);
} }
RValue<Float4> Atanh(RValue<Float4> x) RValue<Float4> Atanh(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Atanh(x); return optimal::Atanh(x);
} }
RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y) RValue<Float4> Atan2(RValue<Float4> x, RValue<Float4> y)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Atan2(x, y); return optimal::Atan2(x, y);
} }
RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y) RValue<Float4> Pow(RValue<Float4> x, RValue<Float4> y)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Pow(x, y); return optimal::Pow(x, y);
} }
RValue<Float4> Exp(RValue<Float4> x) RValue<Float4> Exp(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Exp(x); return optimal::Exp(x);
} }
RValue<Float4> Log(RValue<Float4> x) RValue<Float4> Log(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Log(x); return optimal::Log(x);
} }
RValue<Float4> Exp2(RValue<Float4> x) RValue<Float4> Exp2(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Exp2(x); return optimal::Exp2(x);
} }
RValue<Float4> Log2(RValue<Float4> x) RValue<Float4> Log2(RValue<Float4> x)
{ {
RR_DEBUG_INFO_UPDATE_LOC(); RR_DEBUG_INFO_UPDATE_LOC();
return emulated::Log2(x); return optimal::Log2(x);
} }
RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef) RValue<UInt> Ctlz(RValue<UInt> x, bool isZeroUndef)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment