Commit 05b3d665 by Nicolas Capens

Correct reciprocal approximation for power-of-two values.

Intel's reciprocal approximation instruction is not exact for power-of-two values. It provides 12 bits of mantissa precision and keeps a balance between positive and negative errors, but as a result the approximated reciprocal of 2^x is not exactly 2^-x. This affects conformance tests which expect varyings not to be affected by the perspective division. Correct for this by multiplying the approximation by a constant factor: the inverse of the approximated reciprocal of 1.0.

Bug 27165393
Change-Id: Ie52ec511a14a4f447adc47ce9c875bbad03cd274
Reviewed-on: https://swiftshader-review.googlesource.com/4903
Tested-by: Nicolas Capens <capn@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Reviewed-by: Nicolas Capens <capn@google.com>
parent 407813b4
...@@ -33,6 +33,7 @@ ...@@ -33,6 +33,7 @@
#include "Thread.hpp" #include "Thread.hpp"
#include "Memory.hpp" #include "Memory.hpp"
#include <xmmintrin.h>
#include <fstream> #include <fstream>
#if defined(__x86_64__) && defined(_WIN32) #if defined(__x86_64__) && defined(_WIN32)
...@@ -4635,9 +4636,9 @@ namespace sw ...@@ -4635,9 +4636,9 @@ namespace sw
Constant *shuffle[2]; Constant *shuffle[2];
shuffle[0] = Nucleus::createConstantInt(0); shuffle[0] = Nucleus::createConstantInt(0);
shuffle[1] = Nucleus::createConstantInt(1); shuffle[1] = Nucleus::createConstantInt(1);
Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, VectorType::get(Int::getType(), 1)), Nucleus::createBitCast(hi.value, VectorType::get(Int::getType(), 1)), Nucleus::createConstantVector(shuffle, 2)); Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, VectorType::get(Int::getType(), 1)), Nucleus::createBitCast(hi.value, VectorType::get(Int::getType(), 1)), Nucleus::createConstantVector(shuffle, 2));
storeValue(Nucleus::createBitCast(packed, Int2::getType())); storeValue(Nucleus::createBitCast(packed, Int2::getType()));
} }
} }
...@@ -5199,7 +5200,7 @@ namespace sw ...@@ -5199,7 +5200,7 @@ namespace sw
Value *element = Nucleus::createBitCast(cast.value, Long::getType()); Value *element = Nucleus::createBitCast(cast.value, Long::getType());
long2 = Nucleus::createInsertElement(long2, element, 0); long2 = Nucleus::createInsertElement(long2, element, 0);
RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType())); RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
if(CPUID::supportsSSE4_1()) if(CPUID::supportsSSE4_1())
{ {
storeValue(x86::pmovsxwd(vector).value); storeValue(x86::pmovsxwd(vector).value);
...@@ -6069,9 +6070,18 @@ namespace sw ...@@ -6069,9 +6070,18 @@ namespace sw
return IfThenElse(x < y, x, y); return IfThenElse(x < y, x, y);
} }
RValue<Float> Rcp_pp(RValue<Float> x) RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
{ {
return x86::rcpss(x); if(exactAtPow2)
{
// rcpss uses a piecewise-linear approximation which minimizes the relative error
// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
}
else
{
return x86::rcpss(x);
}
} }
RValue<Float> RcpSqrt_pp(RValue<Float> x) RValue<Float> RcpSqrt_pp(RValue<Float> x)
...@@ -6580,9 +6590,18 @@ namespace sw ...@@ -6580,9 +6590,18 @@ namespace sw
return x86::minps(x, y); return x86::minps(x, y);
} }
RValue<Float4> Rcp_pp(RValue<Float4> x) RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
{ {
return x86::rcpps(x); if(exactAtPow2)
{
// rcpps uses a piecewise-linear approximation which minimizes the relative error
// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
}
else
{
return x86::rcpps(x);
}
} }
RValue<Float4> RcpSqrt_pp(RValue<Float4> x) RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
......
...@@ -1899,7 +1899,7 @@ namespace sw ...@@ -1899,7 +1899,7 @@ namespace sw
RValue<Float> Abs(RValue<Float> x); RValue<Float> Abs(RValue<Float> x);
RValue<Float> Max(RValue<Float> x, RValue<Float> y); RValue<Float> Max(RValue<Float> x, RValue<Float> y);
RValue<Float> Min(RValue<Float> x, RValue<Float> y); RValue<Float> Min(RValue<Float> x, RValue<Float> y);
RValue<Float> Rcp_pp(RValue<Float> val); RValue<Float> Rcp_pp(RValue<Float> val, bool exactAtPow2 = false);
RValue<Float> RcpSqrt_pp(RValue<Float> val); RValue<Float> RcpSqrt_pp(RValue<Float> val);
RValue<Float> Sqrt(RValue<Float> x); RValue<Float> Sqrt(RValue<Float> x);
RValue<Float> Round(RValue<Float> val); RValue<Float> Round(RValue<Float> val);
...@@ -2377,7 +2377,7 @@ namespace sw ...@@ -2377,7 +2377,7 @@ namespace sw
RValue<Float4> Abs(RValue<Float4> x); RValue<Float4> Abs(RValue<Float4> x);
RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y); RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y); RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> Rcp_pp(RValue<Float4> val); RValue<Float4> Rcp_pp(RValue<Float4> val, bool exactAtPow2 = false);
RValue<Float4> RcpSqrt_pp(RValue<Float4> val); RValue<Float4> RcpSqrt_pp(RValue<Float4> val);
RValue<Float4> Sqrt(RValue<Float4> x); RValue<Float4> Sqrt(RValue<Float4> x);
RValue<Float4> Insert(const Float4 &val, RValue<Float> element, int i); RValue<Float4> Insert(const Float4 &val, RValue<Float> element, int i);
......
...@@ -140,7 +140,7 @@ namespace sw ...@@ -140,7 +140,7 @@ namespace sw
if(interpolateW()) if(interpolateW())
{ {
w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false); w = interpolate(xxxx, Dw, rhw, primitive + OFFSET(Primitive,w), false, false);
rhw = reciprocal(w); rhw = reciprocal(w, false, false, true);
if(state.centroid) if(state.centroid)
{ {
...@@ -518,7 +518,7 @@ namespace sw ...@@ -518,7 +518,7 @@ namespace sw
zMask = SignMask(zTest) & cMask; zMask = SignMask(zTest) & cMask;
break; break;
} }
if(state.stencilActive) if(state.stencilActive)
{ {
zMask &= sMask; zMask &= sMask;
...@@ -687,12 +687,12 @@ namespace sw ...@@ -687,12 +687,12 @@ namespace sw
Int pitch; Int pitch;
if(!state.quadLayoutDepthBuffer) if(!state.quadLayoutDepthBuffer)
{ {
buffer = zBuffer + 4 * x; buffer = zBuffer + 4 * x;
pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB)); pitch = *Pointer<Int>(data + OFFSET(DrawData,depthPitchB));
} }
else else
{ {
buffer = zBuffer + 8 * x; buffer = zBuffer + 8 * x;
} }
...@@ -761,7 +761,7 @@ namespace sw ...@@ -761,7 +761,7 @@ namespace sw
} }
Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer))); Byte8 bufferValue = As<Byte8>(Long1(*Pointer<UInt>(buffer)));
Byte8 newValue; Byte8 newValue;
stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask); stencilOperation(newValue, bufferValue, state.stencilPassOperation, state.stencilZFailOperation, state.stencilFailOperation, false, zMask, sMask);
...@@ -945,7 +945,7 @@ namespace sw ...@@ -945,7 +945,7 @@ namespace sw
ASSERT(false); ASSERT(false);
} }
} }
void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive) void PixelRoutine::blendFactorAlpha(const Vector4s &blendFactor, const Vector4s &current, const Vector4s &pixel, BlendFactor blendFactorAlphaActive)
{ {
switch(blendFactorAlphaActive) switch(blendFactorAlphaActive)
...@@ -1170,7 +1170,7 @@ namespace sw ...@@ -1170,7 +1170,7 @@ namespace sw
current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y)); current.y = MulHigh(As<UShort4>(current.y), As<UShort4>(sourceFactor.y));
current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z)); current.z = MulHigh(As<UShort4>(current.z), As<UShort4>(sourceFactor.z));
} }
if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
{ {
pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x)); pixel.x = MulHigh(As<UShort4>(pixel.x), As<UShort4>(destFactor.x));
...@@ -1229,7 +1229,7 @@ namespace sw ...@@ -1229,7 +1229,7 @@ namespace sw
{ {
current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w)); current.w = MulHigh(As<UShort4>(current.w), As<UShort4>(sourceFactor.w));
} }
if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
{ {
pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w)); pixel.w = MulHigh(As<UShort4>(pixel.w), As<UShort4>(destFactor.w));
...@@ -1828,7 +1828,7 @@ namespace sw ...@@ -1828,7 +1828,7 @@ namespace sw
} }
} }
void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive) void PixelRoutine::blendFactor(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorActive)
{ {
switch(blendFactorActive) switch(blendFactorActive)
{ {
...@@ -1899,7 +1899,7 @@ namespace sw ...@@ -1899,7 +1899,7 @@ namespace sw
} }
} }
void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive) void PixelRoutine::blendFactorAlpha(const Vector4f &blendFactor, const Vector4f &oC, const Vector4f &pixel, BlendFactor blendFactorAlphaActive)
{ {
switch(blendFactorAlphaActive) switch(blendFactorAlphaActive)
{ {
...@@ -2041,7 +2041,7 @@ namespace sw ...@@ -2041,7 +2041,7 @@ namespace sw
oC.y *= sourceFactor.y; oC.y *= sourceFactor.y;
oC.z *= sourceFactor.z; oC.z *= sourceFactor.z;
} }
if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO) if(state.destBlendFactor != BLEND_ONE && state.destBlendFactor != BLEND_ZERO)
{ {
pixel.x *= destFactor.x; pixel.x *= destFactor.x;
...@@ -2100,7 +2100,7 @@ namespace sw ...@@ -2100,7 +2100,7 @@ namespace sw
{ {
oC.w *= sourceFactor.w; oC.w *= sourceFactor.w;
} }
if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO) if(state.destBlendFactorAlpha != BLEND_ONE && state.destBlendFactorAlpha != BLEND_ZERO)
{ {
pixel.w *= destFactor.w; pixel.w *= destFactor.w;
...@@ -2118,10 +2118,10 @@ namespace sw ...@@ -2118,10 +2118,10 @@ namespace sw
pixel.w -= oC.w; pixel.w -= oC.w;
oC.w = pixel.w; oC.w = pixel.w;
break; break;
case BLENDOP_MIN: case BLENDOP_MIN:
oC.w = Min(oC.w, pixel.w); oC.w = Min(oC.w, pixel.w);
break; break;
case BLENDOP_MAX: case BLENDOP_MAX:
oC.w = Max(oC.w, pixel.w); oC.w = Max(oC.w, pixel.w);
break; break;
case BLENDOP_SOURCE: case BLENDOP_SOURCE:
...@@ -2272,7 +2272,7 @@ namespace sw ...@@ -2272,7 +2272,7 @@ namespace sw
masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked)); oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(masked));
} }
oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16)); oC.x = As<Float4>(As<Int4>(oC.x) & *Pointer<Int4>(constants + OFFSET(Constants,maskX0X) + xMask * 16, 16));
value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16)); value = As<Float4>(As<Int4>(value) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskX0X) + xMask * 16, 16));
oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value)); oC.x = As<Float4>(As<Int4>(oC.x) | As<Int4>(value));
...@@ -2283,7 +2283,7 @@ namespace sw ...@@ -2283,7 +2283,7 @@ namespace sw
value = *Pointer<Float4>(buffer + 16, 16); value = *Pointer<Float4>(buffer + 16, 16);
if(rgbaWriteMask != 0x0000000F) if(rgbaWriteMask != 0x0000000F)
{ {
Float4 masked = value; Float4 masked = value;
oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0]))); oC.y = As<Float4>(As<Int4>(oC.y) & *Pointer<Int4>(constants + OFFSET(Constants,maskD4X[rgbaWriteMask][0])));
masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0]))); masked = As<Float4>(As<Int4>(masked) & *Pointer<Int4>(constants + OFFSET(Constants,invMaskD4X[rgbaWriteMask][0])));
......
...@@ -271,7 +271,7 @@ namespace sw ...@@ -271,7 +271,7 @@ namespace sw
return exponential2(log, pp); return exponential2(log, pp);
} }
Float4 reciprocal(RValue<Float4> x, bool pp, bool finite) Float4 reciprocal(RValue<Float4> x, bool pp, bool finite, bool exactAtPow2)
{ {
Float4 rcp; Float4 rcp;
...@@ -281,7 +281,7 @@ namespace sw ...@@ -281,7 +281,7 @@ namespace sw
} }
else else
{ {
rcp = Rcp_pp(x); rcp = Rcp_pp(x, exactAtPow2);
if(!pp) if(!pp)
{ {
......
...@@ -87,7 +87,7 @@ namespace sw ...@@ -87,7 +87,7 @@ namespace sw
Float4 exponential(RValue<Float4> x, bool pp = false); Float4 exponential(RValue<Float4> x, bool pp = false);
Float4 logarithm(RValue<Float4> x, bool abs, bool pp = false); Float4 logarithm(RValue<Float4> x, bool abs, bool pp = false);
Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false); Float4 power(RValue<Float4> x, RValue<Float4> y, bool pp = false);
Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false); Float4 reciprocal(RValue<Float4> x, bool pp = false, bool finite = false, bool exactAtPow2 = false);
Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false); Float4 reciprocalSquareRoot(RValue<Float4> x, bool abs, bool pp = false);
Float4 modulo(RValue<Float4> x, RValue<Float4> y); Float4 modulo(RValue<Float4> x, RValue<Float4> y);
Float4 sine_pi(RValue<Float4> x, bool pp = false); // limited to [-pi, pi] range Float4 sine_pi(RValue<Float4> x, bool pp = false); // limited to [-pi, pi] range
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment