Fix back-face culling for vertices near w=0

When the homogeneous w coordinate is near 0, the projected x or y coordinates (after viewport and subpixel scaling) may be larger than what's representable by 32-bit signed integers. On x86 this produces the "indefinite" integer value 0x80000000. This negative value is returned even for positive inputs. This causes the back-face culling calculations to reject triangles which are in fact visible. This change addresses that by introducing the rr::RoundIntClamped() function, which guarantees that very large positive floating-point values are converted to a positive integer value near INT_MAX. Note that the Vulkan spec states that "[the determination whether the triangle is back-facing or front-facing] is made based on the sign of the (clipped or unclipped) polygon’s area computed in framebuffer coordinates." Thus the alternative of performing culling using the unprojected coordinates would have to take negative viewport dimensions into account, and may not produce the same results as sub-pixel snapped coordinates. Adjust back-face culling of wireframe triangles to use the same calculations as solid triangles. Bug: b/177382194 Change-Id: Ice8493d4cfd164638f091bf5a39b5396d65458e2 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/51648Tested-by: Nicolas Capens <nicolascapens@google.com> Reviewed-by: Alexis Hétu <sugoi@google.com>

Fix back-face culling for vertices near w=0
eeb81843 · Nicolas Capens · Nicolas Capens · 3549479d · eeb81843 · eeb81843
Commit eeb81843 authored Jan 12, 2021 by Nicolas Capens Committed by Nicolas Capens Jan 13, 2021
7 changed files
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -712,11 +712,18 @@ int DrawCall::setupWireframeTriangles(Triangle *triangles, Primitive *primitives
 		const Vertex &v1 = triangles[i].v1;
 		const Vertex &v2 = triangles[i].v2;
-		float d = (v0.y * v1.x - v0.x * v1.y) * v2.w +
+		float A = ((float)v0.projected.y - (float)v2.projected.y) * (float)v1.projected.x +
-		          (v0.x * v2.y - v0.y * v2.x) * v1.w +
+		          ((float)v2.projected.y - (float)v1.projected.y) * (float)v0.projected.x +
-		          (v2.x * v1.y - v1.x * v2.y) * v0.w;
+		          ((float)v1.projected.y - (float)v0.projected.y) * (float)v2.projected.x;  // Area
+		int w0w1w2 = bit_cast<int>(v0.w) ^
+		             bit_cast<int>(v1.w) ^
+		             bit_cast<int>(v2.w);
+		A = w0w1w2 < 0 ? -A : A;
+		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
-		bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (d > 0) : (d < 0);
 		if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 		{
 			if(frontFacing) continue;

--- a/src/Pipeline/SetupRoutine.cpp
+++ b/src/Pipeline/SetupRoutine.cpp
@@ -81,18 +81,13 @@ void SetupRoutine::generate()
 			Float A = (y0 - y2) * x1 + (y2 - y1) * x0 + (y1 - y0) * x2;  // Area
-			If(A == 0.0f)
-			{
-				Return(0);
-			}
 			Int w0w1w2 = *Pointer<Int>(v0 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v1 + OFFSET(Vertex, w)) ^
 			             *Pointer<Int>(v2 + OFFSET(Vertex, w));
 			A = IfThenElse(w0w1w2 < 0, -A, A);
-			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? A > 0.0f : A < 0.0f;
+			Bool frontFacing = (state.frontFace == VK_FRONT_FACE_COUNTER_CLOCKWISE) ? (A >= 0.0f) : (A <= 0.0f);
 			if(state.cullMode & VK_CULL_MODE_FRONT_BIT)
 			{

--- a/src/Pipeline/VertexRoutine.cpp
+++ b/src/Pipeline/VertexRoutine.cpp
@@ -546,8 +546,8 @@ void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCac
 	Float4 rhw = Float4(1.0f) / w;
 	Vector4f proj;
-	proj.x = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
+	proj.x = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, X0xF)) + pos.x * rhw * *Pointer<Float4>(data + OFFSET(DrawData, WxF))));
-	proj.y = As<Float4>(RoundInt(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
+	proj.y = As<Float4>(RoundIntClamped(*Pointer<Float4>(data + OFFSET(DrawData, Y0xF)) + pos.y * rhw * *Pointer<Float4>(data + OFFSET(DrawData, HxF))));
 	proj.z = pos.z * rhw;
 	proj.w = rhw;

--- a/src/Reactor/LLVMReactor.cpp
+++ b/src/Reactor/LLVMReactor.cpp
@@ -2674,6 +2674,21 @@ RValue<Int4> RoundInt(RValue<Float4> cast)
 #endif
 }
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+#if defined(__i386__) || defined(__x86_64__)
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	return x86::cvtps2dq(Min(cast, Float4(0x7FFFFF80)));
+#else
+	// ARM saturates to the largest positive or negative integer. Unit tests
+	// verify that lowerRoundInt() behaves as desired.
+	return As<Int4>(V(lowerRoundInt(V(cast.value()), T(Int4::type()))));
+#endif
+}
 RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

--- a/src/Reactor/Reactor.hpp
+++ b/src/Reactor/Reactor.hpp
@@ -2007,7 +2007,14 @@ inline RValue<Int4> CmpGE(RValue<Int4> x, RValue<Int4> y)
 }
 RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y);
 RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y);
+// Convert to nearest integer. If a converted value is outside of the integer
+// range, the returned result is undefined.
 RValue<Int4> RoundInt(RValue<Float4> cast);
+// Rounds to the nearest integer, but clamps very large values to an
+// implementation-dependent range.
+// Specifically, on x86, values larger than 2147483583.0 are converted to
+// 2147483583 (0x7FFFFFBF) instead of producing 0x80000000.
+RValue<Int4> RoundIntClamped(RValue<Float4> cast);
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y);
 RValue<UShort8> PackUnsigned(RValue<Int4> x, RValue<Int4> y);
 RValue<Int> Extract(RValue<Int4> val, int i);

--- a/src/Reactor/SubzeroReactor.cpp
+++ b/src/Reactor/SubzeroReactor.cpp
@@ -3589,6 +3589,33 @@ RValue<Int4> RoundInt(RValue<Float4> cast)
 	}
 }
+RValue<Int4> RoundIntClamped(RValue<Float4> cast)
+{
+	RR_DEBUG_INFO_UPDATE_LOC();
+	// cvtps2dq produces 0x80000000, a negative value, for input larger than
+	// 2147483520.0, so clamp to 2147483520. Values less than -2147483520.0
+	// saturate to 0x80000000.
+	RValue<Float4> clamped = Min(cast, Float4(0x7FFFFF80));
+	if(emulateIntrinsics || CPUID::ARM)
+	{
+		// Push the fractional part off the mantissa. Accurate up to +/-2^22.
+		return Int4((clamped + Float4(0x00C00000)) - Float4(0x00C00000));
+	}
+	else
+	{
+		Ice::Variable *result = ::function->makeVariable(Ice::IceType_v4i32);
+		const Ice::Intrinsics::IntrinsicInfo intrinsic = { Ice::Intrinsics::Nearbyint, Ice::Intrinsics::SideEffects_F, Ice::Intrinsics::ReturnsTwice_F, Ice::Intrinsics::MemoryWrite_F };
+		auto target = ::context->getConstantUndef(Ice::IceType_i32);
+		auto nearbyint = Ice::InstIntrinsicCall::create(::function, 1, result, target, intrinsic);
+		nearbyint->addArg(clamped.value());
+		::basicBlock->appendInst(nearbyint);
+		return RValue<Int4>(V(result));
+	}
+}
 RValue<Short8> PackSigned(RValue<Int4> x, RValue<Int4> y)
 {
 	RR_DEBUG_INFO_UPDATE_LOC();

--- a/tests/ReactorUnitTests/ReactorUnitTests.cpp
+++ b/tests/ReactorUnitTests/ReactorUnitTests.cpp
@@ -780,6 +780,40 @@ TEST(ReactorUnitTests, NotNeg)
 	EXPECT_EQ(out[8][3], 0x00000000u);
 }
+TEST(ReactorUnitTests, RoundInt)
+{
+	FunctionT<int(void *)> function;
+	{
+		Pointer<Byte> out = function.Arg<0>();
+		*Pointer<Int4>(out + 0) = RoundInt(Float4(3.1f, 3.6f, -3.1f, -3.6f));
+		*Pointer<Int4>(out + 16) = RoundIntClamped(Float4(2147483648.0f, -2147483648.0f, 2147483520, -2147483520));
+		Return(0);
+	}
+	auto routine = function(testName().c_str());
+	int out[2][4];
+	memset(&out, 0, sizeof(out));
+	routine(&out);
+	EXPECT_EQ(out[0][0], 3);
+	EXPECT_EQ(out[0][1], 4);
+	EXPECT_EQ(out[0][2], -3);
+	EXPECT_EQ(out[0][3], -4);
+	// x86 returns 0x80000000 for values which cannot be represented in a 32-bit
+	// integer, but RoundIntClamped() clamps to ensure a positive value for
+	// positive input. ARM saturates to the largest representable integers.
+	EXPECT_GE(out[1][0], 2147483520);
+	EXPECT_LT(out[1][1], -2147483647);
+	EXPECT_EQ(out[1][2], 2147483520);
+	EXPECT_EQ(out[1][3], -2147483520);
+}
 TEST(ReactorUnitTests, FPtoUI)
 {
 	FunctionT<int(void *)> function;