Add unit test for 11/10-bit minifloat denormal underflow

The reference code for conversion from 32-bit floating-point to 11- and 10-bit minifloat formats supports producing denormals for values smaller than what can be represented as normalized values. The arithmetic can underflow to produce zero for values too small to be represented as denormals. This arithmetic contains a 32-bit shift operation which can shift by 32 bits or more, which is undefined behavior in C++ but produces zero on x86 processors. This change adds unit tests for the intended behavior around the cutoff between the smallest denormal and zero, to help validate the fix for the UB in a future change.

Bug: b/147900455
Bug: chromium:1117433
Change-Id: Ic5e495dd822231d52a5551ee12733a616728d486
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48068
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>

Add unit test for 11/10-bit minifloat denormal underflow
558540fe · Nicolas Capens · Nicolas Capens · 2d5bbdc4 · 558540fe · 558540fe
Commit 558540fe authored Aug 21, 2020 by Nicolas Capens Committed by Nicolas Capens Sep 01, 2020
Hide whitespace changes
Inline Side-by-side

Showing with 41 additions and 28 deletions

Half.hpp src/System/Half.hpp +29 -28

unittests.cpp tests/MathUnitTests/unittests.cpp +12 -0

No files found.
--- a/src/System/Half.hpp
+++ b/src/System/Half.hpp
@@ -128,9 +128,25 @@ public:
 class R11G11B10F
 {
-	unsigned int R : 11;
+public:
-	unsigned int G : 11;
+	R11G11B10F(float rgb[3])
-	unsigned int B : 10;
+	{
+		R = float32ToFloat11(rgb[0]);
+		G = float32ToFloat11(rgb[1]);
+		B = float32ToFloat10(rgb[2]);
+	}
+	operator unsigned int() const
+	{
+		return *reinterpret_cast<const unsigned int *>(this);
+	}
+	void toRGB16F(half rgb[3]) const
+	{
+		rgb[0] = float11ToFloat16(R);
+		rgb[1] = float11ToFloat16(G);
+		rgb[2] = float10ToFloat16(B);
+	}
 	static inline half float11ToFloat16(unsigned short fp11)
 	{
@@ -142,7 +158,7 @@ class R11G11B10F
 		return shortAsHalf(fp10 << 5);  // Sign bit 0
 	}
-	inline unsigned short float32ToFloat11(float fp32)
+	static inline unsigned short float32ToFloat11(float fp32)
 	{
 		const unsigned int float32MantissaMask = 0x7FFFFF;
 		const unsigned int float32ExponentMask = 0x7F800000;
@@ -215,7 +231,7 @@ class R11G11B10F
 		}
 	}
-	inline unsigned short float32ToFloat10(float fp32)
+	static inline unsigned short float32ToFloat10(float fp32)
 	{
 		const unsigned int float32MantissaMask = 0x7FFFFF;
 		const unsigned int float32ExponentMask = 0x7F800000;
@@ -249,7 +265,7 @@ class R11G11B10F
 			}
 			else if(float32Sign)
 			{
-				// -INF is clamped to 0 since float11 is positive only
+				// -INF is clamped to 0 since float10 is positive only
 				return 0;
 			}
 			else
@@ -264,14 +280,14 @@ class R11G11B10F
 		}
 		else if(float32Val > float32Maxfloat10)
 		{
-			// The number is too large to be represented as a float11, set to max
+			// The number is too large to be represented as a float10, set to max
 			return float10Max;
 		}
 		else
 		{
 			if(float32Val < float32Minfloat10)
 			{
-				// The number is too small to be represented as a normalized float11
+				// The number is too small to be represented as a normalized float10
 				// Convert it to a denormalized value.
 				const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
 				                           (float32Val >> float32ExponentFirstBit);
@@ -280,7 +296,7 @@ class R11G11B10F
 			}
 			else
 			{
-				// Rebias the exponent to represent the value as a normalized float11
+				// Rebias the exponent to represent the value as a normalized float10
 				float32Val += 0xC8000000;
 			}
@@ -288,25 +304,10 @@ class R11G11B10F
 		}
 	}
-public:
+private:
-	R11G11B10F(float rgb[3])
+	unsigned int R : 11;
-	{
+	unsigned int G : 11;
-		R = float32ToFloat11(rgb[0]);
+	unsigned int B : 10;
-		G = float32ToFloat11(rgb[1]);
-		B = float32ToFloat10(rgb[2]);
-	}
-	operator unsigned int() const
-	{
-		return *reinterpret_cast<const unsigned int *>(this);
-	}
-	void toRGB16F(half rgb[3]) const
-	{
-		rgb[0] = float11ToFloat16(R);
-		rgb[1] = float11ToFloat16(G);
-		rgb[2] = float10ToFloat16(B);
-	}
 };
 }  // namespace sw

--- a/tests/MathUnitTests/unittests.cpp
+++ b/tests/MathUnitTests/unittests.cpp
@@ -21,6 +21,18 @@
 using namespace sw;
+TEST(MathTest, UnsignedFloat11_10)
+{
+	// Test the largest value which causes underflow to 0, and the smallest value
+	// which produces a denormalized result.
+	EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x3500007F)), 0x0000);
+	EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x35000080)), 0x0001);
+	EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x3580003F)), 0x0000);
+	EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x35800040)), 0x0001);
+}
 // Clamps to the [0, hi] range. NaN input produces 0, hi must be non-NaN.
 float clamp0hi(float x, float hi)
 {