Commit 558540fe by Nicolas Capens Committed by Nicolas Capens

Add unit test for 11/10-bit minifloat denormal underflow

The reference code for conversion from 32-bit floating-point to the 11- and 10-bit minifloat formats supports producing denormals for values smaller than what can be represented in normalized form. The arithmetic can underflow to produce zero for values too small to be represented as denormals. This arithmetic contains a 32-bit shift operation which can shift by an amount greater than 32, which has undefined behavior in C++ but produces zero on x86 processors. This change adds unit tests for the intended behavior around the cutoff between the smallest denormal and zero, to help validate the fix for the UB in a future change.

Bug: b/147900455
Bug: chromium:1117433
Change-Id: Ic5e495dd822231d52a5551ee12733a616728d486
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/48068
Reviewed-by: Antonio Maiorano <amaiorano@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
parent 2d5bbdc4
...@@ -128,9 +128,25 @@ public: ...@@ -128,9 +128,25 @@ public:
class R11G11B10F class R11G11B10F
{ {
unsigned int R : 11; public:
unsigned int G : 11; R11G11B10F(float rgb[3])
unsigned int B : 10; {
R = float32ToFloat11(rgb[0]);
G = float32ToFloat11(rgb[1]);
B = float32ToFloat10(rgb[2]);
}
operator unsigned int() const
{
return *reinterpret_cast<const unsigned int *>(this);
}
void toRGB16F(half rgb[3]) const
{
rgb[0] = float11ToFloat16(R);
rgb[1] = float11ToFloat16(G);
rgb[2] = float10ToFloat16(B);
}
static inline half float11ToFloat16(unsigned short fp11) static inline half float11ToFloat16(unsigned short fp11)
{ {
...@@ -142,7 +158,7 @@ class R11G11B10F ...@@ -142,7 +158,7 @@ class R11G11B10F
return shortAsHalf(fp10 << 5); // Sign bit 0 return shortAsHalf(fp10 << 5); // Sign bit 0
} }
inline unsigned short float32ToFloat11(float fp32) static inline unsigned short float32ToFloat11(float fp32)
{ {
const unsigned int float32MantissaMask = 0x7FFFFF; const unsigned int float32MantissaMask = 0x7FFFFF;
const unsigned int float32ExponentMask = 0x7F800000; const unsigned int float32ExponentMask = 0x7F800000;
...@@ -215,7 +231,7 @@ class R11G11B10F ...@@ -215,7 +231,7 @@ class R11G11B10F
} }
} }
inline unsigned short float32ToFloat10(float fp32) static inline unsigned short float32ToFloat10(float fp32)
{ {
const unsigned int float32MantissaMask = 0x7FFFFF; const unsigned int float32MantissaMask = 0x7FFFFF;
const unsigned int float32ExponentMask = 0x7F800000; const unsigned int float32ExponentMask = 0x7F800000;
...@@ -249,7 +265,7 @@ class R11G11B10F ...@@ -249,7 +265,7 @@ class R11G11B10F
} }
else if(float32Sign) else if(float32Sign)
{ {
// -INF is clamped to 0 since float11 is positive only // -INF is clamped to 0 since float10 is positive only
return 0; return 0;
} }
else else
...@@ -264,14 +280,14 @@ class R11G11B10F ...@@ -264,14 +280,14 @@ class R11G11B10F
} }
else if(float32Val > float32Maxfloat10) else if(float32Val > float32Maxfloat10)
{ {
// The number is too large to be represented as a float11, set to max // The number is too large to be represented as a float10, set to max
return float10Max; return float10Max;
} }
else else
{ {
if(float32Val < float32Minfloat10) if(float32Val < float32Minfloat10)
{ {
// The number is too small to be represented as a normalized float11 // The number is too small to be represented as a normalized float10
// Convert it to a denormalized value. // Convert it to a denormalized value.
const unsigned int shift = (float32ExponentBias - float10ExponentBias) - const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
(float32Val >> float32ExponentFirstBit); (float32Val >> float32ExponentFirstBit);
...@@ -280,7 +296,7 @@ class R11G11B10F ...@@ -280,7 +296,7 @@ class R11G11B10F
} }
else else
{ {
// Rebias the exponent to represent the value as a normalized float11 // Rebias the exponent to represent the value as a normalized float10
float32Val += 0xC8000000; float32Val += 0xC8000000;
} }
...@@ -288,25 +304,10 @@ class R11G11B10F ...@@ -288,25 +304,10 @@ class R11G11B10F
} }
} }
public: private:
R11G11B10F(float rgb[3]) unsigned int R : 11;
{ unsigned int G : 11;
R = float32ToFloat11(rgb[0]); unsigned int B : 10;
G = float32ToFloat11(rgb[1]);
B = float32ToFloat10(rgb[2]);
}
operator unsigned int() const
{
return *reinterpret_cast<const unsigned int *>(this);
}
void toRGB16F(half rgb[3]) const
{
rgb[0] = float11ToFloat16(R);
rgb[1] = float11ToFloat16(G);
rgb[2] = float10ToFloat16(B);
}
}; };
} // namespace sw } // namespace sw
......
...@@ -21,6 +21,18 @@ ...@@ -21,6 +21,18 @@
using namespace sw; using namespace sw;
// Checks the underflow boundary of the float32 -> 11-bit and float32 -> 10-bit
// minifloat conversions. For each format, two adjacent float32 bit patterns
// are converted: the lower one is expected to yield 0 and the next one up is
// expected to yield 0x0001.
TEST(MathTest, UnsignedFloat11_10)
{
// Test the largest value which causes underflow to 0, and the smallest value
// which produces a denormalized result.
// 11-bit conversion: 0x3500007F -> 0x0000, 0x35000080 -> 0x0001.
EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x3500007F)), 0x0000);
EXPECT_EQ(R11G11B10F::float32ToFloat11(bit_cast<float>(0x35000080)), 0x0001);
// 10-bit conversion: 0x3580003F -> 0x0000, 0x35800040 -> 0x0001.
EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x3580003F)), 0x0000);
EXPECT_EQ(R11G11B10F::float32ToFloat10(bit_cast<float>(0x35800040)), 0x0001);
}
// Clamps to the [0, hi] range. NaN input produces 0, hi must be non-NaN. // Clamps to the [0, hi] range. NaN input produces 0, hi must be non-NaN.
float clamp0hi(float x, float hi) float clamp0hi(float x, float hi)
{ {
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment