Half float packing and unpacking intrinsic functions

Implementation for packHalf2x16, unpackHalf2x16 intrinsic functions. Change-Id: I55212f8bc2ecd30e0108858d74117c3cf60733ed Reviewed-on: https://swiftshader-review.googlesource.com/5056 Tested-by: Alexis Hétu <sugoi@google.com> Reviewed-by: Nicolas Capens <capn@google.com>

Half float packing and unpacking intrinsic functions
ffb35eb4 · Alexis Hetu · Alexis Hétu · b9a781da · ffb35eb4 · ffb35eb4
Commit ffb35eb4 authored Apr 06, 2016 by Alexis Hetu Committed by Alexis Hétu Apr 07, 2016
Showing with 59 additions and 0 deletions

PixelProgram.cpp src/Shader/PixelProgram.cpp +2 -0

ShaderCore.cpp src/Shader/ShaderCore.cpp +51 -0

ShaderCore.hpp src/Shader/ShaderCore.hpp +4 -0

VertexProgram.cpp src/Shader/VertexProgram.cpp +2 -0

No files found.
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -230,8 +230,10 @@ namespace sw
 			case Shader::OPCODE_UINTBITSTOFLOAT: d = s0;                                   break;
 			case Shader::OPCODE_PACKSNORM2x16:   packSnorm2x16(d, s0);                     break;
 			case Shader::OPCODE_PACKUNORM2x16:   packUnorm2x16(d, s0);                     break;
+			case Shader::OPCODE_PACKHALF2x16:    packHalf2x16(d, s0);                      break;
 			case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0);                   break;
 			case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0);                   break;
+			case Shader::OPCODE_UNPACKHALF2x16:  unpackHalf2x16(d, s0);                    break;
 			case Shader::OPCODE_POWX:       powx(d, s0, s1, pp);                           break;
 			case Shader::OPCODE_POW:        pow(d, s0, s1, pp);                            break;
 			case Shader::OPCODE_SGN:        sgn(d, s0);                                    break;

--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -1123,6 +1123,57 @@ namespace sw
 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
 	}
+	void ShaderCore::floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits) // Per lane: converts the float32 bits in floatBits to half-float bits; written to the low 16 bits of dst, or OR'ed into the high 16 bits of dst when storeInUpperBits is true.
+	{
+		static const uint32_t mask_sign = 0x80000000u; // float32 sign bit
+		static const uint32_t mask_round = ~0xfffu; // 0xFFFFF000: clears the low 12 mantissa bits; subtracting it below is the same as adding 0x1000 (mod 2^32)
+		static const uint32_t c_f32infty = 255 << 23; // float32 exponent field all ones, i.e. the bit pattern of +infinity
+		static const uint32_t c_magic = 15 << 23; // bit pattern of the float 2^-112, used as a multiplier to rebias the exponent
+		static const uint32_t c_nanbit = 0x200; // mantissa bit OR'ed into the half result when the input magnitude is above infinity (NaN)
+		static const uint32_t c_infty_as_fp16 = 0x7c00; // half-float exponent field all ones, i.e. half infinity
+		static const uint32_t c_clamp = (31 << 23) - 0x1000; // upper bound applied (as a float, via Min) to the rescaled value before 0x1000 is added
+		UInt4 justsign = UInt4(mask_sign) & As<UInt4>(floatBits); // sign bit of each lane, still at bit 31
+		UInt4 absf = As<UInt4>(floatBits) ^ justsign; // input bits with the sign bit cleared
+		UInt4 b_isnormal = CmpNLE(UInt4(c_f32infty), absf); // lane mask, read here as "infinity bits > absf" (i.e. not Inf/NaN) - NOTE(review): confirm CmpNLE is per-lane greater-than
+		// Note: this version doesn't round to the nearest even in case of a tie as defined by IEEE 754-2008, it rounds to +inf
+		//       instead of nearest even, since that's fine for GLSL ES 3.0's needs (see section 2.1.1 Floating-Point Computation)
+		UInt4 joined = ((((As<UInt4>(Min(As<Float4>(absf & UInt4(mask_round)) * As<Float4>(UInt4(c_magic)), // finite path: drop low 12 bits, multiply by c_magic, ...
+		                                 As<Float4>(UInt4(c_clamp))))) - UInt4(mask_round)) >> 13) & b_isnormal) | // ... clamp to c_clamp, add 0x1000, shift right by 13, keep only lanes selected by b_isnormal
+		               ((b_isnormal ^ UInt4(0xFFFFFFFF)) & ((CmpNLE(absf, UInt4(c_f32infty)) & UInt4(c_nanbit)) | // remaining lanes: c_nanbit where absf is above the infinity bits, ...
+		               UInt4(c_infty_as_fp16))); // ... combined with the half infinity exponent
+		dst = As<Float4>(storeInUpperBits ? As<UInt4>(dst) | ((joined << 16) | justsign) : joined | (justsign >> 16)); // upper: OR (half << 16 | sign at bit 31) into the existing dst; lower: overwrite dst with half | sign moved to bit 15
+	}
+	void ShaderCore::halfToFloatBits(Float4& dst, const Float4& halfBits) // Per lane: expands the half-float bits held in halfBits (sign at bit 15, exponent/mantissa in bits 0-14) to float32 bits, written to dst.
+	{
+		static const uint32_t mask_nosign = 0x7FFF; // half exponent and mantissa bits
+		static const uint32_t magic = (254 - 15) << 23; // bit pattern of the float 2^112, used as a multiplier to rebias the exponent
+		static const uint32_t was_infnan = 0x7BFF; // largest half bit pattern whose exponent field is not all ones; anything above is Inf/NaN
+		static const uint32_t exp_infnan = 255 << 23; // float32 exponent field all ones
+		UInt4 expmant = As<UInt4>(halfBits) & UInt4(mask_nosign); // exponent and mantissa without the sign
+		dst = As<Float4>(As<UInt4>(As<Float4>(expmant << 13) * As<Float4>(UInt4(magic))) | // move exponent/mantissa to the float32 position, then rescale by magic
+		                 ((As<UInt4>(halfBits) ^ UInt4(expmant)) << 16) | // the XOR leaves only bits 15 and up; after << 16 only bit 15 survives, landing on bit 31 (sign)
+		                 (CmpNLE(As<UInt4>(expmant), UInt4(was_infnan)) & UInt4(exp_infnan))); // where expmant is above was_infnan, force the float32 exponent to all ones
+	}
+	void ShaderCore::packHalf2x16(Vector4f &d, const Vector4f &s0) // Packs s0.x and s0.y as two half floats into d.x; only d.x is written here.
+	{
+		// half2 | half1
+		floatToHalfBits(d.x, s0.x, false); // must run first: overwrites d.x with the half of s0.x in the low 16 bits
+		floatToHalfBits(d.x, s0.y, true); // ORs the half of s0.y into the high 16 bits of the d.x written above
+	}
+	void ShaderCore::unpackHalf2x16(Vector4f &dst, const Vector4f &s0) // Unpacks the two half floats stored in s0.x into dst.x and dst.y; only those two components are written here.
+	{
+		// half2 | half1
+		halfToFloatBits(dst.x, As<Float4>(As<UInt4>(s0.x) & UInt4(0x0000FFFF))); // low 16 bits of s0.x -> dst.x
+		halfToFloatBits(dst.y, As<Float4>((As<UInt4>(s0.x) & UInt4(0xFFFF0000)) >> 16)); // high 16 bits of s0.x, shifted down to the low 16 bits -> dst.y
+	}
 	void ShaderCore::packSnorm2x16(Vector4f &d, const Vector4f &s0)
 	{
 		// round(clamp(c, -1.0, 1.0) * 32767.0)

--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -313,6 +313,8 @@ namespace sw
 		void att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
 		void smooth(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void packHalf2x16(Vector4f &dst, const Vector4f &src); // Packs src.x (low 16 bits) and src.y (high 16 bits) as half floats into dst.x
+		void unpackHalf2x16(Vector4f &dst, const Vector4f &src); // Unpacks the half floats in the low/high 16 bits of src.x into dst.x/dst.y
 		void packSnorm2x16(Vector4f &dst, const Vector4f &src);
 		void packUnorm2x16(Vector4f &dst, const Vector4f &src);
 		void unpackSnorm2x16(Vector4f &dst, const Vector4f &src);
@@ -383,6 +385,8 @@ namespace sw
 		void cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
 		void cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
 		void select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2);
+		void floatToHalfBits(Float4& dst, const Float4& floatBits, bool storeInUpperBits); // Float32 bits -> half bits per lane; overwrites dst (low 16 bits) or ORs into its high 16 bits when storeInUpperBits is true
+		void halfToFloatBits(Float4& dst, const Float4& halfBits); // Half bits in the low 16 bits of each lane -> float32 bits per lane, written to dst
 	};
 }

--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -205,8 +205,10 @@ namespace sw
 			case Shader::OPCODE_UINTBITSTOFLOAT: d = s0;                    break;
 			case Shader::OPCODE_PACKSNORM2x16:   packSnorm2x16(d, s0);      break;
 			case Shader::OPCODE_PACKUNORM2x16:   packUnorm2x16(d, s0);      break;
+			case Shader::OPCODE_PACKHALF2x16:    packHalf2x16(d, s0);       break;
 			case Shader::OPCODE_UNPACKSNORM2x16: unpackSnorm2x16(d, s0);    break;
 			case Shader::OPCODE_UNPACKUNORM2x16: unpackUnorm2x16(d, s0);    break;
+			case Shader::OPCODE_UNPACKHALF2x16:  unpackHalf2x16(d, s0);     break;
 			case Shader::OPCODE_M3X2:       M3X2(d, s0, src1);              break;
 			case Shader::OPCODE_M3X3:       M3X3(d, s0, src1);              break;
 			case Shader::OPCODE_M3X4:       M3X4(d, s0, src1);              break;