New integer related core functions

- Removed float <-> int bit conversion functions, as these will not be needed if everything is stored as float. - Added ineg for the minus (-) sign in from of a value. - Added f2i/i2f/f2u/u2f for float <-> int conversions - Added b2i/i2b/b2u/u2b for bool <-> int conversions - Added iadd, isub, imul, imad, [iu]div, [iu]mod, [iu]min, [iu]max for these basic operations as integer operations. - Added left and right shifts - Added ucmp to compare unsigned values - Modified or/xor/and to support vectors instead of only scalars. - Added vector equality comparison functions Change-Id: I0f138e3707242ec0fffc1c12b95064ddc98f0087 Reviewed-on: https://swiftshader-review.googlesource.com/3888Tested-by: Alexis Hétu <sugoi@google.com> Reviewed-by: Nicolas Capens <capn@google.com>

New integer related core functions
c4f2c297 · Alexis Hetu · Alexis Hétu · c4b57f53 · c4f2c297 · c4f2c297
Commit c4f2c297 authored Aug 18, 2015 by Alexis Hetu Committed by Alexis Hétu Aug 28, 2015
Showing with 350 additions and 82 deletions

PixelProgram.cpp src/Shader/PixelProgram.cpp +3 -3

ShaderCore.cpp src/Shader/ShaderCore.cpp +312 -65

ShaderCore.hpp src/Shader/ShaderCore.hpp +32 -11

VertexProgram.cpp src/Shader/VertexProgram.cpp +3 -3

No files found.
--- a/src/Shader/PixelProgram.cpp
+++ b/src/Shader/PixelProgram.cpp
@@ -263,9 +263,9 @@ namespace sw
 			case Shader::OPCODE_ALL:        all(d.x, s0);                                  break;
 			case Shader::OPCODE_ANY:        any(d.x, s0);                                  break;
 			case Shader::OPCODE_NOT:        not(d, s0);                                    break;
-			case Shader::OPCODE_OR:         or(d.x, s0.x, s1.x);                           break;
-			case Shader::OPCODE_XOR:        xor(d.x, s0.x, s1.x);                          break;
-			case Shader::OPCODE_AND:        and(d.x, s0.x, s1.x);                          break;
+			case Shader::OPCODE_OR:         or(d, s0, s1);                                 break;
+			case Shader::OPCODE_XOR:        xor(d, s0, s1);                                break;
+			case Shader::OPCODE_AND:        and(d, s0, s1);                                break;
 			case Shader::OPCODE_END:                                                       break;
 			default:
 				ASSERT(false);

--- a/src/Shader/ShaderCore.cpp
+++ b/src/Shader/ShaderCore.cpp
@@ -484,26 +484,6 @@ namespace sw
 		return logarithm((Float4(1.0f) + x) / (Float4(1.0f) - x), pp) * Float4(0.5f);
 	}

-	Int4 floatBitsToInt(RValue<Float4> x)
-	{
-		return As<Int4>(x);
-	}
-	
-	UInt4 floatBitsToUInt(RValue<Float4> x)
-	{
-		return As<UInt4>(x);
-	}
-
-	Float4 intBitsToFloat(RValue<Int4> x)
-	{
-		return As<Float4>(x);
-	}
-
-	Float4 uintBitsToFloat(RValue<UInt4> x)
-	{
-		return As<Float4>(x);
-	}
-
 	Float4 dot2(const Vector4f &v0, const Vector4f &v1)
 	{
 		return v0.x * v1.x + v0.y * v1.y;
@@ -613,6 +593,22 @@ namespace sw
 		}
 	}

+	void ShaderCore::neg(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = -src.x;
+		dst.y = -src.y;
+		dst.z = -src.z;
+		dst.w = -src.w;
+	}
+
+	void ShaderCore::ineg(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(-As<Int4>(src.x));
+		dst.y = As<Float4>(-As<Int4>(src.y));
+		dst.z = As<Float4>(-As<Int4>(src.z));
+		dst.w = As<Float4>(-As<Int4>(src.w));
+	}
+
 	void ShaderCore::f2b(Vector4f &dst, const Vector4f &src)
 	{
 		dst.x = As<Float4>(CmpNEQ(src.x, Float4(0.0f)));
@@ -629,6 +625,70 @@ namespace sw
 		dst.w = As<Float4>(As<Int4>(src.w) & As<Int4>(Float4(1.0f)));
 	}

+	void ShaderCore::f2i(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(Int4(src.x));
+		dst.y = As<Float4>(Int4(src.y));
+		dst.z = As<Float4>(Int4(src.z));
+		dst.w = As<Float4>(Int4(src.w));
+	}
+
+	void ShaderCore::i2f(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = Float4(As<Int4>(src.x));
+		dst.y = Float4(As<Int4>(src.y));
+		dst.z = Float4(As<Int4>(src.z));
+		dst.w = Float4(As<Int4>(src.w));
+	}
+
+	void ShaderCore::f2u(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(UInt4(src.x));
+		dst.y = As<Float4>(UInt4(src.y));
+		dst.z = As<Float4>(UInt4(src.z));
+		dst.w = As<Float4>(UInt4(src.w));
+	}
+
+	void ShaderCore::u2f(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = Float4(As<UInt4>(src.x));
+		dst.y = Float4(As<UInt4>(src.y));
+		dst.z = Float4(As<UInt4>(src.z));
+		dst.w = Float4(As<UInt4>(src.w));
+	}
+
+	void ShaderCore::i2b(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(CmpNEQ(As<Int4>(src.x), Int4(0)));
+		dst.y = As<Float4>(CmpNEQ(As<Int4>(src.y), Int4(0)));
+		dst.z = As<Float4>(CmpNEQ(As<Int4>(src.z), Int4(0)));
+		dst.w = As<Float4>(CmpNEQ(As<Int4>(src.w), Int4(0)));
+	}
+
+	void ShaderCore::b2i(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(As<Int4>(src.x) & Int4(1));
+		dst.y = As<Float4>(As<Int4>(src.y) & Int4(1));
+		dst.z = As<Float4>(As<Int4>(src.z) & Int4(1));
+		dst.w = As<Float4>(As<Int4>(src.w) & Int4(1));
+	}
+
+	void ShaderCore::u2b(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src.x), UInt4(0)));
+		dst.y = As<Float4>(CmpNEQ(As<UInt4>(src.y), UInt4(0)));
+		dst.z = As<Float4>(CmpNEQ(As<UInt4>(src.z), UInt4(0)));
+		dst.w = As<Float4>(CmpNEQ(As<UInt4>(src.w), UInt4(0)));
+	}
+
+	void ShaderCore::b2u(Vector4f &dst, const Vector4f &src)
+	{
+		dst.x = As<Float4>(As<UInt4>(src.x) & UInt4(1));
+		dst.y = As<Float4>(As<UInt4>(src.y) & UInt4(1));
+		dst.z = As<Float4>(As<UInt4>(src.z) & UInt4(1));
+		dst.w = As<Float4>(As<UInt4>(src.w) & UInt4(1));
+	}
+
 	void ShaderCore::add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = src0.x + src1.x;
@@ -637,6 +697,14 @@ namespace sw
 		dst.w = src0.w + src1.w;
 	}

+	void ShaderCore::iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) + As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) + As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) + As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) + As<Int4>(src1.w));
+	}
+
 	void ShaderCore::sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = src0.x - src1.x;
@@ -645,6 +713,14 @@ namespace sw
 		dst.w = src0.w - src1.w;
 	}

+	void ShaderCore::isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) - As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) - As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) - As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) - As<Int4>(src1.w));
+	}
+
 	void ShaderCore::mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
 	{
 		dst.x = src0.x * src1.x + src2.x;
@@ -653,6 +729,14 @@ namespace sw
 		dst.w = src0.w * src1.w + src2.w;
 	}

+	void ShaderCore::imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x) + As<Int4>(src2.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y) + As<Int4>(src2.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z) + As<Int4>(src2.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w) + As<Int4>(src2.w));
+	}
+
 	void ShaderCore::mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = src0.x * src1.x;
@@ -661,6 +745,14 @@ namespace sw
 		dst.w = src0.w * src1.w;
 	}

+	void ShaderCore::imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) * As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) * As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) * As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) * As<Int4>(src1.w));
+	}
+
 	void ShaderCore::rcpx(Vector4f &dst, const Vector4f &src, bool pp)
 	{
 		Float4 rcp = reciprocal(src.x, pp, true);
@@ -679,6 +771,32 @@ namespace sw
 		dst.w = src0.w / src1.w;
 	}

+	void ShaderCore::idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		Float4 intMax(As<Float4>(Int4(INT_MAX)));
+		cmp0i(dst.x, src1.x, intMax, src1.x);
+		dst.x = As<Float4>(As<Int4>(src0.x) / As<Int4>(dst.x));
+		cmp0i(dst.y, src1.y, intMax, src1.y);
+		dst.y = As<Float4>(As<Int4>(src0.y) / As<Int4>(dst.y));
+		cmp0i(dst.z, src1.z, intMax, src1.z);
+		dst.z = As<Float4>(As<Int4>(src0.z) / As<Int4>(dst.z));
+		cmp0i(dst.w, src1.w, intMax, src1.w);
+		dst.w = As<Float4>(As<Int4>(src0.w) / As<Int4>(dst.w));
+	}
+
+	void ShaderCore::udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		Float4 uintMax(As<Float4>(UInt4(UINT_MAX)));
+		cmp0i(dst.x, src1.x, uintMax, src1.x);
+		dst.x = As<Float4>(As<UInt4>(src0.x) / As<UInt4>(dst.x));
+		cmp0i(dst.y, src1.y, uintMax, src1.y);
+		dst.y = As<Float4>(As<UInt4>(src0.y) / As<UInt4>(dst.y));
+		cmp0i(dst.z, src1.z, uintMax, src1.z);
+		dst.z = As<Float4>(As<UInt4>(src0.z) / As<UInt4>(dst.z));
+		cmp0i(dst.w, src1.w, uintMax, src1.w);
+		dst.w = As<Float4>(As<UInt4>(src0.w) / As<UInt4>(dst.w));
+	}
+
 	void ShaderCore::mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = modulo(src0.x, src1.x);
@@ -687,6 +805,53 @@ namespace sw
 		dst.w = modulo(src0.w, src1.w);
 	}

+	void ShaderCore::imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		cmp0i(dst.x, src1.x, src0.x, src1.x);
+		dst.x = As<Float4>(As<Int4>(src0.x) % As<Int4>(dst.x));
+		cmp0i(dst.y, src1.y, src0.y, src1.y);
+		dst.y = As<Float4>(As<Int4>(src0.y) % As<Int4>(dst.y));
+		cmp0i(dst.z, src1.z, src0.z, src1.z);
+		dst.z = As<Float4>(As<Int4>(src0.z) % As<Int4>(dst.z));
+		cmp0i(dst.w, src1.w, src0.w, src1.w);
+		dst.w = As<Float4>(As<Int4>(src0.w) % As<Int4>(dst.w));
+	}
+	void ShaderCore::umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		cmp0i(dst.x, src1.x, src0.x, src1.x);
+		dst.x = As<Float4>(As<UInt4>(src0.x) % As<UInt4>(dst.x));
+		cmp0i(dst.y, src1.y, src0.y, src1.y);
+		dst.y = As<Float4>(As<UInt4>(src0.y) % As<UInt4>(dst.y));
+		cmp0i(dst.z, src1.z, src0.z, src1.z);
+		dst.z = As<Float4>(As<UInt4>(src0.z) % As<UInt4>(dst.z));
+		cmp0i(dst.w, src1.w, src0.w, src1.w);
+		dst.w = As<Float4>(As<UInt4>(src0.w) % As<UInt4>(dst.w));
+	}
+
+	void ShaderCore::shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) << As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) << As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) << As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) << As<Int4>(src1.w));
+	}
+
+	void ShaderCore::ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) >> As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) >> As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) >> As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) >> As<Int4>(src1.w));
+	}
+
+	void ShaderCore::ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<UInt4>(src0.x) >> As<UInt4>(src1.x));
+		dst.y = As<Float4>(As<UInt4>(src0.y) >> As<UInt4>(src1.y));
+		dst.z = As<Float4>(As<UInt4>(src0.z) >> As<UInt4>(src1.z));
+		dst.w = As<Float4>(As<UInt4>(src0.w) >> As<UInt4>(src1.w));
+	}
+
 	void ShaderCore::rsqx(Vector4f &dst, const Vector4f &src, bool pp)
 	{
 		Float4 rsq = reciprocalSquareRoot(src.x, true, pp);
@@ -818,6 +983,22 @@ namespace sw
 		dst.w = Min(src0.w, src1.w);
 	}

+	void ShaderCore::imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(Min(As<Int4>(src0.x), As<Int4>(src1.x)));
+		dst.y = As<Float4>(Min(As<Int4>(src0.y), As<Int4>(src1.y)));
+		dst.z = As<Float4>(Min(As<Int4>(src0.z), As<Int4>(src1.z)));
+		dst.w = As<Float4>(Min(As<Int4>(src0.w), As<Int4>(src1.w)));
+	}
+
+	void ShaderCore::umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(Min(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+		dst.y = As<Float4>(Min(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+		dst.z = As<Float4>(Min(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+		dst.w = As<Float4>(Min(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+	}
+
 	void ShaderCore::max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = Max(src0.x, src1.x);
@@ -826,6 +1007,22 @@ namespace sw
 		dst.w = Max(src0.w, src1.w);
 	}

+	void ShaderCore::imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
+		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
+		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
+		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
+	}
+
+	void ShaderCore::umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(Max(As<Int4>(src0.x), As<Int4>(src1.x)));
+		dst.y = As<Float4>(Max(As<Int4>(src0.y), As<Int4>(src1.y)));
+		dst.z = As<Float4>(Max(As<Int4>(src0.z), As<Int4>(src1.z)));
+		dst.w = As<Float4>(Max(As<Int4>(src0.w), As<Int4>(src1.w)));
+	}
+
 	void ShaderCore::slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
 		dst.x = As<Float4>(As<Int4>(CmpLT(src0.x, src1.x)) & As<Int4>(Float4(1.0f)));
@@ -937,38 +1134,6 @@ namespace sw
 		Float4 tw = Min(Max((x.w - edge0.w) / (edge1.w - edge0.w), Float4(0.0f)), Float4(1.0f)); dst.w = tw * tw * (Float4(3.0f) - Float4(2.0f) * tw);
 	}

-	void ShaderCore::floatBitsToInt(Vector4i &dst, const Vector4f &src)
-	{
-		dst.x = sw::floatBitsToInt(src.x);
-		dst.y = sw::floatBitsToInt(src.y);
-		dst.z = sw::floatBitsToInt(src.z);
-		dst.w = sw::floatBitsToInt(src.w);
-	}
-
-	void ShaderCore::floatBitsToUInt(Vector4u &dst, const Vector4f &src)
-	{
-		dst.x = sw::floatBitsToUInt(src.x);
-		dst.y = sw::floatBitsToUInt(src.y);
-		dst.z = sw::floatBitsToUInt(src.z);
-		dst.w = sw::floatBitsToUInt(src.w);
-	}
-
-	void ShaderCore::intBitsToFloat(Vector4f &dst, const Vector4i &src)
-	{
-		dst.x = sw::intBitsToFloat(src.x);
-		dst.y = sw::intBitsToFloat(src.y);
-		dst.z = sw::intBitsToFloat(src.z);
-		dst.w = sw::intBitsToFloat(src.w);
-	}
-
-	void ShaderCore::uintBitsToFloat(Vector4f &dst, const Vector4u &src)
-	{
-		dst.x = sw::uintBitsToFloat(src.x);
-		dst.y = sw::uintBitsToFloat(src.y);
-		dst.z = sw::uintBitsToFloat(src.z);
-		dst.w = sw::uintBitsToFloat(src.w);
-	}
-
 	void ShaderCore::frc(Vector4f &dst, const Vector4f &src)
 	{
 		dst.x = Frac(src.x);
@@ -1393,17 +1558,17 @@ namespace sw

 	void ShaderCore::extract(Float4 &dst, const Vector4f &src0, const Float4 &src1)
 	{
-		select(dst, CmpEQ(src1, Float4(1.0f)), src0.y, src0.x);
-		select(dst, CmpEQ(src1, Float4(2.0f)), src0.z, dst);
-		select(dst, CmpEQ(src1, Float4(3.0f)), src0.w, dst);
+		select(dst, CmpEQ(As<Int4>(src1), Int4(1)), src0.y, src0.x);
+		select(dst, CmpEQ(As<Int4>(src1), Int4(2)), src0.z, dst);
+		select(dst, CmpEQ(As<Int4>(src1), Int4(3)), src0.w, dst);
 	}

 	void ShaderCore::insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index)
 	{
-		select(dst.x, CmpEQ(index, Float4(0.0f)), element, src.x);
-		select(dst.y, CmpEQ(index, Float4(1.0f)), element, src.y);
-		select(dst.z, CmpEQ(index, Float4(2.0f)), element, src.z);
-		select(dst.w, CmpEQ(index, Float4(3.0f)), element, src.w);
+		select(dst.x, CmpEQ(As<Int4>(index), Int4(0)), element, src.x);
+		select(dst.y, CmpEQ(As<Int4>(index), Int4(1)), element, src.y);
+		select(dst.z, CmpEQ(As<Int4>(index), Int4(2)), element, src.z);
+		select(dst.w, CmpEQ(As<Int4>(index), Int4(3)), element, src.w);
 	}

 	void ShaderCore::sgn(Float4 &dst, const Float4 &src)
@@ -1419,6 +1584,12 @@ namespace sw
 		select(dst, pos, src1, src2);
 	}

+	void ShaderCore::cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2)
+	{
+		Int4 pos = CmpEQ(Int4(0), As<Int4>(src0));
+		select(dst, pos, src1, src2);
+	}
+
 	void ShaderCore::select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2)
 	{
 		// FIXME: LLVM vector select
@@ -1515,6 +1686,51 @@ namespace sw
 		}
 	}

+	void ShaderCore::ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control)
+	{
+		switch(control)
+		{
+		case Shader::CONTROL_GT:
+			dst.x = As<Float4>(CmpNLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_EQ:
+			dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_GE:
+			dst.x = As<Float4>(CmpNLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_LT:
+			dst.x = As<Float4>(CmpLT(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpLT(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpLT(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpLT(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_NE:
+			dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		case Shader::CONTROL_LE:
+			dst.x = As<Float4>(CmpLE(As<UInt4>(src0.x), As<UInt4>(src1.x)));
+			dst.y = As<Float4>(CmpLE(As<UInt4>(src0.y), As<UInt4>(src1.y)));
+			dst.z = As<Float4>(CmpLE(As<UInt4>(src0.z), As<UInt4>(src1.z)));
+			dst.w = As<Float4>(CmpLE(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+			break;
+		default:
+			ASSERT(false);
+		}
+	}
+
 	void ShaderCore::all(Float4 &dst, const Vector4f &src)
 	{
 		dst = As<Float4>(As<Int4>(src.x) & As<Int4>(src.y) & As<Int4>(src.z) & As<Int4>(src.w));
@@ -1533,18 +1749,49 @@ namespace sw
 		dst.w = As<Float4>(As<Int4>(src.w) ^ Int4(0xFFFFFFFF));
 	}

-	void ShaderCore::or(Float4 &dst, const Float4 &src0, const Float4 &src1)
+	void ShaderCore::or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) | As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) | As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) | As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) | As<Int4>(src1.w));
+	}
+
+	void ShaderCore::xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
+	{
+		dst.x = As<Float4>(As<Int4>(src0.x) ^ As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) ^ As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) ^ As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) ^ As<Int4>(src1.w));
+	}
+
+	void ShaderCore::and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
-		dst = As<Float4>(As<Int4>(src0) | As<Int4>(src1));
+		dst.x = As<Float4>(As<Int4>(src0.x) & As<Int4>(src1.x));
+		dst.y = As<Float4>(As<Int4>(src0.y) & As<Int4>(src1.y));
+		dst.z = As<Float4>(As<Int4>(src0.z) & As<Int4>(src1.z));
+		dst.w = As<Float4>(As<Int4>(src0.w) & As<Int4>(src1.w));
 	}

-	void ShaderCore::xor(Float4 &dst, const Float4 &src0, const Float4 &src1)
+	void ShaderCore::equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
-		dst = As<Float4>(As<Int4>(src0) ^ As<Int4>(src1));
+		dst.x = As<Float4>(CmpEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) &
+		                   CmpEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) &
+		                   CmpEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) &
+		                   CmpEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+		dst.y = dst.x;
+		dst.z = dst.x;
+		dst.w = dst.x;
 	}

-	void ShaderCore::and(Float4 &dst, const Float4 &src0, const Float4 &src1)
+	void ShaderCore::notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1)
 	{
-		dst = As<Float4>(As<Int4>(src0) & As<Int4>(src1));
+		dst.x = As<Float4>(CmpNEQ(As<UInt4>(src0.x), As<UInt4>(src1.x)) |
+		                   CmpNEQ(As<UInt4>(src0.y), As<UInt4>(src1.y)) |
+		                   CmpNEQ(As<UInt4>(src0.z), As<UInt4>(src1.z)) |
+		                   CmpNEQ(As<UInt4>(src0.w), As<UInt4>(src1.w)));
+		dst.y = dst.x;
+		dst.z = dst.x;
+		dst.w = dst.x;
 	}
 }
--- a/src/Shader/ShaderCore.hpp
+++ b/src/Shader/ShaderCore.hpp
@@ -105,10 +105,6 @@ namespace sw
 	Float4 arccosh(RValue<Float4> x, bool pp = false);  // Limited to x >= 1
 	Float4 arcsinh(RValue<Float4> x, bool pp = false);
 	Float4 arctanh(RValue<Float4> x, bool pp = false);  // Limited to ]-1, 1[ range
-	Int4 floatBitsToInt(RValue<Float4> x);
-	UInt4 floatBitsToUInt(RValue<Float4> x);
-	Float4 intBitsToFloat(RValue<Int4> x);
-	Float4 uintBitsToFloat(RValue<UInt4> x);

 	Float4 dot2(const Vector4f &v0, const Vector4f &v1);
 	Float4 dot3(const Vector4f &v0, const Vector4f &v1);
@@ -243,15 +239,36 @@ namespace sw

 	public:
 		void mov(Vector4f &dst, const Vector4f &src, bool floorToInteger = false);
+		void neg(Vector4f &dst, const Vector4f &src);
+		void ineg(Vector4f &dst, const Vector4f &src);
 		void f2b(Vector4f &dst, const Vector4f &src);
 		void b2f(Vector4f &dst, const Vector4f &src);
+		void f2i(Vector4f &dst, const Vector4f &src);
+		void i2f(Vector4f &dst, const Vector4f &src);
+		void f2u(Vector4f &dst, const Vector4f &src);
+		void u2f(Vector4f &dst, const Vector4f &src);
+		void i2b(Vector4f &dst, const Vector4f &src);
+		void b2i(Vector4f &dst, const Vector4f &src);
+		void u2b(Vector4f &dst, const Vector4f &src);
+		void b2u(Vector4f &dst, const Vector4f &src);
 		void add(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void iadd(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void sub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void isub(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void mad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
+		void imad(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
 		void mul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imul(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void rcpx(Vector4f &dst, const Vector4f &src, bool pp = false);
 		void div(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void idiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void udiv(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void mod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umod(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void shl(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void ishr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void ushr(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void rsqx(Vector4f &dst, const Vector4f &src, bool pp = false);
 		void sqrt(Vector4f &dst, const Vector4f &src, bool pp = false);
 		void rsq(Vector4f &dst, const Vector4f &src, bool pp = false);
@@ -268,7 +285,11 @@ namespace sw
 		void dp3(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void dp4(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void min(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umin(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void max(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void imax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void umax(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void slt(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void step(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void exp2x(Vector4f &dst, const Vector4f &src, bool pp = false);
@@ -281,10 +302,6 @@ namespace sw
 		void att(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
 		void lrp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
 		void smooth(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
-		void floatBitsToInt(Vector4i &dst, const Vector4f &src);
-		void floatBitsToUInt(Vector4u &dst, const Vector4f &src);
-		void intBitsToFloat(Vector4f &dst, const Vector4i &src);
-		void uintBitsToFloat(Vector4f &dst, const Vector4u &src);
 		void frc(Vector4f &dst, const Vector4f &src);
 		void trunc(Vector4f &dst, const Vector4f &src);
 		void floor(Vector4f &dst, const Vector4f &src);
@@ -330,19 +347,23 @@ namespace sw
 		void cmp0(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
 		void cmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
 		void icmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
+		void ucmp(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, Control control);
 		void select(Vector4f &dst, const Vector4f &src0, const Vector4f &src1, const Vector4f &src2);
 		void extract(Float4 &dst, const Vector4f &src0, const Float4 &src1);
 		void insert(Vector4f &dst, const Vector4f &src, const Float4 &element, const Float4 &index);
 		void all(Float4 &dst, const Vector4f &src);
 		void any(Float4 &dst, const Vector4f &src);
 		void not(Vector4f &dst, const Vector4f &src);
-		void or(Float4 &dst, const Float4 &src0, const Float4 &src1);
-		void xor(Float4 &dst, const Float4 &src0, const Float4 &src1);
-		void and(Float4 &dst, const Float4 &src0, const Float4 &src1);
+		void or(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void xor(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void and(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void equal(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);
+		void notEqual(Vector4f &dst, const Vector4f &src0, const Vector4f &src1);

 	private:
 		void sgn(Float4 &dst, const Float4 &src);
 		void cmp0(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
+		void cmp0i(Float4 &dst, const Float4 &src0, const Float4 &src1, const Float4 &src2);
 		void select(Float4 &dst, RValue<Int4> src0, const Float4 &src1, const Float4 &src2);
 	};
 }

--- a/src/Shader/VertexProgram.cpp
+++ b/src/Shader/VertexProgram.cpp
@@ -246,9 +246,9 @@ namespace sw
 			case Shader::OPCODE_ALL:		all(d.x, s0);					break;
 			case Shader::OPCODE_ANY:		any(d.x, s0);					break;
 			case Shader::OPCODE_NOT:		not(d, s0);						break;
-			case Shader::OPCODE_OR:			or(d.x, s0.x, s1.x);			break;
-			case Shader::OPCODE_XOR:		xor(d.x, s0.x, s1.x);			break;
-			case Shader::OPCODE_AND:		and(d.x, s0.x, s1.x);			break;
+			case Shader::OPCODE_OR:			or(d, s0, s1);					break;
+			case Shader::OPCODE_XOR:		xor(d, s0, s1);					break;
+			case Shader::OPCODE_AND:		and(d, s0, s1);					break;
 			case Shader::OPCODE_TEXLDL:		TEXLDL(r, d, s0, src1);			break;
 			case Shader::OPCODE_TEX:		TEX(r, d, s0, src1);			break;
 			case Shader::OPCODE_END:										break;