Commit b9f03f47 authored by David 'Digit' Turner, committed by David Turner

[vulkan] Add VK_SUBGROUP_FEATURE_ARITHMETIC_BIT support

As the name suggests, this CL adds many operations related to arithmetic
subgroup operations, i.e. Reduction / Inclusive Scan / Exclusive Scan for:

  - IntAdd, UIntAdd, FloatAdd
  - IntMul, UIntMul, FloatMul
  - IntMin, UIntMin, FloatMin
  - IntMax, UIntMax, FloatMax
  - BitwiseAnd, BitwiseOr, BitwiseXor
  - LogicalAnd, LogicalOr, LogicalXor

The implementation uses a single template function to implement all of
these, based on the fact that they are all binary commutative operations.

NOTE: Only 32-bit values are supported.

To make scans efficient, a new Reactor operation named Blend() is
introduced. It mixes two input vectors using four 3-bit indices (encoded
in a single 16-bit value) to select the result's lane values. A new
unit-test is added to ReactorUnitTests to check its behaviour.
Unfortunately, the test takes about 2 minutes on a fast workstation when
doing a full scan, so by default it only checks 1/11th of all possible
values (see comments in the patch for more details).

Also, Float4::positive_inf() and Float4::negative_inf() methods were
added, since trying to build Float4(INFINITY) will trigger a DCHECK() in
the Float4(float) constructor, and the infinity values are required by
the subgroup floating-point scan operations.

Bug: b/142002682
Test: dEQP-VK.subgroups.arithmetic.*
Change-Id: I86f509fc47f7475ca126615ed698ee493ae835ef
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/38929
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: David Turner <digit@google.com>
parent f6a128b6
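As a quick illustration of the Blend() encoding described above (a sketch only; the select value and results mirror the createBlend4() examples and the new unit-test in this patch):

    // Each hex digit of |select| picks one result lane, highest digit first:
    // indices 0-3 read from the first vector, 4-7 from the second.
    Float4 r = Blend(Float4(1.0f, 2.0f, 3.0f, 4.0f),
                     Float4(5.0f, 6.0f, 7.0f, 8.0f),
                     0x4012);  // r == [5.0f, 1.0f, 2.0f, 3.0f]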
......@@ -363,6 +363,7 @@ namespace sw
case spv::CapabilityDerivativeControl: capabilities.DerivativeControl = true; break;
case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
case spv::CapabilityGroupNonUniformArithmetic: capabilities.GroupNonUniformArithmetic = true; break;
case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
......@@ -627,6 +628,22 @@ namespace sw
case spv::OpGroupNonUniformShuffleXor:
case spv::OpGroupNonUniformShuffleUp:
case spv::OpGroupNonUniformShuffleDown:
case spv::OpGroupNonUniformIAdd:
case spv::OpGroupNonUniformFAdd:
case spv::OpGroupNonUniformIMul:
case spv::OpGroupNonUniformFMul:
case spv::OpGroupNonUniformSMin:
case spv::OpGroupNonUniformUMin:
case spv::OpGroupNonUniformFMin:
case spv::OpGroupNonUniformSMax:
case spv::OpGroupNonUniformUMax:
case spv::OpGroupNonUniformFMax:
case spv::OpGroupNonUniformBitwiseAnd:
case spv::OpGroupNonUniformBitwiseOr:
case spv::OpGroupNonUniformBitwiseXor:
case spv::OpGroupNonUniformLogicalAnd:
case spv::OpGroupNonUniformLogicalOr:
case spv::OpGroupNonUniformLogicalXor:
case spv::OpCopyObject:
case spv::OpArrayLength:
// Instructions that yield an intermediate value or divergent pointer
......@@ -1865,6 +1882,22 @@ namespace sw
case spv::OpGroupNonUniformShuffleXor:
case spv::OpGroupNonUniformShuffleUp:
case spv::OpGroupNonUniformShuffleDown:
case spv::OpGroupNonUniformIAdd:
case spv::OpGroupNonUniformFAdd:
case spv::OpGroupNonUniformIMul:
case spv::OpGroupNonUniformFMul:
case spv::OpGroupNonUniformSMin:
case spv::OpGroupNonUniformUMin:
case spv::OpGroupNonUniformFMin:
case spv::OpGroupNonUniformSMax:
case spv::OpGroupNonUniformUMax:
case spv::OpGroupNonUniformFMax:
case spv::OpGroupNonUniformBitwiseAnd:
case spv::OpGroupNonUniformBitwiseOr:
case spv::OpGroupNonUniformBitwiseXor:
case spv::OpGroupNonUniformLogicalAnd:
case spv::OpGroupNonUniformLogicalOr:
case spv::OpGroupNonUniformLogicalXor:
return EmitGroupNonUniform(insn, state);
case spv::OpArrayLength:
......
......@@ -499,6 +499,7 @@ namespace sw
bool GroupNonUniformBallot : 1;
bool GroupNonUniformShuffle : 1;
bool GroupNonUniformShuffleRelative : 1;
bool GroupNonUniformArithmetic : 1;
bool DeviceGroup : 1;
bool MultiView : 1;
};
......@@ -1088,6 +1089,8 @@ namespace sw
// Returns 0 when invalid.
static VkShaderStageFlagBits executionModelToStage(spv::ExecutionModel model);
struct GroupOps;
};
class SpirvRoutine
......
......@@ -18,6 +18,64 @@
namespace sw {
struct SpirvShader::GroupOps {
// Template function to perform a binary operation.
// |TYPE| should be the type of the identity value (as a SIMD::<Type>).
// |APPLY| should be a callable object that takes two RValue<TYPE> parameters
// and returns a new RValue<TYPE> corresponding to the operation's result.
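//
// For example (a sketch mirroring the OpGroupNonUniformIAdd case in
// EmitGroupNonUniform below):
//
//   GroupOps::BinaryOperation(
//       this, insn, state, dst, SIMD::Int(0) /* identity */,
//       [](RValue<SIMD::Int> a, RValue<SIMD::Int> b) { return a + b; });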
template <typename TYPE, typename APPLY>
static void BinaryOperation(
const SpirvShader* shader,
const SpirvShader::InsnIterator& insn,
const SpirvShader::EmitState* state,
Intermediate& dst,
const TYPE& identity,
APPLY&& apply)
{
SpirvShader::GenericValue value(shader, state, insn.word(5));
auto &type = shader->getType(SpirvShader::Type::ID(insn.word(1)));
for (auto i = 0u; i < type.sizeInComponents; i++)
{
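// Mask off inactive lanes by replacing their values with the identity, so
// they cannot influence the reduction or scan results of the active lanes.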
auto mask = As<SIMD::UInt>(state->activeLaneMask());
SIMD::UInt v_uint = (value.UInt(i) & mask) | (As<SIMD::UInt>(identity) & ~mask);
TYPE v = As<TYPE>(v_uint);
switch (spv::GroupOperation(insn.word(4)))
{
case spv::GroupOperationReduce:
{
// NOTE: floating-point add and multiply are not associative, so reduce in a
// single fixed order to ensure that all values in the final lanes are identical.
TYPE v2 = apply(v.xxzz, v.yyww); // [xy] [xy] [zw] [zw]
TYPE v3 = apply(v2.xxxx, v2.zzzz); // [xyzw] [xyzw] [xyzw] [xyzw]
dst.move(i, v3);
break;
}
case spv::GroupOperationInclusiveScan:
{
TYPE v2 = apply(v, Blend(v, identity, 0x4012) /* [id, v.y, v.z, v.w] */); // [x] [xy] [yz] [zw]
TYPE v3 = apply(v2, Blend(v2, identity, 0x4401) /* [id, id, v2.x, v2.y] */); // [x] [xy] [xyz] [xyzw]
dst.move(i, v3);
break;
}
case spv::GroupOperationExclusiveScan:
{
TYPE v2 = apply(v, Blend(v, identity, 0x4012) /* [id, v.y, v.z, v.w] */); // [x] [xy] [yz] [zw]
TYPE v3 = apply(v2, Blend(v2, identity, 0x4401) /* [id, id, v2.x, v2.y] */); // [x] [xy] [xyz] [xyzw]
auto v4 = Blend(v3, identity, 0x4012) /* [id, v3.x, v3.y, v3.z] */; // [id] [x] [xy] [xyz]
dst.move(i, v4);
break;
}
default:
UNIMPLEMENTED("EmitGroupNonUniform op: %s Group operation: %d",
SpirvShader::OpcodeName(type.opcode()).c_str(), insn.word(4));
}
}
}
};
SpirvShader::EmitResult SpirvShader::EmitGroupNonUniform(InsnIterator insn, EmitState *state) const
{
static_assert(SIMD::Width == 4, "EmitGroupNonUniform makes many assumptions that the SIMD vector width is 4");
......@@ -258,10 +316,195 @@ SpirvShader::EmitResult SpirvShader::EmitGroupNonUniform(InsnIterator insn, Emit
break;
}
case spv::OpGroupNonUniformIAdd:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a + b; }
);
break;
}
case spv::OpGroupNonUniformFAdd:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0.),
[](RValue<Type>a, RValue<Type>b){ return a + b; }
);
break;
}
case spv::OpGroupNonUniformIMul:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(1),
[](RValue<Type>a, RValue<Type>b){ return a * b; }
);
break;
}
case spv::OpGroupNonUniformFMul:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(1.),
[](RValue<Type>a, RValue<Type>b){ return a * b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseAnd:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){ return a & b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseOr:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a | b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseXor:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a ^ b; }
);
break;
}
case spv::OpGroupNonUniformSMin:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(INT32_MAX),
[](RValue<Type>a, RValue<Type>b){ return Min(a, b); }
);
break;
}
case spv::OpGroupNonUniformUMin:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){ return Min(a, b); }
);
break;
}
case spv::OpGroupNonUniformFMin:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type::positive_inf(),
[](RValue<Type>a, RValue<Type>b){ return NMin(a, b); }
);
break;
}
case spv::OpGroupNonUniformSMax:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(INT32_MIN),
[](RValue<Type>a, RValue<Type>b){ return Max(a, b); }
);
break;
}
case spv::OpGroupNonUniformUMax:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return Max(a, b); }
);
break;
}
case spv::OpGroupNonUniformFMax:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type::negative_inf(),
[](RValue<Type>a, RValue<Type>b){ return NMax(a, b); }
);
break;
}
case spv::OpGroupNonUniformLogicalAnd:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) & CmpNEQ(b, zero);
}
);
break;
}
case spv::OpGroupNonUniformLogicalOr:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) | CmpNEQ(b, zero);
}
);
break;
}
case spv::OpGroupNonUniformLogicalXor:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) ^ CmpNEQ(b, zero);
}
);
break;
}
default:
UNIMPLEMENTED("EmitGroupNonUniform op: %s", OpcodeName(type.opcode()).c_str());
}
return EmitResult::Continue;
}
} // namespace sw
\ No newline at end of file
} // namespace sw
......@@ -108,6 +108,35 @@ namespace rr
unmaterializedVariables.clear();
}
// NOTE: Only 12 bits out of 16 of the |select| value are used.
// More specifically, the value should look like:
//
// msb lsb
// v v
// [.aaa|.bbb|.ccc|.ddd] where '.' means an ignored bit
//
// This format makes it easy to write calls with hexadecimal select values,
// since each hex digit is a separate swizzle index. Note that the order
// of indices is reversed compared to createSwizzle4() below!
//
// For example:
// createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
// createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
// createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
//
static Value *createBlend4(Value *lhs, Value *rhs, unsigned short select)
{
int swizzle[4] =
{
(select >> 12) & 0x07,
(select >> 8) & 0x07,
(select >> 4) & 0x07,
(select >> 0) & 0x07,
};
return Nucleus::createShuffleVector(lhs, rhs, swizzle);
}
static Value *createSwizzle4(Value *val, unsigned char select)
{
int swizzle[4] =
......@@ -3481,6 +3510,11 @@ namespace rr
return RValue<Int4>(createSwizzle4(x.value, select));
}
RValue<Int4> Blend(RValue<Int4> x, RValue<Int4> y, unsigned short select)
{
return RValue<Int4>(createBlend4(x.value, y.value, select));
}
UInt4::UInt4() : XYZW(this)
{
}
......@@ -3716,6 +3750,11 @@ namespace rr
return RValue<UInt4>(createSwizzle4(x.value, select));
}
RValue<UInt4> Blend(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
{
return RValue<UInt4>(createBlend4(x.value, y.value, select));
}
Half::Half(RValue<Float> cast)
{
UInt fp32i = As<UInt>(cast);
......@@ -3805,7 +3844,7 @@ namespace rr
// being reinterpreted as float and then bitcast to integer again,
// which does not guarantee preserving the integer value.
//
// Should inifinty and NaN constants be required, methods like
// Should infinity and NaN constants be required, methods like
// infinity(), quiet_NaN(), and signaling_NaN() should be added
// to the Float class.
ASSERT(std::isfinite(x));
......@@ -4026,6 +4065,27 @@ namespace rr
constant(x, y, z, w);
}
Float4 Float4::positive_inf()
{
Float4 result;
result.infinity_constant(false);
return result;
}
Float4 Float4::negative_inf()
{
Float4 result;
result.infinity_constant(true);
return result;
}
void Float4::infinity_constant(bool negative)
{
double inf = negative ? -INFINITY : INFINITY;
double constantVector[4] = {inf, inf, inf, inf};
storeValue(Nucleus::createConstantVector(constantVector, getType()));
}
void Float4::constant(float x, float y, float z, float w)
{
// See Float(float) constructor for the rationale behind this assert.
......@@ -4190,6 +4250,11 @@ namespace rr
return RValue<Float4>(createSwizzle4(x.value, select));
}
RValue<Float4> Blend(RValue<Float4> x, RValue<Float4> y, unsigned short select)
{
return RValue<Float4>(createBlend4(x.value, y.value, select));
}
RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
{
int shuffle[4] =
......
......@@ -1946,6 +1946,7 @@ namespace rr
RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
RValue<Int> SignMask(RValue<Int4> x);
RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select);
RValue<Int4> Blend(RValue<Int4> x, RValue<Int4> y, unsigned short select);
RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
......@@ -2030,6 +2031,7 @@ namespace rr
RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
// RValue<UInt4> RoundInt(RValue<Float4> cast);
RValue<UInt4> Swizzle(RValue<UInt4> x, unsigned char select);
RValue<UInt4> Blend(RValue<UInt4> x, RValue<UInt4> y, unsigned short select);
class Half : public LValue<Half>
{
......@@ -2227,9 +2229,11 @@ namespace rr
RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
static Type *getType();
static Float4 negative_inf();
static Float4 positive_inf();
private:
void constant(float x, float y, float z, float w);
void infinity_constant(bool negative);
};
RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
......@@ -2254,6 +2258,7 @@ namespace rr
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i);
RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select);
RValue<Float4> Blend(RValue<Float4> x, RValue<Float4> y, unsigned short select);
RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm);
RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
......
......@@ -470,6 +470,106 @@ TEST(ReactorUnitTests, Swizzle)
}
TEST(ReactorUnitTests, Blend)
{
{
// |select| is [0aaa:0bbb:0ccc:0ddd] where |aaa|, |bbb|, |ccc|
// and |ddd| are 3-bit selection indices, for a total of (1 << 12)
// possibilities.
const int kSelectRange = 1 << 12;
// Unfortunately, testing the whole kSelectRange results in a test
// that is far too slow to run, because LLVM spends exponentially more
// time optimizing the function below as the number of test cases
// increases.
//
// To work around the problem, only test a subset of the range by
// stepping through it in increments of kRangeIncrement.
//
// Set this value to 1 if you want to test the whole implementation,
// which will take a little less than 2 minutes on a fast workstation.
//
// The default value here takes about 1390 ms, which is a little more than
// what the Swizzle test takes (993 ms) on my machine. A non-power-of-2
// value ensures a better spread over possible values.
const int kRangeIncrement = 11;
auto rangeIndexToSelect = [](int i) {
return static_cast<unsigned short>(
(((i >> 9) & 7) << 0) |
(((i >> 6) & 7) << 4) |
(((i >> 3) & 7) << 8) |
(((i >> 0) & 7) << 12)
);
};
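// For example, i = 11 (the first non-zero index visited with the default
// kRangeIncrement) yields select = 0x3100: lane 0 reads input index 3 and
// lane 1 reads input index 1, while lanes 2 and 3 read index 0.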
FunctionT<int(void*)> function;
{
Pointer<Byte> out = function.Arg<0>();
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
unsigned short select = rangeIndexToSelect(i);
*Pointer<Float4>(out + 16 * i) = Blend(Float4(1.0f, 2.0f, 3.0f, 4.0f),
Float4(5.0f, 6.0f, 7.0f, 8.0f),
select);
*Pointer<Int4>(out + (kSelectRange + i) * 16) = Blend(Int4(10, 11, 12, 13),
Int4(14, 15, 16, 17),
select);
*Pointer<UInt4>(out + (2 * kSelectRange + i) * 16) = Blend(UInt4(100, 101, 102, 103),
UInt4(104, 105, 106, 107),
select);
}
Return(0);
}
auto routine = function("one");
if(routine)
{
struct
{
float f[kSelectRange][4];
int i[kSelectRange][4];
unsigned u[kSelectRange][4];
} out;
memset(&out, 0, sizeof(out));
routine(&out);
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.f[i][0], float(1.0f + (i & 7)));
EXPECT_EQ(out.f[i][1], float(1.0f + ((i >> 3) & 7)));
EXPECT_EQ(out.f[i][2], float(1.0f + ((i >> 6) & 7)));
EXPECT_EQ(out.f[i][3], float(1.0f + ((i >> 9) & 7)));
}
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.i[i][0], int(10 + (i & 7)));
EXPECT_EQ(out.i[i][1], int(10 + ((i >> 3) & 7)));
EXPECT_EQ(out.i[i][2], int(10 + ((i >> 6) & 7)));
EXPECT_EQ(out.i[i][3], int(10 + ((i >> 9) & 7)));
}
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.u[i][0], unsigned(100 + (i & 7)));
EXPECT_EQ(out.u[i][1], unsigned(100 + ((i >> 3) & 7)));
EXPECT_EQ(out.u[i][2], unsigned(100 + ((i >> 6) & 7)));
EXPECT_EQ(out.u[i][3], unsigned(100 + ((i >> 9) & 7)));
}
}
}
}
TEST(ReactorUnitTests, Branching)
{
{
......
......@@ -347,6 +347,7 @@ void PhysicalDevice::getProperties(VkPhysicalDeviceSubgroupProperties* propertie
properties->supportedOperations =
VK_SUBGROUP_FEATURE_BASIC_BIT |
VK_SUBGROUP_FEATURE_VOTE_BIT |
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
VK_SUBGROUP_FEATURE_BALLOT_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
......