Commit b9f03f47 authored by David 'Digit' Turner, committed by David Turner

[vulkan] Add VK_SUBGROUP_FEATURE_ARITHMETIC_BIT support

As the name suggests, this CL adds many operations related to arithmetic
subgroup operations, i.e. Reduction / Inclusive Scan / Exclusive Scan for:

  - IntAdd, UIntAdd, FloatAdd
  - IntMul, UIntMul, FloatMul
  - IntMin, UIntMin, FloatMin
  - IntMax, UIntMax, FloatMax
  - BitwiseAnd, BitwiseOr, BitwiseXor
  - LogicalAnd, LogicalOr, LogicalXor

The implementation uses a single template function to implement all of
these, based on the fact that they are all binary commutative operations.

NOTE: Only 32-bit values are supported.

To make scans efficient, a new Reactor operation named Blend() is
introduced. It mixes two input vectors using four 3-bit indices (encoded
in a single 16-bit value) to select the result's lane values. A new
unit-test is added to ReactorUnitTests to check its behaviour.
Unfortunately, the test takes about 2 minutes on a fast workstation when
doing a full scan, so by default it only checks 1/11th of all possible
values (see comments in the patch for more details).

Also, Float4::positive_inf() and Float4::negative_inf() methods were
added, since trying to build Float4(INFINITY) will trigger a DCHECK() in
the Float4(float) constructor, and the infinity values are required by
the subgroup floating-point scan operations.

Bug: b/142002682
Test: dEQP-VK.subgroups.arithmetic.*
Change-Id: I86f509fc47f7475ca126615ed698ee493ae835ef
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/38929
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: David Turner <digit@google.com>
parent f6a128b6
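As a quick illustration of the Blend() encoding described above (a sketch only; the select value and results mirror the createBlend4() examples and the new unit-test in this patch):

    // Each hex digit of |select| picks one result lane, highest digit first:
    // indices 0-3 read from the first vector, 4-7 from the second.
    Float4 r = Blend(Float4(1.0f, 2.0f, 3.0f, 4.0f),
                     Float4(5.0f, 6.0f, 7.0f, 8.0f),
                     0x4012);  // r == [5.0f, 1.0f, 2.0f, 3.0f]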
......@@ -363,6 +363,7 @@ namespace sw
case spv::CapabilityDerivativeControl: capabilities.DerivativeControl = true; break;
case spv::CapabilityGroupNonUniform: capabilities.GroupNonUniform = true; break;
case spv::CapabilityGroupNonUniformVote: capabilities.GroupNonUniformVote = true; break;
case spv::CapabilityGroupNonUniformArithmetic: capabilities.GroupNonUniformArithmetic = true; break;
case spv::CapabilityGroupNonUniformBallot: capabilities.GroupNonUniformBallot = true; break;
case spv::CapabilityGroupNonUniformShuffle: capabilities.GroupNonUniformShuffle = true; break;
case spv::CapabilityGroupNonUniformShuffleRelative: capabilities.GroupNonUniformShuffleRelative = true; break;
......@@ -627,6 +628,22 @@ namespace sw
case spv::OpGroupNonUniformShuffleXor:
case spv::OpGroupNonUniformShuffleUp:
case spv::OpGroupNonUniformShuffleDown:
case spv::OpGroupNonUniformIAdd:
case spv::OpGroupNonUniformFAdd:
case spv::OpGroupNonUniformIMul:
case spv::OpGroupNonUniformFMul:
case spv::OpGroupNonUniformSMin:
case spv::OpGroupNonUniformUMin:
case spv::OpGroupNonUniformFMin:
case spv::OpGroupNonUniformSMax:
case spv::OpGroupNonUniformUMax:
case spv::OpGroupNonUniformFMax:
case spv::OpGroupNonUniformBitwiseAnd:
case spv::OpGroupNonUniformBitwiseOr:
case spv::OpGroupNonUniformBitwiseXor:
case spv::OpGroupNonUniformLogicalAnd:
case spv::OpGroupNonUniformLogicalOr:
case spv::OpGroupNonUniformLogicalXor:
case spv::OpCopyObject:
case spv::OpArrayLength:
// Instructions that yield an intermediate value or divergent pointer
......@@ -1865,6 +1882,22 @@ namespace sw
case spv::OpGroupNonUniformShuffleXor:
case spv::OpGroupNonUniformShuffleUp:
case spv::OpGroupNonUniformShuffleDown:
case spv::OpGroupNonUniformIAdd:
case spv::OpGroupNonUniformFAdd:
case spv::OpGroupNonUniformIMul:
case spv::OpGroupNonUniformFMul:
case spv::OpGroupNonUniformSMin:
case spv::OpGroupNonUniformUMin:
case spv::OpGroupNonUniformFMin:
case spv::OpGroupNonUniformSMax:
case spv::OpGroupNonUniformUMax:
case spv::OpGroupNonUniformFMax:
case spv::OpGroupNonUniformBitwiseAnd:
case spv::OpGroupNonUniformBitwiseOr:
case spv::OpGroupNonUniformBitwiseXor:
case spv::OpGroupNonUniformLogicalAnd:
case spv::OpGroupNonUniformLogicalOr:
case spv::OpGroupNonUniformLogicalXor:
return EmitGroupNonUniform(insn, state);
case spv::OpArrayLength:
......
......@@ -499,6 +499,7 @@ namespace sw
bool GroupNonUniformBallot : 1;
bool GroupNonUniformShuffle : 1;
bool GroupNonUniformShuffleRelative : 1;
bool GroupNonUniformArithmetic : 1;
bool DeviceGroup : 1;
bool MultiView : 1;
};
......@@ -1088,6 +1089,8 @@ namespace sw
// Returns 0 when invalid.
static VkShaderStageFlagBits executionModelToStage(spv::ExecutionModel model);
struct GroupOps;
};
class SpirvRoutine
......
......@@ -18,6 +18,64 @@
namespace sw {
struct SpirvShader::GroupOps {
// Template function to perform a binary operation.
// |TYPE| should be the type of the identity value (as a SIMD::<Type>).
// |APPLY| should be a callable object that takes two RValue<TYPE> parameters
// and returns a new RValue<TYPE> corresponding to the operation's result.
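//
// For example (a sketch mirroring the OpGroupNonUniformIAdd case in
// EmitGroupNonUniform below):
//
//   GroupOps::BinaryOperation(
//       this, insn, state, dst, SIMD::Int(0) /* identity */,
//       [](RValue<SIMD::Int> a, RValue<SIMD::Int> b) { return a + b; });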
template <typename TYPE, typename APPLY>
static void BinaryOperation(
const SpirvShader* shader,
const SpirvShader::InsnIterator& insn,
const SpirvShader::EmitState* state,
Intermediate& dst,
const TYPE& identity,
APPLY&& apply)
{
SpirvShader::GenericValue value(shader, state, insn.word(5));
auto &type = shader->getType(SpirvShader::Type::ID(insn.word(1)));
for (auto i = 0u; i < type.sizeInComponents; i++)
{
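// Mask off inactive lanes by replacing their values with the identity, so
// they cannot influence the reduction or scan results of the active lanes.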
auto mask = As<SIMD::UInt>(state->activeLaneMask());
SIMD::UInt v_uint = (value.UInt(i) & mask) | (As<SIMD::UInt>(identity) & ~mask);
TYPE v = As<TYPE>(v_uint);
switch (spv::GroupOperation(insn.word(4)))
{
case spv::GroupOperationReduce:
{
// NOTE: floating-point add and multiply are not associative, so reduce in a
// single fixed order to ensure that all values in the final lanes are identical.
TYPE v2 = apply(v.xxzz, v.yyww); // [xy] [xy] [zw] [zw]
TYPE v3 = apply(v2.xxxx, v2.zzzz); // [xyzw] [xyzw] [xyzw] [xyzw]
dst.move(i, v3);
break;
}
case spv::GroupOperationInclusiveScan:
{
TYPE v2 = apply(v, Blend(v, identity, 0x4012) /* [id, v.y, v.z, v.w] */); // [x] [xy] [yz] [zw]
TYPE v3 = apply(v2, Blend(v2, identity, 0x4401) /* [id, id, v2.x, v2.y] */); // [x] [xy] [xyz] [xyzw]
dst.move(i, v3);
break;
}
case spv::GroupOperationExclusiveScan:
{
TYPE v2 = apply(v, Blend(v, identity, 0x4012) /* [id, v.y, v.z, v.w] */); // [x] [xy] [yz] [zw]
TYPE v3 = apply(v2, Blend(v2, identity, 0x4401) /* [id, id, v2.x, v2.y] */); // [x] [xy] [xyz] [xyzw]
auto v4 = Blend(v3, identity, 0x4012) /* [id, v3.x, v3.y, v3.z] */; // [id] [x] [xy] [xyz]
dst.move(i, v4);
break;
}
default:
UNIMPLEMENTED("EmitGroupNonUniform op: %s Group operation: %d",
SpirvShader::OpcodeName(type.opcode()).c_str(), insn.word(4));
}
}
}
};
SpirvShader::EmitResult SpirvShader::EmitGroupNonUniform(InsnIterator insn, EmitState *state) const
{
static_assert(SIMD::Width == 4, "EmitGroupNonUniform makes many assumptions that the SIMD vector width is 4");
......@@ -258,10 +316,195 @@ SpirvShader::EmitResult SpirvShader::EmitGroupNonUniform(InsnIterator insn, Emit
break;
}
case spv::OpGroupNonUniformIAdd:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a + b; }
);
break;
}
case spv::OpGroupNonUniformFAdd:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0.),
[](RValue<Type>a, RValue<Type>b){ return a + b; }
);
break;
}
case spv::OpGroupNonUniformIMul:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(1),
[](RValue<Type>a, RValue<Type>b){ return a * b; }
);
break;
}
case spv::OpGroupNonUniformFMul:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(1.),
[](RValue<Type>a, RValue<Type>b){ return a * b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseAnd:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){ return a & b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseOr:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a | b; }
);
break;
}
case spv::OpGroupNonUniformBitwiseXor:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return a ^ b; }
);
break;
}
case spv::OpGroupNonUniformSMin:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(INT32_MAX),
[](RValue<Type>a, RValue<Type>b){ return Min(a, b); }
);
break;
}
case spv::OpGroupNonUniformUMin:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){ return Min(a, b); }
);
break;
}
case spv::OpGroupNonUniformFMin:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type::positive_inf(),
[](RValue<Type>a, RValue<Type>b){ return NMin(a, b); }
);
break;
}
case spv::OpGroupNonUniformSMax:
{
using Type = SIMD::Int;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(INT32_MIN),
[](RValue<Type>a, RValue<Type>b){ return Max(a, b); }
);
break;
}
case spv::OpGroupNonUniformUMax:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){ return Max(a, b); }
);
break;
}
case spv::OpGroupNonUniformFMax:
{
using Type = SIMD::Float;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type::negative_inf(),
[](RValue<Type>a, RValue<Type>b){ return NMax(a, b); }
);
break;
}
case spv::OpGroupNonUniformLogicalAnd:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(~0u),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) & CmpNEQ(b, zero);
}
);
break;
}
case spv::OpGroupNonUniformLogicalOr:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) | CmpNEQ(b, zero);
}
);
break;
}
case spv::OpGroupNonUniformLogicalXor:
{
using Type = SIMD::UInt;
SpirvShader::GroupOps::BinaryOperation(
this, insn, state, dst,
Type(0),
[](RValue<Type>a, RValue<Type>b){
SIMD::UInt zero = SIMD::UInt(0);
return CmpNEQ(a, zero) ^ CmpNEQ(b, zero);
}
);
break;
}
default:
UNIMPLEMENTED("EmitGroupNonUniform op: %s", OpcodeName(type.opcode()).c_str());
}
return EmitResult::Continue;
}
} // namespace sw
\ No newline at end of file
} // namespace sw
......@@ -108,6 +108,35 @@ namespace rr
unmaterializedVariables.clear();
}
// NOTE: Only 12 bits out of 16 of the |select| value are used.
// More specifically, the value should look like:
//
// msb lsb
// v v
// [.aaa|.bbb|.ccc|.ddd] where '.' means an ignored bit
//
// This format makes it easy to write calls with hexadecimal select values,
// since each hex digit is a separate swizzle index. Note that the order
// of indices is reversed compared to createSwizzle4() below!
//
// For example:
// createBlend4( [a,b,c,d], [e,f,g,h], 0x0123 ) -> [a,b,c,d]
// createBlend4( [a,b,c,d], [e,f,g,h], 0x4567 ) -> [e,f,g,h]
// createBlend4( [a,b,c,d], [e,f,g,h], 0x4012 ) -> [e,a,b,c]
//
static Value *createBlend4(Value *lhs, Value *rhs, unsigned short select)
{
int swizzle[4] =
{
(select >> 12) & 0x07,
(select >> 8) & 0x07,
(select >> 4) & 0x07,
(select >> 0) & 0x07,
};
return Nucleus::createShuffleVector(lhs, rhs, swizzle);
}
static Value *createSwizzle4(Value *val, unsigned char select)
{
int swizzle[4] =
......@@ -3481,6 +3510,11 @@ namespace rr
return RValue<Int4>(createSwizzle4(x.value, select));
}
RValue<Int4> Blend(RValue<Int4> x, RValue<Int4> y, unsigned short select)
{
return RValue<Int4>(createBlend4(x.value, y.value, select));
}
UInt4::UInt4() : XYZW(this)
{
}
......@@ -3716,6 +3750,11 @@ namespace rr
return RValue<UInt4>(createSwizzle4(x.value, select));
}
RValue<UInt4> Blend(RValue<UInt4> x, RValue<UInt4> y, unsigned short select)
{
return RValue<UInt4>(createBlend4(x.value, y.value, select));
}
Half::Half(RValue<Float> cast)
{
UInt fp32i = As<UInt>(cast);
......@@ -3805,7 +3844,7 @@ namespace rr
// being reinterpreted as float and then bitcast to integer again,
// which does not guarantee preserving the integer value.
//
// Should inifinty and NaN constants be required, methods like
// Should infinity and NaN constants be required, methods like
// infinity(), quiet_NaN(), and signaling_NaN() should be added
// to the Float class.
ASSERT(std::isfinite(x));
......@@ -4026,6 +4065,27 @@ namespace rr
constant(x, y, z, w);
}
Float4 Float4::positive_inf()
{
Float4 result;
result.infinity_constant(false);
return result;
}
Float4 Float4::negative_inf()
{
Float4 result;
result.infinity_constant(true);
return result;
}
void Float4::infinity_constant(bool negative)
{
double inf = negative ? -INFINITY : INFINITY;
double constantVector[4] = {inf, inf, inf, inf};
storeValue(Nucleus::createConstantVector(constantVector, getType()));
}
void Float4::constant(float x, float y, float z, float w)
{
// See Float(float) constructor for the rationale behind this assert.
......@@ -4190,6 +4250,11 @@ namespace rr
return RValue<Float4>(createSwizzle4(x.value, select));
}
RValue<Float4> Blend(RValue<Float4> x, RValue<Float4> y, unsigned short select)
{
return RValue<Float4>(createBlend4(x.value, y.value, select));
}
RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
{
int shuffle[4] =
......
......@@ -1946,6 +1946,7 @@ namespace rr
RValue<Int4> Insert(RValue<Int4> val, RValue<Int> element, int i);
RValue<Int> SignMask(RValue<Int4> x);
RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select);
RValue<Int4> Blend(RValue<Int4> x, RValue<Int4> y, unsigned short select);
RValue<Int4> MulHigh(RValue<Int4> x, RValue<Int4> y);
class UInt4 : public LValue<UInt4>, public XYZW<UInt4>
......@@ -2030,6 +2031,7 @@ namespace rr
RValue<UInt4> Insert(RValue<UInt4> val, RValue<UInt> element, int i);
// RValue<UInt4> RoundInt(RValue<Float4> cast);
RValue<UInt4> Swizzle(RValue<UInt4> x, unsigned char select);
RValue<UInt4> Blend(RValue<UInt4> x, RValue<UInt4> y, unsigned short select);
class Half : public LValue<Half>
{
......@@ -2227,9 +2229,11 @@ namespace rr
RValue<Float4> operator=(const Swizzle4<Float4, T> &rhs);
static Type *getType();
static Float4 negative_inf();
static Float4 positive_inf();
private:
void constant(float x, float y, float z, float w);
void infinity_constant(bool negative);
};
RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs);
......@@ -2254,6 +2258,7 @@ namespace rr
RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i);
RValue<Float> Extract(RValue<Float4> x, int i);
RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select);
RValue<Float4> Blend(RValue<Float4> x, RValue<Float4> y, unsigned short select);
RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm);
RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y);
RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y);
......
......@@ -470,6 +470,106 @@ TEST(ReactorUnitTests, Swizzle)
}
TEST(ReactorUnitTests, Blend)
{
{
// |select| is [0aaa:0bbb:0ccc:0ddd] where |aaa|, |bbb|, |ccc|
// and |ddd| are 3-bit selection indices, for a total of (1 << 12)
// possibilities.
const int kSelectRange = 1 << 12;
// Unfortunately, testing the whole kSelectRange results in a test
// that is far too slow to run, because LLVM spends exponentially more
// time optimizing the function below as the number of test cases
// increases.
//
// To work around the problem, only test a subset of the range by
// stepping through it in increments of kRangeIncrement.
//
// Set this value to 1 if you want to test the whole implementation,
// which will take a little less than 2 minutes on a fast workstation.
//
// The default value here takes about 1390 ms, which is a little more than
// what the Swizzle test takes (993 ms) on my machine. A non-power-of-2
// value ensures a better spread over possible values.
const int kRangeIncrement = 11;
auto rangeIndexToSelect = [](int i) {
return static_cast<unsigned short>(
(((i >> 9) & 7) << 0) |
(((i >> 6) & 7) << 4) |
(((i >> 3) & 7) << 8) |
(((i >> 0) & 7) << 12)
);
};
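// For example, i = 11 (the first non-zero index visited with the default
// kRangeIncrement) yields select = 0x3100: lane 0 reads input index 3 and
// lane 1 reads input index 1, while lanes 2 and 3 read index 0.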
FunctionT<int(void*)> function;
{
Pointer<Byte> out = function.Arg<0>();
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
unsigned short select = rangeIndexToSelect(i);
*Pointer<Float4>(out + 16 * i) = Blend(Float4(1.0f, 2.0f, 3.0f, 4.0f),
Float4(5.0f, 6.0f, 7.0f, 8.0f),
select);
*Pointer<Int4>(out + (kSelectRange + i) * 16) = Blend(Int4(10, 11, 12, 13),
Int4(14, 15, 16, 17),
select);
*Pointer<UInt4>(out + (2 * kSelectRange + i) * 16) = Blend(UInt4(100, 101, 102, 103),
UInt4(104, 105, 106, 107),
select);
}
Return(0);
}
auto routine = function("one");
if(routine)
{
struct
{
float f[kSelectRange][4];
int i[kSelectRange][4];
unsigned u[kSelectRange][4];
} out;
memset(&out, 0, sizeof(out));
routine(&out);
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.f[i][0], float(1.0f + (i & 7)));
EXPECT_EQ(out.f[i][1], float(1.0f + ((i >> 3) & 7)));
EXPECT_EQ(out.f[i][2], float(1.0f + ((i >> 6) & 7)));
EXPECT_EQ(out.f[i][3], float(1.0f + ((i >> 9) & 7)));
}
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.i[i][0], int(10 + (i & 7)));
EXPECT_EQ(out.i[i][1], int(10 + ((i >> 3) & 7)));
EXPECT_EQ(out.i[i][2], int(10 + ((i >> 6) & 7)));
EXPECT_EQ(out.i[i][3], int(10 + ((i >> 9) & 7)));
}
for(int i = 0; i < kSelectRange; i += kRangeIncrement)
{
EXPECT_EQ(out.u[i][0], unsigned(100 + (i & 7)));
EXPECT_EQ(out.u[i][1], unsigned(100 + ((i >> 3) & 7)));
EXPECT_EQ(out.u[i][2], unsigned(100 + ((i >> 6) & 7)));
EXPECT_EQ(out.u[i][3], unsigned(100 + ((i >> 9) & 7)));
}
}
}
}
TEST(ReactorUnitTests, Branching)
{
{
......
......@@ -347,6 +347,7 @@ void PhysicalDevice::getProperties(VkPhysicalDeviceSubgroupProperties* propertie
properties->supportedOperations =
VK_SUBGROUP_FEATURE_BASIC_BIT |
VK_SUBGROUP_FEATURE_VOTE_BIT |
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT |
VK_SUBGROUP_FEATURE_BALLOT_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_BIT |
VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT;
......