Commit ecfeede6 by Ben Clayton, committed by Chris Forbes

SpirvShader: Implement OpControlBarrier.

Use the new coroutines to yield when hitting an OpControlBarrier. A barrier pushes the shader subgroup to the end of the compute invocation queue, which forces all subgroups to reach the barrier before any of them continue.

Tests: dEQP-VK.spirv_assembly.instruction.compute.workgroup_memory.*
Tests: dEQP-VK.subgroups.basic.compute.*
Tests: dEQP-VK.compute.basic.*
Bug: b/131672705
Bug: b/132232716
Change-Id: Id78be9ce9d9455cb2cb7254482568985845b8b6a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30851
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Tested-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
parent b16c5867
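
The queue-based scheduling described in the message can be pictured with a small standalone C++ toy (hypothetical names, not SwiftShader code): each "subgroup" runs until its next barrier, yields, and is pushed to the back of a FIFO queue, so every subgroup reaches barrier N before any of them starts the work after it.

#include <iostream>
#include <queue>

struct Subgroup
{
    int id;
    int phase = 0;  // number of barriers this subgroup has passed
};

int main()
{
    constexpr int numSubgroups = 4;
    constexpr int numBarriers = 2;

    std::queue<Subgroup> queue;
    for (int i = 0; i < numSubgroups; i++) { queue.push({i}); }

    // Round-robin: a subgroup that yields at a barrier goes to the back of
    // the queue. FIFO order guarantees every subgroup prints "waiting at
    // barrier 0" before any prints "waiting at barrier 1", and so on.
    while (!queue.empty())
    {
        Subgroup sg = queue.front();
        queue.pop();

        if (sg.phase < numBarriers)
        {
            std::cout << "subgroup " << sg.id << " waiting at barrier " << sg.phase << "\n";
            sg.phase++;
            queue.push(sg);  // re-queued; resumes after the others catch up
        }
        else
        {
            std::cout << "subgroup " << sg.id << " finished\n";
        }
    }
    return 0;
}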
@@ -18,6 +18,8 @@
 #include "Vulkan/VkDebug.hpp"
 #include "Vulkan/VkPipelineLayout.hpp"
+#include <queue>
+
 namespace
 {
     enum { X, Y, Z };
@@ -154,17 +156,18 @@ namespace sw
 void ComputeProgram::emit()
 {
+    Int workgroupX = Arg<1>();
+    Int workgroupY = Arg<2>();
+    Int workgroupZ = Arg<3>();
+    Pointer<Byte> workgroupMemory = Arg<4>();
+    Int firstSubgroup = Arg<5>();
+    Int subgroupCount = Arg<6>();
+
     routine.descriptorSets = data + OFFSET(Data, descriptorSets);
     routine.descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
     routine.pushConstants = data + OFFSET(Data, pushConstants);
     routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
-    routine.workgroupMemory = *Pointer<Pointer<Byte>>(data + OFFSET(Data, workgroupMemory));
-
-    Int workgroupX = Arg<1>();
-    Int workgroupY = Arg<2>();
-    Int workgroupZ = Arg<3>();
-    Int firstSubgroup = Arg<4>();
-    Int subgroupCount = Arg<5>();
+    routine.workgroupMemory = workgroupMemory;

     Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
@@ -210,8 +213,8 @@ namespace sw
 auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;

 // We're sharing a buffer here across all workgroups.
-// We can only do this because we know workgroups are executed
-// serially.
+// We can only do this because we know a single workgroup is in flight
+// at any time.
 std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());

 Data data;
@@ -230,19 +233,51 @@ namespace sw
 data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
 data.pushConstants = pushConstants;
 data.constants = &sw::constants;
-data.workgroupMemory = workgroupMemory.data();
-
-// TODO(bclayton): Split work across threads.
 for (uint32_t groupZ = 0; groupZ < groupCountZ; groupZ++)
 {
     for (uint32_t groupY = 0; groupY < groupCountY; groupY++)
     {
         for (uint32_t groupX = 0; groupX < groupCountX; groupX++)
         {
-            (*this)(&data, groupX, groupY, groupZ, 0, subgroupsPerWorkgroup);
+            // TODO(bclayton): Split work across threads.
+            using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
+            std::queue<Coroutine> coroutines;
+
+            if (shader->getModes().ContainsControlBarriers)
+            {
+                // Make a function call per subgroup so each subgroup
+                // can yield, bringing all subgroups to the barrier
+                // together.
+                for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
+                {
+                    auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
+                    coroutines.push(std::move(coroutine));
+                }
+            }
+            else
+            {
+                auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
+                coroutines.push(std::move(coroutine));
+            }
+
+            while (coroutines.size() > 0)
+            {
+                auto coroutine = std::move(coroutines.front());
+                coroutines.pop();
+
+                SpirvShader::YieldResult result;
+                if (coroutine->await(result))
+                {
+                    // TODO: Consider result (when the enum is more than 1 entry).
+                    coroutines.push(std::move(coroutine));
+                }
+            }
         } // groupX
     } // groupY
 } // groupZ
 }
 } // namespace sw
@@ -37,11 +37,12 @@ namespace sw
 struct Constants;

 // ComputeProgram builds a SPIR-V compute shader.
-class ComputeProgram : public Coroutine<int(
+class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
     void* data,
     int32_t workgroupX,
     int32_t workgroupY,
     int32_t workgroupZ,
+    void* workgroupMemory,
     int32_t firstSubgroup,
     int32_t subgroupCount)>
 {
@@ -80,7 +81,6 @@ namespace sw
     uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
     PushConstantStorage pushConstants;
     const Constants *constants;
-    uint8_t* workgroupMemory;
 };

 SpirvRoutine routine;
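
For context, the Coroutine<...> base class changed above comes from Reactor. Below is a sketch of the usage pattern the dispatch loop relies on, inferred from the calls visible in this change (operator() returning a std::unique_ptr<rr::Stream<T>>, Stream::await(), rr::Yield()); treat the exact API details as assumptions rather than documented behavior.

// Build a coroutine that yields two values and then finishes.
#include "Reactor/Coroutine.hpp"

using namespace rr;

void example()
{
    Coroutine<int()> function;
    {
        // Reactor code emitted here forms the coroutine body.
        Yield(Int(0));
        Yield(Int(1));
    }

    // Each call creates an independent stream of yielded values.
    auto coroutine = function();

    int value = 0;
    while (coroutine->await(value))  // assumed: true while the body keeps yielding
    {
        // 'value' holds each yielded result. ComputeProgram interprets it as
        // a SpirvShader::YieldResult (currently only ControlBarrier).
    }
    // await() returning false means the coroutine body has finished.
}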
...
@@ -13,8 +13,9 @@
 // limitations under the License.

 #include "SpirvShader.hpp"
 #include "SamplerCore.hpp"
+#include "Reactor/Coroutine.hpp"
 #include "System/Math.hpp"
 #include "Vulkan/VkBuffer.hpp"
 #include "Vulkan/VkBufferView.hpp"
@@ -884,6 +885,10 @@ namespace sw
     // Don't need to do anything during analysis pass
     break;

+case spv::OpControlBarrier:
+    modes.ContainsControlBarriers = true;
+    break;
+
 case spv::OpExtension:
 {
     auto ext = reinterpret_cast<char const *>(insn.wordPointer(1));
@@ -2462,6 +2467,9 @@ namespace sw
 case spv::OpCopyMemory:
     return EmitCopyMemory(insn, state);

+case spv::OpControlBarrier:
+    return EmitControlBarrier(insn, state);
+
 case spv::OpMemoryBarrier:
     return EmitMemoryBarrier(insn, state);
@@ -4889,6 +4897,11 @@ namespace sw
     return ptr;
 }

+void SpirvShader::Yield(YieldResult res) const
+{
+    rr::Yield(RValue<Int>(int(res)));
+}
+
 SpirvShader::EmitResult SpirvShader::EmitImageRead(InsnIterator insn, EmitState *state) const
 {
     auto &resultType = getType(Type::ID(insn.word(1)));
@@ -5468,6 +5481,29 @@ namespace sw
     return EmitResult::Continue;
 }

+SpirvShader::EmitResult SpirvShader::EmitControlBarrier(InsnIterator insn, EmitState *state) const
+{
+    auto executionScope = spv::Scope(GetConstScalarInt(insn.word(1)));
+    auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(3)));
+
+    // TODO: We probably want to consider the memory scope here. For now,
+    // just always emit the full fence.
+    Fence(semantics);
+
+    switch (executionScope)
+    {
+    case spv::ScopeWorkgroup:
+    case spv::ScopeSubgroup:
+        Yield(YieldResult::ControlBarrier);
+        break;
+    default:
+        // See Vulkan 1.1 spec, Appendix A, Validation Rules within a Module.
+        UNREACHABLE("Scope for execution must be limited to Workgroup or Subgroup");
+        break;
+    }
+
+    return EmitResult::Continue;
+}
+
 SpirvShader::EmitResult SpirvShader::EmitMemoryBarrier(InsnIterator insn, EmitState *state) const
 {
     auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(2)));
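
A note on the operand indices read by EmitControlBarrier above. Per the SPIR-V spec, OpControlBarrier takes three constant-<id> operands, which is why GetConstScalarInt() suffices; the GLSL lowering shown below is typical glslang output, not something taken from this change.

// OpControlBarrier operand layout:
//   word(1): Execution scope  - who must arrive before any invocation proceeds
//   word(2): Memory scope     - scope of the implied memory semantics
//                               (currently ignored here; see the TODO above)
//   word(3): Memory semantics - mask passed to Fence()
//
// A GLSL barrier() typically lowers to:
//   OpControlBarrier %Workgroup %Workgroup %(AcquireRelease|WorkgroupMemory)
// i.e. execution scope Workgroup (spv::ScopeWorkgroup == 2), so the switch
// above takes the Workgroup case and the subgroup yields at the barrier.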
...
@@ -262,6 +262,11 @@ namespace sw
 using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
 using GetImageSampler = ImageSampler*(const vk::ImageView *imageView, const vk::Sampler *sampler);

+enum class YieldResult
+{
+    ControlBarrier,
+};
+
 /* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
 class InsnIterator
 {
@@ -543,6 +548,7 @@ namespace sw
 bool DepthLess : 1;
 bool DepthUnchanged : 1;
 bool ContainsKill : 1;
+bool ContainsControlBarriers : 1;
 bool NeedsCentroid : 1;

 // Compute workgroup dimensions
@@ -934,6 +940,7 @@ namespace sw
 EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
 EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
 EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
+EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
 EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
 EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
@@ -944,6 +951,9 @@ namespace sw
 // Emits a rr::Fence for the given MemorySemanticsMask.
 void Fence(spv::MemorySemanticsMask semantics) const;

+// Helper for calling rr::Yield with res cast to an rr::Int.
+void Yield(YieldResult res) const;
+
 // OpcodeName() returns the name of the opcode op.
 // If NDEBUG is defined, then OpcodeName() will only return the numerical code.
 static std::string OpcodeName(spv::Op op);
...