Commit ecfeede6 by Ben Clayton, committed by Chris Forbes

SpirvShader: Implement OpControlBarrier.

Use the new coroutines to yield when hitting a ControlBarrier. A barrier pushes the shader subgroup to the end of the compute invocation queue, which forces all subgroups to be brought to the fence before continuing. Tests: dEQP-VK.spirv_assembly.instruction.compute.workgroup_memory.* Tests: dEQP-VK.subgroups.basic.compute.* Tests: dEQP-VK.compute.basic.* Bug: b/131672705 Bug: b/132232716 Change-Id: Id78be9ce9d9455cb2cb7254482568985845b8b6a Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/30851 Presubmit-Ready: Ben Clayton <bclayton@google.com> Tested-by: Chris Forbes <chrisforbes@google.com> Reviewed-by: Chris Forbes <chrisforbes@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
parent b16c5867
......@@ -18,6 +18,8 @@
#include "Vulkan/VkDebug.hpp"
#include "Vulkan/VkPipelineLayout.hpp"
#include <queue>
namespace
{
enum { X, Y, Z };
......@@ -154,17 +156,18 @@ namespace sw
void ComputeProgram::emit()
{
Int workgroupX = Arg<1>();
Int workgroupY = Arg<2>();
Int workgroupZ = Arg<3>();
Pointer<Byte> workgroupMemory = Arg<4>();
Int firstSubgroup = Arg<5>();
Int subgroupCount = Arg<6>();
routine.descriptorSets = data + OFFSET(Data, descriptorSets);
routine.descriptorDynamicOffsets = data + OFFSET(Data, descriptorDynamicOffsets);
routine.pushConstants = data + OFFSET(Data, pushConstants);
routine.constants = *Pointer<Pointer<Byte>>(data + OFFSET(Data, constants));
routine.workgroupMemory = *Pointer<Pointer<Byte>>(data + OFFSET(Data, workgroupMemory));
Int workgroupX = Arg<1>();
Int workgroupY = Arg<2>();
Int workgroupZ = Arg<3>();
Int firstSubgroup = Arg<4>();
Int subgroupCount = Arg<5>();
routine.workgroupMemory = workgroupMemory;
Int invocationsPerWorkgroup = *Pointer<Int>(data + OFFSET(Data, invocationsPerWorkgroup));
......@@ -210,8 +213,8 @@ namespace sw
auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
// We're sharing a buffer here across all workgroups.
// We can only do this because we know workgroups are executed
// serially.
// We can only do this because we know a single workgroup is in flight
// at any time.
std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
Data data;
......@@ -230,19 +233,51 @@ namespace sw
data.subgroupsPerWorkgroup = subgroupsPerWorkgroup;
data.pushConstants = pushConstants;
data.constants = &sw::constants;
data.workgroupMemory = workgroupMemory.data();
// TODO(bclayton): Split work across threads.
for (uint32_t groupZ = 0; groupZ < groupCountZ; groupZ++)
{
for (uint32_t groupY = 0; groupY < groupCountY; groupY++)
{
for (uint32_t groupX = 0; groupX < groupCountX; groupX++)
{
(*this)(&data, groupX, groupY, groupZ, 0, subgroupsPerWorkgroup);
}
}
}
// TODO(bclayton): Split work across threads.
using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
std::queue<Coroutine> coroutines;
if (shader->getModes().ContainsControlBarriers)
{
// Make a function call per subgroup so each subgroup
// can yield, bringing all subgroups to the barrier
// together.
for(int subgroupIndex = 0; subgroupIndex < subgroupsPerWorkgroup; subgroupIndex++)
{
auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), subgroupIndex, 1);
coroutines.push(std::move(coroutine));
}
}
else
{
auto coroutine = (*this)(&data, groupX, groupY, groupZ, workgroupMemory.data(), 0, subgroupsPerWorkgroup);
coroutines.push(std::move(coroutine));
}
while (coroutines.size() > 0)
{
auto coroutine = std::move(coroutines.front());
coroutines.pop();
SpirvShader::YieldResult result;
if (coroutine->await(result))
{
// TODO: Consider result (when the enum is more than 1 entry).
coroutines.push(std::move(coroutine));
}
}
} // groupX
} // groupY
} // groupZ
}
} // namespace sw
......@@ -37,11 +37,12 @@ namespace sw
struct Constants;
// ComputeProgram builds a SPIR-V compute shader.
class ComputeProgram : public Coroutine<int(
class ComputeProgram : public Coroutine<SpirvShader::YieldResult(
void* data,
int32_t workgroupX,
int32_t workgroupY,
int32_t workgroupZ,
void* workgroupMemory,
int32_t firstSubgroup,
int32_t subgroupCount)>
{
......@@ -80,7 +81,6 @@ namespace sw
uint32_t invocationsPerWorkgroup; // Total number of invocations per workgroup.
PushConstantStorage pushConstants;
const Constants *constants;
uint8_t* workgroupMemory;
};
SpirvRoutine routine;
......
......@@ -13,8 +13,9 @@
// limitations under the License.
#include "SpirvShader.hpp"
#include "SamplerCore.hpp"
#include "Reactor/Coroutine.hpp"
#include "System/Math.hpp"
#include "Vulkan/VkBuffer.hpp"
#include "Vulkan/VkBufferView.hpp"
......@@ -884,6 +885,10 @@ namespace sw
// Don't need to do anything during analysis pass
break;
case spv::OpControlBarrier:
modes.ContainsControlBarriers = true;
break;
case spv::OpExtension:
{
auto ext = reinterpret_cast<char const *>(insn.wordPointer(1));
......@@ -2462,6 +2467,9 @@ namespace sw
case spv::OpCopyMemory:
return EmitCopyMemory(insn, state);
case spv::OpControlBarrier:
return EmitControlBarrier(insn, state);
case spv::OpMemoryBarrier:
return EmitMemoryBarrier(insn, state);
......@@ -4889,6 +4897,11 @@ namespace sw
return ptr;
}
// Suspends the current shader coroutine via rr::Yield, passing the given
// result code (as an rr::Int) back to the host-side scheduling loop.
void SpirvShader::Yield(YieldResult res) const
{
	auto const code = static_cast<int>(res);
	rr::Yield(RValue<Int>(code));
}
SpirvShader::EmitResult SpirvShader::EmitImageRead(InsnIterator insn, EmitState *state) const
{
auto &resultType = getType(Type::ID(insn.word(1)));
......@@ -5468,6 +5481,29 @@ namespace sw
return EmitResult::Continue;
}
SpirvShader::EmitResult SpirvShader::EmitControlBarrier(InsnIterator insn, EmitState *state) const
{
	// OpControlBarrier operands: word 1 is the execution scope and word 3
	// the memory semantics. Word 2 (the memory scope) is not consulted yet.
	auto const execScope = spv::Scope(GetConstScalarInt(insn.word(1)));
	auto const memSemantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(3)));

	// TODO: We probably want to consider the memory scope here. For now,
	// just always emit the full fence.
	Fence(memSemantics);

	if(execScope == spv::ScopeWorkgroup || execScope == spv::ScopeSubgroup)
	{
		// Suspend this coroutine so every subgroup can be brought to the
		// barrier before any of them continues past it.
		Yield(YieldResult::ControlBarrier);
	}
	else
	{
		// See Vulkan 1.1 spec, Appendix A, Validation Rules within a Module.
		UNREACHABLE("Scope for execution must be limited to Workgroup or Subgroup");
	}

	return EmitResult::Continue;
}
SpirvShader::EmitResult SpirvShader::EmitMemoryBarrier(InsnIterator insn, EmitState *state) const
{
auto semantics = spv::MemorySemanticsMask(GetConstScalarInt(insn.word(2)));
......
......@@ -262,6 +262,11 @@ namespace sw
using ImageSampler = void(void* texture, void *sampler, void* uvsIn, void* texelOut, void* constants);
using GetImageSampler = ImageSampler*(const vk::ImageView *imageView, const vk::Sampler *sampler);
// Result code a shader coroutine passes to rr::Yield when it suspends
// (see SpirvShader::Yield); the host loop inspects it after await().
enum class YieldResult
{
// Execution reached an OpControlBarrier: the invocation is requeued so
// all subgroups arrive at the barrier before any of them resumes.
ControlBarrier,
};
/* Pseudo-iterator over SPIRV instructions, designed to support range-based-for. */
class InsnIterator
{
......@@ -543,6 +548,7 @@ namespace sw
bool DepthLess : 1;
bool DepthUnchanged : 1;
bool ContainsKill : 1;
bool ContainsControlBarriers : 1;
bool NeedsCentroid : 1;
// Compute workgroup dimensions
......@@ -934,6 +940,7 @@ namespace sw
EmitResult EmitAtomicCompareExchange(InsnIterator insn, EmitState *state) const;
EmitResult EmitSampledImageCombineOrSplit(InsnIterator insn, EmitState *state) const;
EmitResult EmitCopyMemory(InsnIterator insn, EmitState *state) const;
EmitResult EmitControlBarrier(InsnIterator insn, EmitState *state) const;
EmitResult EmitMemoryBarrier(InsnIterator insn, EmitState *state) const;
EmitResult EmitGroupNonUniform(InsnIterator insn, EmitState *state) const;
......@@ -944,6 +951,9 @@ namespace sw
// Emits a rr::Fence for the given MemorySemanticsMask.
void Fence(spv::MemorySemanticsMask semantics) const;
// Helper for calling rr::Yield with res cast to an rr::Int.
void Yield(YieldResult res) const;
// OpcodeName() returns the name of the opcode op.
// If NDEBUG is defined, then OpcodeName() will only return the numerical code.
static std::string OpcodeName(spv::Op op);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.