Commit cd98c739 by Chris Forbes

SpirvShader: Optimize stores with static equal offsets

This is heavily used in dEQP-VK.ssbo.*. Avoiding generating the scatter is profitable on all non-AVX512-capable targets; ScalarizeMaskedMemIntrin is incredibly slow. Reduces runtime on dEQP-VK.ssbo.layout.random.all_shared_buffer.5 from 24s to 14s on my threadripper (on top of stack of other optimizations). Bug: b/135609394 Change-Id: I2d6840522a5bd30b4fd532b9c7e2a4712879caa9 Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/33289 Tested-by: Chris Forbes <chrisforbes@google.com> Presubmit-Ready: Chris Forbes <chrisforbes@google.com> Reviewed-by: Ben Clayton <bclayton@google.com>
parent 61f2a46a
......@@ -355,6 +355,23 @@ namespace sw
mask &= ptr.isInBounds(sizeof(float)); // Disable OOB writes.
if (!atomic && order == std::memory_order_relaxed)
{
if (ptr.hasStaticEqualOffsets())
{
If (AnyTrue(mask))
{
// All equal. One of these writes will win -- elect the winning lane.
auto v0111 = SIMD::Int(0, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
auto elect = mask & ~(v0111 & (mask.xxyz | mask.xxxy | mask.xxxx));
auto maskedVal = As<SIMD::Int>(val) & elect;
auto scalarVal = Extract(maskedVal, 0) |
Extract(maskedVal, 1) |
Extract(maskedVal, 2) |
Extract(maskedVal, 3);
*rr::Pointer<EL>(ptr.base + ptr.staticOffsets[0], sizeof(float)) = As<EL>(scalarVal);
}
return;
}
if (ptr.hasStaticSequentialOffsets(sizeof(float)))
{
return rr::MaskedStore(rr::Pointer<T>(ptr.base + ptr.staticOffsets[0]), val, mask, sizeof(float));
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment