Commit 4a105591 by Nicolas Capens; committed by Nicolas Capens

Process independent vertex elements

Previously, vertices would be processed in consecutive groups of four (for SSE/NEON). Now four indices are read from the index buffer. Reading the input was already a gather operation, but with constant stride. The vertex cache now performs a scatter. The vertices are written in reverse order so that the first vertex in a group is always present in the cache.

Also use 2^32-1 as the invalid vertex cache index (corresponds with the primitive restart index) instead of 0x80000000, since maxDrawIndexedIndexValue is UINT32_MAX.

Bug: b/27351835
Test: dEQP-VK.glsl.loops.special.do_while_dynamic_iterations.dowhile_trap_vertex
Change-Id: Ic69dbf53c67cbda50e44913ccae91aaca2b86e21
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/32609
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
parent 8bcd1744
......@@ -824,7 +824,7 @@ namespace sw
task->vertexCache.drawCall = primitiveDrawCall;
}
unsigned int batch[128][3]; // FIXME: Adjust to dynamic batch size
unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
if(!indices)
......@@ -862,6 +862,11 @@ namespace sw
}
}
// Repeat the last index to allow for SIMD width overrun.
batch[triangleCount][0] = batch[triangleCount - 1][2];
batch[triangleCount][1] = batch[triangleCount - 1][2];
batch[triangleCount][2] = batch[triangleCount - 1][2];
task->primitiveStart = start;
task->vertexCount = triangleCount * 3;
vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
......
......@@ -25,9 +25,9 @@ namespace sw
{
// Invalidates the whole vertex cache by overwriting every tag with the
// "invalid index" sentinel.
// NOTE: this is a diff rendering without +/- markers. Each changed statement
// appears twice: the removed line first, immediately followed by its replacement.
void VertexCache::clear()
{
for(int i = 0; i < 16; i++) // removed: hard-coded count of 16 tags
for(uint32_t i = 0; i < SIZE; i++) // added: iterate over all VertexCache::SIZE tags
{
tag[i] = 0x80000000; // removed: previous invalid-tag sentinel
tag[i] = 0xFFFFFFFF; // added: 2^32-1, which per the commit message corresponds with the primitive restart index
}
}
......
......@@ -25,12 +25,16 @@ namespace sw
{
struct DrawData;
// NOTE: this is a diff rendering without +/- markers. The next line is the
// removed struct header; the comment and header that follow are its replacement.
struct VertexCache // FIXME: Variable size
// Basic direct mapped vertex cache.
struct VertexCache
{
// Number of cache slots.
static constexpr uint32_t SIZE = 64; // TODO: Variable size?
// Mask applied to a vertex index (index & TAG_MASK) to select its cache slot.
static constexpr uint32_t TAG_MASK = SIZE - 1; // Size must be power of 2.
// Resets every tag to the invalid-index sentinel.
void clear();
// Removed layout: 16 cache lines of 4 vertices each, with one tag per line.
Vertex vertex[16][4];
unsigned int tag[16];
// Added layout: one vertex and one tag per slot; the tag holds the full
// vertex index stored in that slot.
Vertex vertex[SIZE];
uint32_t tag[SIZE];
// Assigned from primitiveDrawCall by the renderer; presumably identifies the
// draw call the cached vertices belong to — confirm against the caller.
int drawCall;
};
......
......@@ -73,17 +73,23 @@ namespace sw
{
}
void VertexProgram::program(UInt &index)
void VertexProgram::program(Pointer<UInt> &batch)
{
auto it = spirvShader->inputBuiltins.find(spv::BuiltInVertexIndex);
if (it != spirvShader->inputBuiltins.end())
{
assert(it->second.SizeInComponents == 1);
Int4 indices;
indices = Insert(indices, As<Int>(batch[0]), 0);
indices = Insert(indices, As<Int>(batch[1]), 1);
indices = Insert(indices, As<Int>(batch[2]), 2);
indices = Insert(indices, As<Int>(batch[3]), 3);
routine.getVariable(it->second.Id)[it->second.FirstComponent] =
As<Float4>(Int4(As<Int>(index) + *Pointer<Int>(data + OFFSET(DrawData, baseVertex))) + Int4(0, 1, 2, 3));
As<Float4>(indices + Int4(*Pointer<Int>(data + OFFSET(DrawData, baseVertex))));
}
auto activeLaneMask = SIMD::Int(0xFFFFFFFF); // TODO: Control this.
auto activeLaneMask = SIMD::Int(0xFFFFFFFF);
spirvShader->emit(&routine, activeLaneMask, descriptorSets);
spirvShader->emitEpilog(&routine);
......
......@@ -34,7 +34,7 @@ namespace sw
virtual ~VertexProgram();
private:
void program(UInt &index) override;
void program(Pointer<UInt> &batch) override;
const vk::DescriptorSet::Bindings &descriptorSets;
};
......
......@@ -42,33 +42,32 @@ namespace sw
{
Pointer<Byte> cache = task + OFFSET(VertexTask,vertexCache);
Pointer<Byte> vertexCache = cache + OFFSET(VertexCache,vertex);
Pointer<Byte> tagCache = cache + OFFSET(VertexCache,tag);
Pointer<UInt> tagCache = Pointer<UInt>(cache + OFFSET(VertexCache,tag));
UInt vertexCount = *Pointer<UInt>(task + OFFSET(VertexTask,vertexCount));
constants = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData,constants));
// Check the cache one vertex index at a time. If a hit occurs, copy from the cache to the 'vertex' output buffer.
// On a cache miss, process a SIMD width of consecutive indices from the input batch. They're written to the cache
// in reverse order to guarantee that the first one doesn't get evicted and can be written out.
Do
{
UInt index = *batch;
UInt tagIndex = index & 0x0000003C;
UInt indexQ = index & 0xFFFFFFFC;
UInt cacheIndex = index & VertexCache::TAG_MASK;
If(*Pointer<UInt>(tagCache + tagIndex) != indexQ)
If(tagCache[cacheIndex] != index)
{
*Pointer<UInt>(tagCache + tagIndex) = indexQ;
readInput(indexQ);
program(indexQ);
readInput(batch);
program(batch);
computeClipFlags();
Pointer<Byte> cacheLine0 = vertexCache + tagIndex * UInt((int)sizeof(Vertex));
writeCache(cacheLine0);
writeCache(vertexCache, tagCache, batch);
}
UInt cacheIndex = index & 0x0000003F;
Pointer<Byte> cacheLine = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
writeVertex(vertex, cacheLine);
Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
writeVertex(vertex, cacheEntry);
vertex += sizeof(Vertex);
batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
......@@ -79,7 +78,7 @@ namespace sw
Return();
}
void VertexRoutine::readInput(UInt &index)
void VertexRoutine::readInput(Pointer<UInt> &batch)
{
for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
{
......@@ -88,11 +87,10 @@ namespace sw
spirvShader->inputs[i + 2].Type != SpirvShader::ATTRIBTYPE_UNUSED ||
spirvShader->inputs[i + 3].Type != SpirvShader::ATTRIBTYPE_UNUSED)
{
Pointer<Byte> input = *Pointer<Pointer<Byte>>(data + OFFSET(DrawData, input) + sizeof(void*) * (i / 4));
UInt stride = *Pointer<UInt>(data + OFFSET(DrawData, stride) + sizeof(uint32_t) * (i / 4));
auto value = readStream(input, stride, state.input[i / 4], index);
auto value = readStream(input, stride, state.input[i / 4], batch);
routine.inputs[i + 0] = value.x;
routine.inputs[i + 1] = value.y;
routine.inputs[i + 2] = value.z;
......@@ -134,14 +132,14 @@ namespace sw
clipFlags |= Pointer<Int>(constants + OFFSET(Constants,fini))[SignMask(finiteXYZ)];
}
Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index)
Vector4f VertexRoutine::readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch)
{
Vector4f v;
Pointer<Byte> source0 = buffer + index * stride;
Pointer<Byte> source1 = source0 + stride;
Pointer<Byte> source2 = source1 + stride;
Pointer<Byte> source3 = source2 + stride;
Pointer<Byte> source0 = buffer + batch[0] * stride;
Pointer<Byte> source1 = buffer + batch[1] * stride;
Pointer<Byte> source2 = buffer + batch[2] * stride;
Pointer<Byte> source3 = buffer + batch[3] * stride;
bool isNativeFloatAttrib = (stream.attribType == SpirvShader::ATTRIBTYPE_FLOAT) || stream.normalized;
......@@ -486,8 +484,25 @@ namespace sw
return v;
}
void VertexRoutine::writeCache(Pointer<Byte> &cacheLine)
void VertexRoutine::writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch)
{
UInt index0 = batch[0];
UInt index1 = batch[1];
UInt index2 = batch[2];
UInt index3 = batch[3];
UInt cacheIndex0 = index0 & VertexCache::TAG_MASK;
UInt cacheIndex1 = index1 & VertexCache::TAG_MASK;
UInt cacheIndex2 = index2 & VertexCache::TAG_MASK;
UInt cacheIndex3 = index3 & VertexCache::TAG_MASK;
// We processed a SIMD group of vertices, with the first one being the one that missed the cache tag check.
// Write them out in reverse order here and below to ensure the first one is now guaranteed to be in the cache.
tagCache[cacheIndex3] = index3;
tagCache[cacheIndex2] = index2;
tagCache[cacheIndex1] = index1;
tagCache[cacheIndex0] = index0;
auto it = spirvShader->outputBuiltins.find(spv::BuiltInPosition);
assert(it != spirvShader->outputBuiltins.end());
assert(it->second.SizeInComponents == 4);
......@@ -511,10 +526,10 @@ namespace sw
transpose4x4(pos.x, pos.y, pos.z, pos.w);
*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 0, 16) = pos.x;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 1, 16) = pos.y;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 2, 16) = pos.z;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,position) + sizeof(Vertex) * 3, 16) = pos.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,position), 16) = pos.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,position), 16) = pos.z;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,position), 16) = pos.y;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,position), 16) = pos.x;
it = spirvShader->outputBuiltins.find(spv::BuiltInPointSize);
if(it != spirvShader->outputBuiltins.end())
......@@ -522,23 +537,23 @@ namespace sw
assert(it->second.SizeInComponents == 1);
auto psize = routine.getVariable(it->second.Id)[it->second.FirstComponent];
*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 0) = Extract(psize, 0);
*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 1) = Extract(psize, 1);
*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 2) = Extract(psize, 2);
*Pointer<Float>(cacheLine + OFFSET(Vertex,pointSize) + sizeof(Vertex) * 3) = Extract(psize, 3);
*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,pointSize)) = Extract(psize, 3);
*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,pointSize)) = Extract(psize, 2);
*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,pointSize)) = Extract(psize, 1);
*Pointer<Float>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,pointSize)) = Extract(psize, 0);
}
*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 0) = (clipFlags >> 0) & 0x0000000FF;
*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 1) = (clipFlags >> 8) & 0x0000000FF;
*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 2) = (clipFlags >> 16) & 0x0000000FF;
*Pointer<Int>(cacheLine + OFFSET(Vertex,clipFlags) + sizeof(Vertex) * 3) = (clipFlags >> 24) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 24) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 16) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 8) & 0x0000000FF;
*Pointer<Int>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,clipFlags)) = (clipFlags >> 0) & 0x0000000FF;
transpose4x4(proj.x, proj.y, proj.z, proj.w);
*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 0, 16) = proj.x;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 1, 16) = proj.y;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 2, 16) = proj.z;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,projected) + sizeof(Vertex) * 3, 16) = proj.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,projected), 16) = proj.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,projected), 16) = proj.z;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,projected), 16) = proj.y;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,projected), 16) = proj.x;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS; i += 4)
{
......@@ -555,10 +570,10 @@ namespace sw
transpose4x4(v.x, v.y, v.z, v.w);
*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 0, 16) = v.x;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 1, 16) = v.y;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 2, 16) = v.z;
*Pointer<Float4>(cacheLine + OFFSET(Vertex,v[i]) + sizeof(Vertex) * 3, 16) = v.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex3 + OFFSET(Vertex,v[i]), 16) = v.w;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex2 + OFFSET(Vertex,v[i]), 16) = v.z;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex1 + OFFSET(Vertex,v[i]), 16) = v.y;
*Pointer<Float4>(vertexCache + sizeof(Vertex) * cacheIndex0 + OFFSET(Vertex,v[i]), 16) = v.x;
}
}
}
......
......@@ -62,14 +62,14 @@ namespace sw
SpirvShader const * const spirvShader;
private:
virtual void program(UInt &index) = 0;
virtual void program(Pointer<UInt> &batch) = 0;
typedef VertexProcessor::State::Input Stream;
Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, const UInt &index);
void readInput(UInt &index);
Vector4f readStream(Pointer<Byte> &buffer, UInt &stride, const Stream &stream, Pointer<UInt> &batch);
void readInput(Pointer<UInt> &batch);
void computeClipFlags();
void writeCache(Pointer<Byte> &cacheLine);
void writeCache(Pointer<Byte> &vertexCache, Pointer<UInt> &tagCache, Pointer<UInt> &batch);
void writeVertex(const Pointer<Byte> &vertex, Pointer<Byte> &cacheEntry);
};
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment