Commit 9f2b6c45 by Alexis Hetu Committed by Alexis Hétu

Point vertex processing optimization

Instead of packing each point in a triangle primitive, tightly pack all point vertices and write them out 3 at a time in the output primitive. This should be roughly twice as fast.

Explanation:

Currently:
Vertices: 0 0 0 1 1 1 2 2 2 ...
Processing:
1) 0 0 0 1
2) 0 (cache hit)
3) 0 (cache hit)
2) 1 (cache hit)
3) 1 (cache hit)
4) 1 (cache hit)
5) 2 2 2 3
...
-> We processed 8 vertices to get points 0 1 2 3

New way:
1) 0 1 2 3
-> We processed 4 vertices to get points 0 1 2 3
2) 4 5 6 7

Will affect these tests once vertexPipelineStoresAndAtomics is enabled:
dEQP-VK.glsl.atomic_operations.*
Note that these tests are affected because they wrongly assume vertices won't be processed more than once. These tests should still get fixed.

Bug b/140294254
Change-Id: Idb21085838317db7b7a6630a18de4d7284534429
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/36349
Presubmit-Ready: Alexis Hétu <sugoi@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Tested-by: Alexis Hétu <sugoi@google.com>
parent 4ba1b04b
...@@ -53,13 +53,17 @@ namespace sw ...@@ -53,13 +53,17 @@ namespace sw
case VK_PRIMITIVE_TOPOLOGY_POINT_LIST: case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
{ {
auto index = start; auto index = start;
auto pointBatch = &(batch[0][0]);
for(unsigned int i = 0; i < triangleCount; i++) for(unsigned int i = 0; i < triangleCount; i++)
{ {
batch[i][0] = indices[index]; *pointBatch++ = indices[index++];
batch[i][1] = indices[index]; }
batch[i][2] = indices[index];
index += 1; // Repeat the last index to allow for SIMD width overrun.
index--;
for(unsigned int i = 0; i < 3; i++)
{
*pointBatch++ = indices[index];
} }
break; break;
} }
...@@ -496,7 +500,8 @@ namespace sw ...@@ -496,7 +500,8 @@ namespace sw
auto& vertexTask = batch->vertexTask; auto& vertexTask = batch->vertexTask;
vertexTask.primitiveStart = batch->firstPrimitive; vertexTask.primitiveStart = batch->firstPrimitive;
vertexTask.vertexCount = batch->numPrimitives * 3; // We're only using batch compaction for points, not lines
vertexTask.vertexCount = batch->numPrimitives * ((draw->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST) ? 1 : 3);
if (vertexTask.vertexCache.drawCall != draw->id) if (vertexTask.vertexCache.drawCall != draw->id)
{ {
vertexTask.vertexCache.clear(); vertexTask.vertexCache.clear();
...@@ -590,10 +595,14 @@ namespace sw ...@@ -590,10 +595,14 @@ namespace sw
} }
} }
// Repeat the last index to allow for SIMD width overrun. // setBatchIndices() takes care of the point case, since it's different due to the compaction
triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2]; if (topology != VK_PRIMITIVE_TOPOLOGY_POINT_LIST)
triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2]; {
triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2]; // Repeat the last index to allow for SIMD width overrun.
triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
}
} }
int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count) int DrawCall::setupSolidTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
......
...@@ -78,6 +78,7 @@ namespace sw ...@@ -78,6 +78,7 @@ namespace sw
State state; State state;
state.shaderID = context->vertexShader->getSerialID(); state.shaderID = context->vertexShader->getSerialID();
state.isPoint = context->topology == VK_PRIMITIVE_TOPOLOGY_POINT_LIST;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++) for(int i = 0; i < MAX_INTERFACE_COMPONENTS / 4; i++)
{ {
......
...@@ -75,6 +75,7 @@ namespace sw ...@@ -75,6 +75,7 @@ namespace sw
}; };
Input input[MAX_INTERFACE_COMPONENTS / 4]; Input input[MAX_INTERFACE_COMPONENTS / 4];
bool isPoint : 1;
}; };
struct State : States struct State : States
......
...@@ -67,9 +67,14 @@ namespace sw ...@@ -67,9 +67,14 @@ namespace sw
} }
Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex)); Pointer<Byte> cacheEntry = vertexCache + cacheIndex * UInt((int)sizeof(Vertex));
writeVertex(vertex, cacheEntry);
vertex += sizeof(Vertex); // For points, vertexCount is 1 per primitive, so duplicate vertex for all 3 vertices of the primitive
for(int i = 0; i < (state.isPoint ? 3 : 1); i++)
{
writeVertex(vertex, cacheEntry);
vertex += sizeof(Vertex);
}
batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t)); batch = Pointer<UInt>(Pointer<Byte>(batch) + sizeof(uint32_t));
vertexCount--; vertexCount--;
} }
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment