Commit cde4dd96 by Ben Clayton

Device: Migrate Renderer to Yarn

Drop the complex task scheduling logic in favor of Yarn. Performance gains of up to ~30% FPS have been seen.

Bug: b/139142453
Change-Id: I264fee36323425a791088565d99dc586670a948a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
parent b5f0a4be
...@@ -19,7 +19,6 @@ ...@@ -19,7 +19,6 @@
#include "Polygon.hpp" #include "Polygon.hpp"
#include "Reactor/Reactor.hpp" #include "Reactor/Reactor.hpp"
#include "Pipeline/Constants.hpp" #include "Pipeline/Constants.hpp"
#include "System/CPUID.hpp"
#include "System/Memory.hpp" #include "System/Memory.hpp"
#include "System/Half.hpp" #include "System/Half.hpp"
#include "System/Math.hpp" #include "System/Math.hpp"
...@@ -33,6 +32,10 @@ ...@@ -33,6 +32,10 @@
#include "Pipeline/SpirvShader.hpp" #include "Pipeline/SpirvShader.hpp"
#include "Vertex.hpp" #include "Vertex.hpp"
#include "Yarn/Containers.hpp"
#include "Yarn/Defer.hpp"
#include "Yarn/Trace.hpp"
#undef max #undef max
#ifndef NDEBUG #ifndef NDEBUG
...@@ -42,11 +45,6 @@ unsigned int maxPrimitives = 1 << 21; ...@@ -42,11 +45,6 @@ unsigned int maxPrimitives = 1 << 21;
namespace sw namespace sw
{ {
static const int batchSize = 128;
std::atomic<int> threadCount(1);
std::atomic<int> Renderer::unitCount(1);
std::atomic<int> Renderer::clusterCount(1);
template<typename T> template<typename T>
inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, T indices, unsigned int start, unsigned int triangleCount) inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, T indices, unsigned int start, unsigned int triangleCount)
{ {
...@@ -138,20 +136,8 @@ namespace sw ...@@ -138,20 +136,8 @@ namespace sw
return true; return true;
} }
struct Parameters
{
Renderer *renderer;
int threadIndex;
};
DrawCall::DrawCall() DrawCall::DrawCall()
{ {
occlusionQuery = nullptr;
references = -1;
events = nullptr;
data = (DrawData*)allocate(sizeof(DrawData)); data = (DrawData*)allocate(sizeof(DrawData));
data->constants = &constants; data->constants = &constants;
} }
...@@ -163,74 +149,14 @@ namespace sw ...@@ -163,74 +149,14 @@ namespace sw
Renderer::Renderer(vk::Device* device) : device(device) Renderer::Renderer(vk::Device* device) : device(device)
{ {
for(int i = 0; i < 16; i++) VertexProcessor::setRoutineCacheSize(1024);
{ PixelProcessor::setRoutineCacheSize(1024);
vertexTask[i] = nullptr; SetupProcessor::setRoutineCacheSize(1024);
worker[i] = nullptr;
resume[i] = nullptr;
suspend[i] = nullptr;
}
threadsAwake = 0;
resumeApp = new Event();
currentDraw = 0;
nextDraw = 0;
qHead = 0;
qSize = 0;
for(int i = 0; i < 16; i++)
{
triangleBatch[i] = nullptr;
primitiveBatch[i] = nullptr;
}
for(int draw = 0; draw < DRAW_COUNT; draw++)
{
drawCall[draw] = new DrawCall();
drawList[draw] = drawCall[draw];
}
for(int unit = 0; unit < 16; unit++)
{
primitiveProgress[unit].init();
}
for(int cluster = 0; cluster < 16; cluster++)
{
pixelProgress[cluster].init();
}
updateConfiguration(true);
} }
Renderer::~Renderer() Renderer::~Renderer()
{ {
sync.wait(); drawTickets.take().wait();
terminateThreads();
delete resumeApp;
resumeApp = nullptr;
for(int draw = 0; draw < DRAW_COUNT; draw++)
{
delete drawCall[draw];
drawCall[draw] = nullptr;
}
}
// This object has to be mem aligned
void* Renderer::operator new(size_t size)
{
ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
return sw::allocate(sizeof(Renderer), 16);
}
void Renderer::operator delete(void * mem)
{
sw::deallocate(mem);
} }
void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex, void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
...@@ -239,6 +165,9 @@ namespace sw ...@@ -239,6 +165,9 @@ namespace sw
{ {
if(count == 0) { return; } if(count == 0) { return; }
auto id = nextDrawID++;
YARN_SCOPED_EVENT("draw %d", id);
#ifndef NDEBUG #ifndef NDEBUG
{ {
unsigned int minPrimitives = 1; unsigned int minPrimitives = 1;
...@@ -250,8 +179,6 @@ namespace sw ...@@ -250,8 +179,6 @@ namespace sw
} }
#endif #endif
updateConfiguration();
int ms = context->sampleCount; int ms = context->sampleCount;
if(!context->multiSampleMask) if(!context->multiSampleMask)
...@@ -259,10 +186,16 @@ namespace sw ...@@ -259,10 +186,16 @@ namespace sw
return; return;
} }
sync.add(); yarn::Pool<sw::DrawCall>::Loan draw;
{
YARN_SCOPED_EVENT("drawCallPool.borrow()");
draw = drawCallPool.borrow();
}
draw->id = id;
if(update) if(update)
{ {
YARN_SCOPED_EVENT("update");
vertexState = VertexProcessor::update(context); vertexState = VertexProcessor::update(context);
setupState = SetupProcessor::update(context); setupState = SetupProcessor::update(context);
pixelState = PixelProcessor::update(context); pixelState = PixelProcessor::update(context);
...@@ -272,56 +205,29 @@ namespace sw ...@@ -272,56 +205,29 @@ namespace sw
pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets); pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
} }
int batch = batchSize / ms; DrawCall::SetupFunction setupPrimitives = nullptr;
int (Renderer::*setupPrimitives)(int batch, int count);
if(context->isDrawTriangle()) if(context->isDrawTriangle())
{ {
setupPrimitives = &Renderer::setupTriangles; setupPrimitives = &DrawCall::setupTriangles;
} }
else if(context->isDrawLine()) else if(context->isDrawLine())
{ {
setupPrimitives = &Renderer::setupLines; setupPrimitives = &DrawCall::setupLines;
} }
else // Point draw else // Point draw
{ {
setupPrimitives = &Renderer::setupPoints; setupPrimitives = &DrawCall::setupPoints;
} }
DrawCall *draw = nullptr;
do
{
for(int i = 0; i < DRAW_COUNT; i++)
{
if(drawCall[i]->references == -1)
{
draw = drawCall[i];
drawList[nextDraw & DRAW_COUNT_BITS] = draw;
break;
}
}
if(!draw)
{
resumeApp->wait();
}
}
while(!draw);
DrawData *data = draw->data; DrawData *data = draw->data;
if (occlusionQuery)
{
occlusionQuery->start();
}
draw->occlusionQuery = occlusionQuery; draw->occlusionQuery = occlusionQuery;
draw->batchDataPool = &batchDataPool;
draw->numPrimitives = count;
draw->numPrimitivesPerBatch = MaxBatchSize / ms;
draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
draw->topology = context->topology; draw->topology = context->topology;
draw->indexType = indexType; draw->indexType = indexType;
draw->batchSize = batch;
draw->vertexRoutine = vertexRoutine; draw->vertexRoutine = vertexRoutine;
draw->setupRoutine = setupRoutine; draw->setupRoutine = setupRoutine;
...@@ -335,14 +241,6 @@ namespace sw ...@@ -335,14 +241,6 @@ namespace sw
data->descriptorSets = context->descriptorSets; data->descriptorSets = context->descriptorSets;
data->descriptorDynamicOffsets = context->descriptorDynamicOffsets; data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
if(events)
{
events->start();
}
ASSERT(!draw->events);
draw->events = events;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++) for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
{ {
data->input[i] = context->input[i].buffer; data->input[i] = context->input[i].buffer;
...@@ -383,7 +281,7 @@ namespace sw ...@@ -383,7 +281,7 @@ namespace sw
if(pixelState.occlusionEnabled) if(pixelState.occlusionEnabled)
{ {
for(int cluster = 0; cluster < clusterCount; cluster++) for(int cluster = 0; cluster < MaxClusterCount; cluster++)
{ {
data->occlusion[cluster] = 0; data->occlusion[cluster] = 0;
} }
...@@ -461,357 +359,197 @@ namespace sw ...@@ -461,357 +359,197 @@ namespace sw
data->pushConstants = pushConstants; data->pushConstants = pushConstants;
} }
draw->primitive = 0; draw->events = events;
draw->count = count;
draw->references = (count + batch - 1) / batch;
schedulerMutex.lock();
++nextDraw; // Atomic
schedulerMutex.unlock();
#ifndef NDEBUG
if(threadCount == 1) // Use main thread for draw execution
{
threadsAwake = 1;
task[0].type = Task::RESUME;
taskLoop(0);
}
else
#endif
{
if(!threadsAwake)
{
suspend[0]->wait();
threadsAwake = 1;
task[0].type = Task::RESUME;
resume[0]->signal();
}
}
}
void Renderer::threadFunction(void *parameters)
{
Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;
CPUID::setFlushToZero(true);
CPUID::setDenormalsAreZero(true);
renderer->threadLoop(threadIndex); DrawCall::run(draw, &drawTickets, clusterQueues);
} }
void Renderer::threadLoop(int threadIndex) void DrawCall::setup()
{ {
while(!exitThreads) if(occlusionQuery != nullptr)
{ {
taskLoop(threadIndex); occlusionQuery->start();
suspend[threadIndex]->signal();
resume[threadIndex]->wait();
} }
}
void Renderer::taskLoop(int threadIndex) if(events)
{
while(task[threadIndex].type != Task::SUSPEND)
{ {
scheduleTask(threadIndex); events->start();
executeTask(threadIndex);
} }
} }
void Renderer::findAvailableTasks() void DrawCall::teardown()
{ {
// Find pixel tasks if(events)
for(int cluster = 0; cluster < clusterCount; cluster++)
{
if(!pixelProgress[cluster].executing)
{
for(int unit = 0; unit < unitCount; unit++)
{
if(primitiveProgress[unit].references > 0) // Contains processed primitives
{
if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
{
if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive) // Previous primitives have been rendered
{
Task &task = taskQueue[qHead];
task.type = Task::PIXELS;
task.primitiveUnit = unit;
task.pixelCluster = cluster;
pixelProgress[cluster].executing = true;
// Commit to the task queue
qHead = (qHead + 1) & TASK_COUNT_BITS;
++qSize; // Atomic
break;
}
}
}
}
}
}
// Find primitive tasks
if(currentDraw == nextDraw)
{ {
return; // No more primitives to process events->finish();
events = nullptr;
} }
for(int unit = 0; unit < unitCount; unit++) if (occlusionQuery != nullptr)
{ {
DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS]; for(int cluster = 0; cluster < MaxClusterCount; cluster++)
int primitive = draw->primitive;
int count = draw->count;
if(primitive >= count)
{
++currentDraw; // Atomic
if(currentDraw == nextDraw)
{
return; // No more primitives to process
}
draw = drawList[currentDraw & DRAW_COUNT_BITS];
}
if(!primitiveProgress[unit].references) // Task not already being executed and not still in use by a pixel unit
{ {
primitive = draw->primitive; occlusionQuery->add(data->occlusion[cluster]);
count = draw->count;
int batch = draw->batchSize;
primitiveProgress[unit].drawCall = currentDraw.load();
primitiveProgress[unit].firstPrimitive = primitive;
primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;
draw->primitive += batch;
Task &task = taskQueue[qHead];
task.type = Task::PRIMITIVES;
task.primitiveUnit = unit;
primitiveProgress[unit].references = -1;
// Commit to the task queue
qHead = (qHead + 1) & TASK_COUNT_BITS;
++qSize; // Atomic
} }
occlusionQuery->finish();
} }
vertexRoutine.reset();
setupRoutine.reset();
pixelRoutine.reset();
} }
void Renderer::scheduleTask(int threadIndex) void DrawCall::run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount])
{ {
schedulerMutex.lock(); draw->setup();
int curThreadsAwake = threadsAwake; auto const numPrimitives = draw->numPrimitives;
auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
auto const numBatches = draw->numBatches;
if((int)qSize < threadCount - curThreadsAwake + 1) auto ticket = tickets->take();
{ auto finally = yarn::make_shared_finally([draw, ticket] {
findAvailableTasks(); YARN_SCOPED_EVENT("FINISH draw %d", draw->id);
} draw->teardown();
ticket.done();
});
if(qSize != 0) for (unsigned int batchId = 0; batchId < numBatches; batchId++)
{ {
task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS]; auto batch = draw->batchDataPool->borrow();
--qSize; // Atomic batch->id = batchId;
batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
if(curThreadsAwake != threadCount) for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{ {
int wakeup = qSize - curThreadsAwake + 1; batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
for(int i = 0; i < threadCount && wakeup > 0; i++)
{
if(task[i].type == Task::SUSPEND)
{
suspend[i]->wait();
task[i].type = Task::RESUME;
resume[i]->signal();
++threadsAwake; // Atomic
wakeup--;
}
}
} }
}
else
{
task[threadIndex].type = Task::SUSPEND;
--threadsAwake; // Atomic yarn::schedule([draw, batch, finally] {
}
schedulerMutex.unlock(); processVertices(draw.get(), batch.get());
}
void Renderer::executeTask(int threadIndex) if (!draw->setupState.rasterizerDiscard)
{
switch(task[threadIndex].type.load())
{
case Task::PRIMITIVES:
{
int unit = task[threadIndex].primitiveUnit;
int input = primitiveProgress[unit].firstPrimitive;
int count = primitiveProgress[unit].primitiveCount;
DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;
processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
int visible = 0;
if(!draw->setupState.rasterizerDiscard)
{ {
visible = (this->*setupPrimitives)(unit, count); processPrimitives(draw.get(), batch.get());
}
primitiveProgress[unit].visible = visible; if (batch->numVisible > 0)
primitiveProgress[unit].references = clusterCount.load(); {
} processPixels(draw, batch, finally);
break; return;
case Task::PIXELS: }
{ }
int unit = task[threadIndex].primitiveUnit;
int visible = primitiveProgress[unit].visible;
if(visible > 0) for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{ {
int cluster = task[threadIndex].pixelCluster; batch->clusterTickets[cluster].done();
Primitive *primitive = primitiveBatch[unit];
DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
DrawData *data = draw->data;
PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
pixelRoutine(primitive, visible, cluster, clusterCount, data);
} }
});
finishRendering(task[threadIndex]);
}
break;
case Task::RESUME:
break;
case Task::SUSPEND:
break;
default:
ASSERT(false);
} }
} }
void Renderer::synchronize() void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
{ {
sync.wait(); YARN_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
device->updateSamplingRoutineConstCache();
}
void Renderer::finishRendering(Task &pixelTask)
{
int unit = pixelTask.primitiveUnit;
int cluster = pixelTask.pixelCluster;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
DrawData &data = *draw.data;
int primitive = primitiveProgress[unit].firstPrimitive;
int count = primitiveProgress[unit].primitiveCount;
int processedPrimitives = primitive + count;
pixelProgress[cluster].processedPrimitives = processedPrimitives; unsigned int triangleIndices[MaxBatchSize + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
if(pixelProgress[cluster].processedPrimitives >= draw.count)
{ {
++pixelProgress[cluster].drawCall; // Atomic YARN_SCOPED_EVENT("processPrimitiveVertices");
pixelProgress[cluster].processedPrimitives = 0; processPrimitiveVertices(
triangleIndices,
draw->data->indices,
draw->indexType,
batch->firstPrimitive,
batch->numPrimitives,
draw->topology);
} }
int ref = --primitiveProgress[unit].references; // Atomic auto& vertexTask = batch->vertexTask;
vertexTask.primitiveStart = batch->firstPrimitive;
if(ref == 0) vertexTask.vertexCount = batch->numPrimitives * 3;
if (vertexTask.vertexCache.drawCall != draw->id)
{ {
ref = --draw.references; // Atomic vertexTask.vertexCache.clear();
vertexTask.vertexCache.drawCall = draw->id;
if(ref == 0)
{
if (draw.occlusionQuery)
{
for(int cluster = 0; cluster < clusterCount; cluster++)
{
draw.occlusionQuery->add(data.occlusion[cluster]);
}
draw.occlusionQuery->finish();
}
draw.vertexRoutine.reset();
draw.setupRoutine.reset();
draw.pixelRoutine.reset();
if(draw.events)
{
draw.events->finish();
draw.events = nullptr;
}
sync.done();
draw.references = -1;
resumeApp->signal();
}
} }
pixelProgress[cluster].executing = false; draw->vertexPointer(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
} }
void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread) void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
{ {
Triangle *triangle = triangleBatch[unit]; YARN_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
int primitiveDrawCall = primitiveProgress[unit].drawCall; auto triangles = &batch->triangles[0];
DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS]; auto primitives = &batch->primitives[0];
DrawData *data = draw->data; batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
VertexTask *task = vertexTask[thread]; }
const void *indices = data->indices;
VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
if(task->vertexCache.drawCall != primitiveDrawCall) void DrawCall::processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
{ {
task->vertexCache.clear(); struct Data
task->vertexCache.drawCall = primitiveDrawCall; {
Data(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
: draw(draw), batch(batch), finally(finally) {}
yarn::Loan<DrawCall> draw;
yarn::Loan<BatchData> batch;
std::shared_ptr<yarn::Finally> finally;
};
auto data = std::make_shared<Data>(draw, batch, finally);
for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
batch->clusterTickets[cluster].onCall([data, cluster]
{
auto& draw = data->draw;
auto& batch = data->batch;
YARN_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
draw->pixelPointer(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
batch->clusterTickets[cluster].done();
});
} }
}
unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size. void Renderer::synchronize()
VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology)); {
YARN_SCOPED_EVENT("synchronize");
auto ticket = drawTickets.take();
ticket.wait();
device->updateSamplingRoutineConstCache();
ticket.done();
}
if(!indices) void DrawCall::processPrimitiveVertices(
unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
const void *primitiveIndices,
VkIndexType indexType,
unsigned int start,
unsigned int triangleCount,
VkPrimitiveTopology topology)
{
if(!primitiveIndices)
{ {
struct LinearIndex struct LinearIndex
{ {
unsigned int operator[](unsigned int i) { return i; } unsigned int operator[](unsigned int i) { return i; }
}; };
if(!setBatchIndices(batch, topology, LinearIndex(), start, triangleCount)) if(!setBatchIndices(triangleIndicesOut, topology, LinearIndex(), start, triangleCount))
{ {
return; return;
} }
} }
else else
{ {
switch(draw->indexType.load()) switch(indexType)
{ {
case VK_INDEX_TYPE_UINT16: case VK_INDEX_TYPE_UINT16:
if(!setBatchIndices(batch, topology, static_cast<const uint16_t*>(indices), start, triangleCount)) if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
{ {
return; return;
} }
break; break;
case VK_INDEX_TYPE_UINT32: case VK_INDEX_TYPE_UINT32:
if(!setBatchIndices(batch, topology, static_cast<const uint32_t*>(indices), start, triangleCount)) if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
{ {
return; return;
} }
...@@ -824,33 +562,25 @@ namespace sw ...@@ -824,33 +562,25 @@ namespace sw
} }
// Repeat the last index to allow for SIMD width overrun. // Repeat the last index to allow for SIMD width overrun.
batch[triangleCount][0] = batch[triangleCount - 1][2]; triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
batch[triangleCount][1] = batch[triangleCount - 1][2]; triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
batch[triangleCount][2] = batch[triangleCount - 1][2]; triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
task->primitiveStart = start;
task->vertexCount = triangleCount * 3;
vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
} }
int Renderer::setupTriangles(int unit, int count) int DrawCall::setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{ {
Triangle *triangle = triangleBatch[unit]; auto &state = drawCall->setupState;
Primitive *primitive = primitiveBatch[unit]; auto setupRoutine = drawCall->setupPointer;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
int ms = state.multiSample; int ms = state.multiSample;
const DrawData *data = draw.data; const DrawData *data = drawCall->data;
int visible = 0; int visible = 0;
for(int i = 0; i < count; i++, triangle++) for(int i = 0; i < count; i++, triangles++)
{ {
Vertex &v0 = triangle->v0; Vertex &v0 = triangles->v0;
Vertex &v1 = triangle->v1; Vertex &v1 = triangles->v1;
Vertex &v2 = triangle->v2; Vertex &v2 = triangles->v2;
if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE) if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
{ {
...@@ -860,15 +590,15 @@ namespace sw ...@@ -860,15 +590,15 @@ namespace sw
if(clipFlagsOr != Clipper::CLIP_FINITE) if(clipFlagsOr != Clipper::CLIP_FINITE)
{ {
if(!Clipper::Clip(polygon, clipFlagsOr, draw)) if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
{ {
continue; continue;
} }
} }
if(setupRoutine(primitive, triangle, &polygon, data)) if(setupRoutine(primitives, triangles, &polygon, data))
{ {
primitive += ms; primitives += ms;
visible++; visible++;
} }
} }
...@@ -877,57 +607,49 @@ namespace sw ...@@ -877,57 +607,49 @@ namespace sw
return visible; return visible;
} }
int Renderer::setupLines(int unit, int count) int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{ {
Triangle *triangle = triangleBatch[unit]; auto &state = drawCall->setupState;
Primitive *primitive = primitiveBatch[unit];
int visible = 0;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
int visible = 0;
int ms = state.multiSample; int ms = state.multiSample;
for(int i = 0; i < count; i++) for(int i = 0; i < count; i++)
{ {
if(setupLine(*primitive, *triangle, draw)) if(setupLine(*primitives, *triangles, *drawCall))
{ {
primitive += ms; primitives += ms;
visible++; visible++;
} }
triangle++; triangles++;
} }
return visible; return visible;
} }
int Renderer::setupPoints(int unit, int count) int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{ {
Triangle *triangle = triangleBatch[unit]; auto &state = drawCall->setupState;
Primitive *primitive = primitiveBatch[unit];
int visible = 0;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
int visible = 0;
int ms = state.multiSample; int ms = state.multiSample;
for(int i = 0; i < count; i++) for(int i = 0; i < count; i++)
{ {
if(setupPoint(*primitive, *triangle, draw)) if(setupPoint(*primitives, *triangles, *drawCall))
{ {
primitive += ms; primitives += ms;
visible++; visible++;
} }
triangle++; triangles++;
} }
return visible; return visible;
} }
bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw) bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{ {
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer; const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data; const DrawData &data = *draw.data;
...@@ -1120,7 +842,7 @@ namespace sw ...@@ -1120,7 +842,7 @@ namespace sw
return false; return false;
} }
bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw) bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{ {
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer; const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data; const DrawData &data = *draw.data;
...@@ -1183,76 +905,6 @@ namespace sw ...@@ -1183,76 +905,6 @@ namespace sw
return false; return false;
} }
void Renderer::initializeThreads()
{
unitCount = ceilPow2(threadCount);
clusterCount = ceilPow2(threadCount);
for(int i = 0; i < unitCount; i++)
{
triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
}
for(int i = 0; i < threadCount; i++)
{
vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
vertexTask[i]->vertexCache.drawCall = -1;
task[i].type = Task::SUSPEND;
resume[i] = new Event();
suspend[i] = new Event();
Parameters parameters;
parameters.threadIndex = i;
parameters.renderer = this;
exitThreads = false;
worker[i] = new std::thread(threadFunction, &parameters);
suspend[i]->wait();
suspend[i]->signal();
}
}
void Renderer::terminateThreads()
{
while(threadsAwake != 0)
{
std::this_thread::yield();
}
for(int thread = 0; thread < threadCount; thread++)
{
if(worker[thread])
{
exitThreads = true;
resume[thread]->signal();
worker[thread]->join();
delete worker[thread];
worker[thread] = 0;
delete resume[thread];
resume[thread] = 0;
delete suspend[thread];
suspend[thread] = 0;
}
deallocate(vertexTask[thread]);
vertexTask[thread] = 0;
}
for(int i = 0; i < 16; i++)
{
deallocate(triangleBatch[i]);
triangleBatch[i] = 0;
deallocate(primitiveBatch[i]);
primitiveBatch[i] = 0;
}
}
void Renderer::addQuery(vk::Query *query) void Renderer::addQuery(vk::Query *query)
{ {
ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION); ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
...@@ -1292,28 +944,4 @@ namespace sw ...@@ -1292,28 +944,4 @@ namespace sw
this->scissor = scissor; this->scissor = scissor;
} }
void Renderer::updateConfiguration(bool initialUpdate)
{
if(initialUpdate)
{
terminateThreads();
VertexProcessor::setRoutineCacheSize(1024);
PixelProcessor::setRoutineCacheSize(1024);
SetupProcessor::setRoutineCacheSize(1024);
threadCount = CPUID::processAffinity();
CPUID::setEnableSSE4_1(true);
CPUID::setEnableSSSE3(true);
CPUID::setEnableSSE3(true);
CPUID::setEnableSSE2(true);
CPUID::setEnableSSE(true);
}
if(!initialUpdate && !worker[0])
{
initializeThreads();
}
}
} }
...@@ -19,11 +19,15 @@ ...@@ -19,11 +19,15 @@
#include "PixelProcessor.hpp" #include "PixelProcessor.hpp"
#include "SetupProcessor.hpp" #include "SetupProcessor.hpp"
#include "Plane.hpp" #include "Plane.hpp"
#include "Primitive.hpp"
#include "Blitter.hpp" #include "Blitter.hpp"
#include "Device/Config.hpp" #include "Device/Config.hpp"
#include "System/Synchronization.hpp"
#include "Vulkan/VkDescriptorSet.hpp" #include "Vulkan/VkDescriptorSet.hpp"
#include "Yarn/Pool.hpp"
#include "Yarn/Finally.hpp"
#include "Yarn/Ticket.hpp"
#include <atomic> #include <atomic>
#include <list> #include <list>
#include <mutex> #include <mutex>
...@@ -46,6 +50,14 @@ namespace sw ...@@ -46,6 +50,14 @@ namespace sw
class Resource; class Resource;
struct Constants; struct Constants;
static constexpr int MaxBatchSize = 128;
static constexpr int MaxBatchCount = 16;
static constexpr int MaxClusterCount = 16;
static constexpr int MaxDrawCount = 16;
using TriangleBatch = std::array<Triangle, MaxBatchSize>;
using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
struct DrawData struct DrawData
{ {
const Constants *constants; const Constants *constants;
...@@ -64,7 +76,7 @@ namespace sw ...@@ -64,7 +76,7 @@ namespace sw
PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise
PixelProcessor::Factor factor; PixelProcessor::Factor factor;
unsigned int occlusion[16]; // Number of pixels passing depth test unsigned int occlusion[MaxClusterCount]; // Number of pixels passing depth test
float4 Wx16; float4 Wx16;
float4 Hx16; float4 Hx16;
...@@ -100,71 +112,88 @@ namespace sw ...@@ -100,71 +112,88 @@ namespace sw
PushConstantStorage pushConstants; PushConstantStorage pushConstants;
}; };
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor struct DrawCall
{ {
struct Task struct BatchData
{ {
enum Type using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>;
{
PRIMITIVES, TriangleBatch triangles;
PIXELS, PrimitiveBatch primitives;
VertexTask vertexTask;
RESUME, unsigned int id;
SUSPEND unsigned int firstPrimitive;
}; unsigned int numPrimitives;
int numVisible;
void operator=(const Task& task) yarn::Ticket clusterTickets[MaxClusterCount];
{
type = task.type.load();
primitiveUnit = task.primitiveUnit.load();
pixelCluster = task.pixelCluster.load();
}
std::atomic<int> type;
std::atomic<int> primitiveUnit;
std::atomic<int> pixelCluster;
}; };
struct PrimitiveProgress using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>;
{ using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
void init()
{
drawCall = 0;
firstPrimitive = 0;
primitiveCount = 0;
visible = 0;
references = 0;
}
std::atomic<int> drawCall;
std::atomic<int> firstPrimitive;
std::atomic<int> primitiveCount;
std::atomic<int> visible;
std::atomic<int> references;
};
struct PixelProgress DrawCall();
{ ~DrawCall();
void init()
{
drawCall = 0;
processedPrimitives = 0;
executing = false;
}
std::atomic<int> drawCall;
std::atomic<int> processedPrimitives;
std::atomic<int> executing;
};
// Static pipeline entry points. Each receives the draw call explicitly instead of
// relying on `this`. run() also takes a ticket queue and an array of MaxClusterCount
// per-cluster ticket queues.
static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]);
// The vertex and primitive stages take raw pointers to the draw call and one batch.
static void processVertices(DrawCall* draw, BatchData* batch);
static void processPrimitives(DrawCall* draw, BatchData* batch);
// The pixel stage takes yarn::Loan handles and a shared yarn::Finally.
// NOTE(review): presumably the Finally runs once all pixel work for the batch is done - confirm.
static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally);
// NOTE(review): presumably acquire / release the resources used by this draw - confirm in the .cpp.
void setup();
void teardown();
// NOTE(review): presumably a per-draw identifier - confirm where it is assigned.
int id;
// Pool that BatchData objects are taken from (not owned through this raw pointer, as far
// as this declaration shows).
BatchData::Pool *batchDataPool;
// Primitive and batch counts for this draw.
unsigned int numPrimitives;
unsigned int numPrimitivesPerBatch;
unsigned int numBatches;
VkPrimitiveTopology topology;
VkIndexType indexType;
// Routines held through shared_ptr, followed by one routine pointer per stage.
// NOTE(review): the *Pointer members presumably cache the entry points of the matching
// routines - confirm.
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
// NOTE(review): presumably set to one of setupTriangles / setupLines / setupPoints
// declared below - confirm against the SetupFunction typedef.
SetupFunction setupPrimitives;
SetupProcessor::State setupState;
// Attachments referenced by this draw: RENDERTARGETS colour views plus depth and stencil.
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query* occlusionQuery;
DrawData *data;
// Writes per-triangle vertex indices into triangleIndicesOut (MaxBatchSize + 1 rows of 3),
// reading from primitiveIndices for the range given by start / triangleCount.
// NOTE(review): the reason for the extra (+ 1) row is not visible here - confirm.
static void processPrimitiveVertices(
unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
const void *primitiveIndices,
VkIndexType indexType,
unsigned int start,
unsigned int triangleCount,
VkPrimitiveTopology topology);
// Per-topology setup helpers sharing one signature: they take the triangle and primitive
// arrays, the draw call and a count, and return an int.
// NOTE(review): the return value is presumably the number of primitives kept - confirm.
static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
// Single-primitive helpers for lines and points; the bool result presumably reports
// whether the primitive should be drawn - confirm.
static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
};
// Renderer derives publicly from the vertex, pixel and setup processors.
//
// NOTE(review): this text is a side-by-side diff capture, not compilable C++.
//  - A declaration that appears twice on one line is the same line in both columns.
//  - A line holding only thread / task-queue members (threadFunction ... vertexTask)
//    appears to exist only in the old column.
//  - Members using yarn or pool types (drawCallPool, batchDataPool, nextDrawID,
//    drawTickets, clusterQueues) appear to exist only in the new column.
//  - Lines starting with "...@@" are diff hunk headers; declarations next to them
//    (e.g. draw(...)) are truncated.
// Confirm all of the above against the real header before relying on it.
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
{
public: public:
Renderer(vk::Device* device); Renderer(vk::Device* device);
virtual ~Renderer(); virtual ~Renderer();
// Class-specific allocation functions, present only once on their lines.
void *operator new(size_t size);
void operator delete(void * mem);
// True when an occlusion query pointer is currently set.
bool hasOcclusionQuery() const { return occlusionQuery != nullptr; } bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
// The parameter list of draw() is cut off by the hunk header below.
void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex, void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
...@@ -182,74 +211,18 @@ namespace sw ...@@ -182,74 +211,18 @@ namespace sw
void synchronize(); void synchronize();
// Returns the static clusterCount member declared further down.
static int getClusterCount() { return clusterCount; }
private: private:
// Thread and task scheduling entry points.
static void threadFunction(void *parameters);
void threadLoop(int threadIndex);
void taskLoop(int threadIndex);
void findAvailableTasks();
void scheduleTask(int threadIndex);
void executeTask(int threadIndex);
void finishRendering(Task &pixelTask);
// Member-function versions of the primitive setup helpers, indexed by unit / batch number.
void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
int setupTriangles(int batch, int count);
int setupLines(int batch, int count);
int setupPoints(int batch, int count);
bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
void updateConfiguration(bool initialUpdate = false);
void initializeThreads();
void terminateThreads();
VkViewport viewport; VkViewport viewport;
VkRect2D scissor; VkRect2D scissor;
// On the next lines the first declaration uses fixed 16-entry arrays and the second
// uses pool / atomic types.
Triangle *triangleBatch[16]; DrawCall::Pool drawCallPool;
Primitive *primitiveBatch[16]; DrawCall::BatchData::Pool batchDataPool;
std::atomic<int> exitThreads;
std::atomic<int> threadsAwake;
std::thread *worker[16];
Event *resume[16]; // Events for resuming threads
Event *suspend[16]; // Events for suspending threads
Event *resumeApp; // Event for resuming the application thread
// nextDrawID starts at 0.
PrimitiveProgress primitiveProgress[16]; std::atomic<int> nextDrawID = {0};
PixelProgress pixelProgress[16];
Task task[16]; // Current tasks for threads
// DRAW_COUNT_BITS is DRAW_COUNT - 1, i.e. a wrap-around mask that is valid because
// DRAW_COUNT is a power of two.
enum {
DRAW_COUNT = 16, // Number of draw calls buffered (must be power of 2)
DRAW_COUNT_BITS = DRAW_COUNT - 1,
};
DrawCall *drawCall[DRAW_COUNT];
DrawCall *drawList[DRAW_COUNT];
std::atomic<int> currentDraw;
std::atomic<int> nextDraw;
// Same power-of-two / mask pairing as above, for the task queue.
enum {
TASK_COUNT = 32, // Size of the task queue (must be power of 2)
TASK_COUNT_BITS = TASK_COUNT - 1,
};
Task taskQueue[TASK_COUNT];
std::atomic<int> qHead;
std::atomic<int> qSize;
static std::atomic<int> unitCount;
static std::atomic<int> clusterCount;
std::mutex schedulerMutex;
VertexTask *vertexTask[16];
vk::Query *occlusionQuery; vk::Query *occlusionQuery;
// One ticket queue for draws, plus one ticket queue per cluster (MaxClusterCount entries).
WaitGroup sync; yarn::Ticket::Queue drawTickets;
yarn::Ticket::Queue clusterQueues[MaxClusterCount];
VertexProcessor::State vertexState; VertexProcessor::State vertexState;
SetupProcessor::State setupState; SetupProcessor::State setupState;
...@@ -262,40 +235,6 @@ namespace sw ...@@ -262,40 +235,6 @@ namespace sw
vk::Device* device; vk::Device* device;
}; };
// DrawCall declaration whose topology, index type, batch size and progress counters are
// all stored as std::atomic<int>.
// NOTE(review): another set of DrawCall members appears earlier in this text using
// Vulkan enum types and a SetupFunction; this declaration appears to be the version the
// diff removes - confirm against the real header.
struct DrawCall
{
DrawCall();
~DrawCall();
// Topology and index type are held as atomic ints here, not as their Vulkan enum types.
std::atomic<int> topology;
std::atomic<int> indexType;
std::atomic<int> batchSize;
// Routines held through shared_ptr, followed by one routine pointer per stage.
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
// Pointer to a Renderer member function taking (batch, count) and returning int. This is
// the same signature as the Renderer::setupTriangles / setupLines / setupPoints members.
int (Renderer::*setupPrimitives)(int batch, int count);
SetupProcessor::State setupState;
// Attachments referenced by this draw: RENDERTARGETS colour views plus depth and stencil.
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query *occlusionQuery;
std::atomic<int> primitive; // Current primitive to enter pipeline
std::atomic<int> count; // Number of primitives to render
std::atomic<int> references; // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
DrawData *data;
};
} }
#endif // sw_Renderer_hpp #endif // sw_Renderer_hpp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment