Commit cde4dd96 by Ben Clayton

Device: Migrate Renderer to Yarn

Drop the complex task scheduling logic in favor of yarn. Performance gains of up to ~30% FPS have been seen.

Bug: b/139142453
Change-Id: I264fee36323425a791088565d99dc586670a948a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
parent b5f0a4be
......@@ -19,7 +19,6 @@
#include "Polygon.hpp"
#include "Reactor/Reactor.hpp"
#include "Pipeline/Constants.hpp"
#include "System/CPUID.hpp"
#include "System/Memory.hpp"
#include "System/Half.hpp"
#include "System/Math.hpp"
......@@ -33,6 +32,10 @@
#include "Pipeline/SpirvShader.hpp"
#include "Vertex.hpp"
#include "Yarn/Containers.hpp"
#include "Yarn/Defer.hpp"
#include "Yarn/Trace.hpp"
#undef max
#ifndef NDEBUG
......@@ -42,11 +45,6 @@ unsigned int maxPrimitives = 1 << 21;
namespace sw
{
static const int batchSize = 128;
std::atomic<int> threadCount(1);
std::atomic<int> Renderer::unitCount(1);
std::atomic<int> Renderer::clusterCount(1);
template<typename T>
inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, T indices, unsigned int start, unsigned int triangleCount)
{
......@@ -138,20 +136,8 @@ namespace sw
return true;
}
struct Parameters
{
Renderer *renderer;
int threadIndex;
};
// Constructs a DrawCall with no occlusion query or task events attached,
// marks it as a free slot, and allocates its DrawData.
DrawCall::DrawCall()
{
occlusionQuery = nullptr;
// -1 is the value Renderer::draw() tests for when scanning drawCall[] for a slot it can reuse.
references = -1;
events = nullptr;
// DrawData is obtained from allocate() rather than stored inline in the DrawCall.
data = (DrawData*)allocate(sizeof(DrawData));
data->constants = &constants;
}
......@@ -163,74 +149,14 @@ namespace sw
Renderer::Renderer(vk::Device* device) : device(device)
{
for(int i = 0; i < 16; i++)
{
vertexTask[i] = nullptr;
worker[i] = nullptr;
resume[i] = nullptr;
suspend[i] = nullptr;
}
threadsAwake = 0;
resumeApp = new Event();
currentDraw = 0;
nextDraw = 0;
qHead = 0;
qSize = 0;
for(int i = 0; i < 16; i++)
{
triangleBatch[i] = nullptr;
primitiveBatch[i] = nullptr;
}
for(int draw = 0; draw < DRAW_COUNT; draw++)
{
drawCall[draw] = new DrawCall();
drawList[draw] = drawCall[draw];
}
for(int unit = 0; unit < 16; unit++)
{
primitiveProgress[unit].init();
}
for(int cluster = 0; cluster < 16; cluster++)
{
pixelProgress[cluster].init();
}
updateConfiguration(true);
VertexProcessor::setRoutineCacheSize(1024);
PixelProcessor::setRoutineCacheSize(1024);
SetupProcessor::setRoutineCacheSize(1024);
}
Renderer::~Renderer()
{
sync.wait();
terminateThreads();
delete resumeApp;
resumeApp = nullptr;
for(int draw = 0; draw < DRAW_COUNT; draw++)
{
delete drawCall[draw];
drawCall[draw] = nullptr;
}
}
// Renderer objects have to be memory-aligned, so they are allocated through
// sw::allocate(), passing 16 as its second argument (presumably the alignment
// in bytes — confirm against sw::allocate).
void* Renderer::operator new(size_t size)
{
// The allocation below always uses sizeof(Renderer), so a larger derived type would be under-allocated.
ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
return sw::allocate(sizeof(Renderer), 16);
}
void Renderer::operator delete(void * mem)
{
sw::deallocate(mem);
drawTickets.take().wait();
}
void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
......@@ -239,6 +165,9 @@ namespace sw
{
if(count == 0) { return; }
auto id = nextDrawID++;
YARN_SCOPED_EVENT("draw %d", id);
#ifndef NDEBUG
{
unsigned int minPrimitives = 1;
......@@ -250,8 +179,6 @@ namespace sw
}
#endif
updateConfiguration();
int ms = context->sampleCount;
if(!context->multiSampleMask)
......@@ -259,10 +186,16 @@ namespace sw
return;
}
sync.add();
yarn::Pool<sw::DrawCall>::Loan draw;
{
YARN_SCOPED_EVENT("drawCallPool.borrow()");
draw = drawCallPool.borrow();
}
draw->id = id;
if(update)
{
YARN_SCOPED_EVENT("update");
vertexState = VertexProcessor::update(context);
setupState = SetupProcessor::update(context);
pixelState = PixelProcessor::update(context);
......@@ -272,56 +205,29 @@ namespace sw
pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
}
int batch = batchSize / ms;
int (Renderer::*setupPrimitives)(int batch, int count);
DrawCall::SetupFunction setupPrimitives = nullptr;
if(context->isDrawTriangle())
{
setupPrimitives = &Renderer::setupTriangles;
setupPrimitives = &DrawCall::setupTriangles;
}
else if(context->isDrawLine())
{
setupPrimitives = &Renderer::setupLines;
setupPrimitives = &DrawCall::setupLines;
}
else // Point draw
{
setupPrimitives = &Renderer::setupPoints;
setupPrimitives = &DrawCall::setupPoints;
}
DrawCall *draw = nullptr;
do
{
for(int i = 0; i < DRAW_COUNT; i++)
{
if(drawCall[i]->references == -1)
{
draw = drawCall[i];
drawList[nextDraw & DRAW_COUNT_BITS] = draw;
break;
}
}
if(!draw)
{
resumeApp->wait();
}
}
while(!draw);
DrawData *data = draw->data;
if (occlusionQuery)
{
occlusionQuery->start();
}
draw->occlusionQuery = occlusionQuery;
draw->batchDataPool = &batchDataPool;
draw->numPrimitives = count;
draw->numPrimitivesPerBatch = MaxBatchSize / ms;
draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
draw->topology = context->topology;
draw->indexType = indexType;
draw->batchSize = batch;
draw->vertexRoutine = vertexRoutine;
draw->setupRoutine = setupRoutine;
......@@ -335,14 +241,6 @@ namespace sw
data->descriptorSets = context->descriptorSets;
data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
if(events)
{
events->start();
}
ASSERT(!draw->events);
draw->events = events;
for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
{
data->input[i] = context->input[i].buffer;
......@@ -383,7 +281,7 @@ namespace sw
if(pixelState.occlusionEnabled)
{
for(int cluster = 0; cluster < clusterCount; cluster++)
for(int cluster = 0; cluster < MaxClusterCount; cluster++)
{
data->occlusion[cluster] = 0;
}
......@@ -461,357 +359,197 @@ namespace sw
data->pushConstants = pushConstants;
}
draw->primitive = 0;
draw->count = count;
draw->references = (count + batch - 1) / batch;
schedulerMutex.lock();
++nextDraw; // Atomic
schedulerMutex.unlock();
#ifndef NDEBUG
if(threadCount == 1) // Use main thread for draw execution
{
threadsAwake = 1;
task[0].type = Task::RESUME;
taskLoop(0);
}
else
#endif
{
if(!threadsAwake)
{
suspend[0]->wait();
threadsAwake = 1;
task[0].type = Task::RESUME;
resume[0]->signal();
}
}
}
void Renderer::threadFunction(void *parameters)
{
Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;
CPUID::setFlushToZero(true);
CPUID::setDenormalsAreZero(true);
draw->events = events;
renderer->threadLoop(threadIndex);
DrawCall::run(draw, &drawTickets, clusterQueues);
}
void Renderer::threadLoop(int threadIndex)
void DrawCall::setup()
{
while(!exitThreads)
if(occlusionQuery != nullptr)
{
taskLoop(threadIndex);
suspend[threadIndex]->signal();
resume[threadIndex]->wait();
occlusionQuery->start();
}
}
void Renderer::taskLoop(int threadIndex)
{
while(task[threadIndex].type != Task::SUSPEND)
if(events)
{
scheduleTask(threadIndex);
executeTask(threadIndex);
events->start();
}
}
void Renderer::findAvailableTasks()
void DrawCall::teardown()
{
// Find pixel tasks
for(int cluster = 0; cluster < clusterCount; cluster++)
{
if(!pixelProgress[cluster].executing)
{
for(int unit = 0; unit < unitCount; unit++)
{
if(primitiveProgress[unit].references > 0) // Contains processed primitives
{
if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
{
if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive) // Previous primitives have been rendered
{
Task &task = taskQueue[qHead];
task.type = Task::PIXELS;
task.primitiveUnit = unit;
task.pixelCluster = cluster;
pixelProgress[cluster].executing = true;
// Commit to the task queue
qHead = (qHead + 1) & TASK_COUNT_BITS;
++qSize; // Atomic
break;
}
}
}
}
}
}
// Find primitive tasks
if(currentDraw == nextDraw)
if(events)
{
return; // No more primitives to process
events->finish();
events = nullptr;
}
for(int unit = 0; unit < unitCount; unit++)
if (occlusionQuery != nullptr)
{
DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];
int primitive = draw->primitive;
int count = draw->count;
if(primitive >= count)
{
++currentDraw; // Atomic
if(currentDraw == nextDraw)
{
return; // No more primitives to process
}
draw = drawList[currentDraw & DRAW_COUNT_BITS];
}
if(!primitiveProgress[unit].references) // Task not already being executed and not still in use by a pixel unit
for(int cluster = 0; cluster < MaxClusterCount; cluster++)
{
primitive = draw->primitive;
count = draw->count;
int batch = draw->batchSize;
primitiveProgress[unit].drawCall = currentDraw.load();
primitiveProgress[unit].firstPrimitive = primitive;
primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;
draw->primitive += batch;
Task &task = taskQueue[qHead];
task.type = Task::PRIMITIVES;
task.primitiveUnit = unit;
primitiveProgress[unit].references = -1;
// Commit to the task queue
qHead = (qHead + 1) & TASK_COUNT_BITS;
++qSize; // Atomic
occlusionQuery->add(data->occlusion[cluster]);
}
occlusionQuery->finish();
}
vertexRoutine.reset();
setupRoutine.reset();
pixelRoutine.reset();
}
void Renderer::scheduleTask(int threadIndex)
void DrawCall::run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount])
{
schedulerMutex.lock();
draw->setup();
int curThreadsAwake = threadsAwake;
auto const numPrimitives = draw->numPrimitives;
auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
auto const numBatches = draw->numBatches;
if((int)qSize < threadCount - curThreadsAwake + 1)
{
findAvailableTasks();
}
auto ticket = tickets->take();
auto finally = yarn::make_shared_finally([draw, ticket] {
YARN_SCOPED_EVENT("FINISH draw %d", draw->id);
draw->teardown();
ticket.done();
});
if(qSize != 0)
for (unsigned int batchId = 0; batchId < numBatches; batchId++)
{
task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS];
--qSize; // Atomic
auto batch = draw->batchDataPool->borrow();
batch->id = batchId;
batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
if(curThreadsAwake != threadCount)
for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
int wakeup = qSize - curThreadsAwake + 1;
for(int i = 0; i < threadCount && wakeup > 0; i++)
{
if(task[i].type == Task::SUSPEND)
{
suspend[i]->wait();
task[i].type = Task::RESUME;
resume[i]->signal();
++threadsAwake; // Atomic
wakeup--;
}
}
batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
}
}
else
{
task[threadIndex].type = Task::SUSPEND;
--threadsAwake; // Atomic
}
yarn::schedule([draw, batch, finally] {
schedulerMutex.unlock();
}
processVertices(draw.get(), batch.get());
void Renderer::executeTask(int threadIndex)
{
switch(task[threadIndex].type.load())
{
case Task::PRIMITIVES:
{
int unit = task[threadIndex].primitiveUnit;
int input = primitiveProgress[unit].firstPrimitive;
int count = primitiveProgress[unit].primitiveCount;
DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;
processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
int visible = 0;
if(!draw->setupState.rasterizerDiscard)
if (!draw->setupState.rasterizerDiscard)
{
visible = (this->*setupPrimitives)(unit, count);
}
processPrimitives(draw.get(), batch.get());
primitiveProgress[unit].visible = visible;
primitiveProgress[unit].references = clusterCount.load();
}
break;
case Task::PIXELS:
{
int unit = task[threadIndex].primitiveUnit;
int visible = primitiveProgress[unit].visible;
if (batch->numVisible > 0)
{
processPixels(draw, batch, finally);
return;
}
}
if(visible > 0)
for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
int cluster = task[threadIndex].pixelCluster;
Primitive *primitive = primitiveBatch[unit];
DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
DrawData *data = draw->data;
PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
pixelRoutine(primitive, visible, cluster, clusterCount, data);
batch->clusterTickets[cluster].done();
}
finishRendering(task[threadIndex]);
}
break;
case Task::RESUME:
break;
case Task::SUSPEND:
break;
default:
ASSERT(false);
});
}
}
void Renderer::synchronize()
void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
{
sync.wait();
device->updateSamplingRoutineConstCache();
}
void Renderer::finishRendering(Task &pixelTask)
{
int unit = pixelTask.primitiveUnit;
int cluster = pixelTask.pixelCluster;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
DrawData &data = *draw.data;
int primitive = primitiveProgress[unit].firstPrimitive;
int count = primitiveProgress[unit].primitiveCount;
int processedPrimitives = primitive + count;
YARN_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
pixelProgress[cluster].processedPrimitives = processedPrimitives;
if(pixelProgress[cluster].processedPrimitives >= draw.count)
unsigned int triangleIndices[MaxBatchSize + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
{
++pixelProgress[cluster].drawCall; // Atomic
pixelProgress[cluster].processedPrimitives = 0;
YARN_SCOPED_EVENT("processPrimitiveVertices");
processPrimitiveVertices(
triangleIndices,
draw->data->indices,
draw->indexType,
batch->firstPrimitive,
batch->numPrimitives,
draw->topology);
}
int ref = --primitiveProgress[unit].references; // Atomic
if(ref == 0)
auto& vertexTask = batch->vertexTask;
vertexTask.primitiveStart = batch->firstPrimitive;
vertexTask.vertexCount = batch->numPrimitives * 3;
if (vertexTask.vertexCache.drawCall != draw->id)
{
ref = --draw.references; // Atomic
if(ref == 0)
{
if (draw.occlusionQuery)
{
for(int cluster = 0; cluster < clusterCount; cluster++)
{
draw.occlusionQuery->add(data.occlusion[cluster]);
}
draw.occlusionQuery->finish();
}
draw.vertexRoutine.reset();
draw.setupRoutine.reset();
draw.pixelRoutine.reset();
if(draw.events)
{
draw.events->finish();
draw.events = nullptr;
}
sync.done();
draw.references = -1;
resumeApp->signal();
}
vertexTask.vertexCache.clear();
vertexTask.vertexCache.drawCall = draw->id;
}
pixelProgress[cluster].executing = false;
draw->vertexPointer(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
}
void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
{
Triangle *triangle = triangleBatch[unit];
int primitiveDrawCall = primitiveProgress[unit].drawCall;
DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS];
DrawData *data = draw->data;
VertexTask *task = vertexTask[thread];
const void *indices = data->indices;
VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
YARN_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
auto triangles = &batch->triangles[0];
auto primitives = &batch->primitives[0];
batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
}
if(task->vertexCache.drawCall != primitiveDrawCall)
{
task->vertexCache.clear();
task->vertexCache.drawCall = primitiveDrawCall;
// Registers one callback per cluster on the batch's cluster tickets. Each
// callback runs the draw's pixel routine for that cluster over the batch's
// visible primitives and then marks that cluster ticket as done.
//
// draw    - loan of the draw call being rendered.
// batch   - loan of the batch whose primitives are to be rasterized.
// finally - shared handle that is kept alive by the callbacks; it is not
//           invoked here.
void DrawCall::processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
{
// Bundles copies of the two loans and the finally handle so that every
// per-cluster callback can capture a single shared_ptr instead of three
// separate objects.
struct Data
{
Data(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
: draw(draw), batch(batch), finally(finally) {}
yarn::Loan<DrawCall> draw;
yarn::Loan<BatchData> batch;
// Held only for its lifetime: presumably the Finally fires once the last
// holder releases it — confirm against yarn::Finally.
std::shared_ptr<yarn::Finally> finally;
};
auto data = std::make_shared<Data>(draw, batch, finally);
for (int cluster = 0; cluster < MaxClusterCount; cluster++)
{
// NOTE(review): onCall() presumably defers the lambda until this ticket is
// called, i.e. earlier tickets on the same cluster queue are done — confirm
// against yarn::Ticket.
batch->clusterTickets[cluster].onCall([data, cluster]
{
auto& draw = data->draw;
auto& batch = data->batch;
YARN_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
// The cluster index and the total cluster count are both passed to the pixel routine.
draw->pixelPointer(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
batch->clusterTickets[cluster].done();
});
}
}
unsigned int batch[128 + 1][3]; // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
// Takes a ticket from drawTickets (the same queue DrawCall::run() takes a
// ticket from for every draw), waits on it, and then updates the device's
// sampling routine const cache before marking the ticket done.
// NOTE(review): the wait presumably returns only once every ticket taken
// earlier from drawTickets is done, i.e. all previously issued draws have
// finished — confirm against yarn::Ticket::Queue.
void Renderer::synchronize()
{
YARN_SCOPED_EVENT("synchronize");
auto ticket = drawTickets.take();
ticket.wait();
// Runs while this ticket is still held and not yet marked done.
device->updateSamplingRoutineConstCache();
ticket.done();
}
if(!indices)
void DrawCall::processPrimitiveVertices(
unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
const void *primitiveIndices,
VkIndexType indexType,
unsigned int start,
unsigned int triangleCount,
VkPrimitiveTopology topology)
{
if(!primitiveIndices)
{
struct LinearIndex
{
unsigned int operator[](unsigned int i) { return i; }
};
if(!setBatchIndices(batch, topology, LinearIndex(), start, triangleCount))
if(!setBatchIndices(triangleIndicesOut, topology, LinearIndex(), start, triangleCount))
{
return;
}
}
else
{
switch(draw->indexType.load())
switch(indexType)
{
case VK_INDEX_TYPE_UINT16:
if(!setBatchIndices(batch, topology, static_cast<const uint16_t*>(indices), start, triangleCount))
if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
{
return;
}
break;
case VK_INDEX_TYPE_UINT32:
if(!setBatchIndices(batch, topology, static_cast<const uint32_t*>(indices), start, triangleCount))
if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
{
return;
}
......@@ -824,33 +562,25 @@ namespace sw
}
// Repeat the last index to allow for SIMD width overrun.
batch[triangleCount][0] = batch[triangleCount - 1][2];
batch[triangleCount][1] = batch[triangleCount - 1][2];
batch[triangleCount][2] = batch[triangleCount - 1][2];
task->primitiveStart = start;
task->vertexCount = triangleCount * 3;
vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
}
int Renderer::setupTriangles(int unit, int count)
int DrawCall::setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
Triangle *triangle = triangleBatch[unit];
Primitive *primitive = primitiveBatch[unit];
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
auto &state = drawCall->setupState;
auto setupRoutine = drawCall->setupPointer;
int ms = state.multiSample;
const DrawData *data = draw.data;
const DrawData *data = drawCall->data;
int visible = 0;
for(int i = 0; i < count; i++, triangle++)
for(int i = 0; i < count; i++, triangles++)
{
Vertex &v0 = triangle->v0;
Vertex &v1 = triangle->v1;
Vertex &v2 = triangle->v2;
Vertex &v0 = triangles->v0;
Vertex &v1 = triangles->v1;
Vertex &v2 = triangles->v2;
if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
{
......@@ -860,15 +590,15 @@ namespace sw
if(clipFlagsOr != Clipper::CLIP_FINITE)
{
if(!Clipper::Clip(polygon, clipFlagsOr, draw))
if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
{
continue;
}
}
if(setupRoutine(primitive, triangle, &polygon, data))
if(setupRoutine(primitives, triangles, &polygon, data))
{
primitive += ms;
primitives += ms;
visible++;
}
}
......@@ -877,57 +607,49 @@ namespace sw
return visible;
}
int Renderer::setupLines(int unit, int count)
int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
Triangle *triangle = triangleBatch[unit];
Primitive *primitive = primitiveBatch[unit];
int visible = 0;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
auto &state = drawCall->setupState;
int visible = 0;
int ms = state.multiSample;
for(int i = 0; i < count; i++)
{
if(setupLine(*primitive, *triangle, draw))
if(setupLine(*primitives, *triangles, *drawCall))
{
primitive += ms;
primitives += ms;
visible++;
}
triangle++;
triangles++;
}
return visible;
}
int Renderer::setupPoints(int unit, int count)
int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
{
Triangle *triangle = triangleBatch[unit];
Primitive *primitive = primitiveBatch[unit];
int visible = 0;
DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
SetupProcessor::State &state = draw.setupState;
auto &state = drawCall->setupState;
int visible = 0;
int ms = state.multiSample;
for(int i = 0; i < count; i++)
{
if(setupPoint(*primitive, *triangle, draw))
if(setupPoint(*primitives, *triangles, *drawCall))
{
primitive += ms;
primitives += ms;
visible++;
}
triangle++;
triangles++;
}
return visible;
}
bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data;
......@@ -1120,7 +842,7 @@ namespace sw
return false;
}
bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
{
const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
const DrawData &data = *draw.data;
......@@ -1183,76 +905,6 @@ namespace sw
return false;
}
// Sizes the primitive-unit and pixel-cluster counts from threadCount,
// allocates the per-unit triangle/primitive batches, and starts one worker
// thread per threadCount entry, each initially in the SUSPEND state.
void Renderer::initializeThreads()
{
// Both counts are threadCount passed through ceilPow2().
unitCount = ceilPow2(threadCount);
clusterCount = ceilPow2(threadCount);
for(int i = 0; i < unitCount; i++)
{
triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
}
for(int i = 0; i < threadCount; i++)
{
vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
// -1 does not match any draw call index compared against it in processPrimitiveVertices().
vertexTask[i]->vertexCache.drawCall = -1;
task[i].type = Task::SUSPEND;
resume[i] = new Event();
suspend[i] = new Event();
// 'parameters' is a stack local handed to the thread by pointer; it must
// stay alive until threadFunction() has read it.
Parameters parameters;
parameters.threadIndex = i;
parameters.renderer = this;
exitThreads = false;
worker[i] = new std::thread(threadFunction, &parameters);
// threadLoop() signals suspend[i] only after threadFunction() has copied
// the parameters, so this wait keeps 'parameters' valid for the new thread.
suspend[i]->wait();
// Re-signal so the next suspend[i]->wait() issued before resuming this
// thread does not block.
suspend[i]->signal();
}
}
// Waits for all workers to go idle, then stops and joins every worker thread
// and releases the per-thread and per-unit allocations.
void Renderer::terminateThreads()
{
// Spin (yielding) until no worker thread is awake.
while(threadsAwake != 0)
{
std::this_thread::yield();
}
for(int thread = 0; thread < threadCount; thread++)
{
if(worker[thread])
{
// threadLoop() tests exitThreads at the top of its loop; signalling resume
// releases the worker from resume[thread]->wait() so it can see the flag.
exitThreads = true;
resume[thread]->signal();
worker[thread]->join();
delete worker[thread];
worker[thread] = 0;
delete resume[thread];
resume[thread] = 0;
delete suspend[thread];
suspend[thread] = 0;
}
// Executed even when no worker was started for this index, in which case
// vertexTask[thread] is still the nullptr set by the constructor.
// NOTE(review): assumes deallocate() accepts a null pointer — confirm.
deallocate(vertexTask[thread]);
vertexTask[thread] = 0;
}
// Frees all 16 batch slots regardless of how many units were allocated.
for(int i = 0; i < 16; i++)
{
deallocate(triangleBatch[i]);
triangleBatch[i] = 0;
deallocate(primitiveBatch[i]);
primitiveBatch[i] = 0;
}
}
void Renderer::addQuery(vk::Query *query)
{
ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
......@@ -1292,28 +944,4 @@ namespace sw
this->scissor = scissor;
}
// Applies the renderer configuration.
// initialUpdate - true when called from the Renderer constructor; the
//                 declaration defaults it to false, which is what
//                 Renderer::draw() uses.
void Renderer::updateConfiguration(bool initialUpdate)
{
if(initialUpdate)
{
// Stop any existing workers before threadCount is changed below.
terminateThreads();
VertexProcessor::setRoutineCacheSize(1024);
PixelProcessor::setRoutineCacheSize(1024);
SetupProcessor::setRoutineCacheSize(1024);
threadCount = CPUID::processAffinity();
CPUID::setEnableSSE4_1(true);
CPUID::setEnableSSSE3(true);
CPUID::setEnableSSE3(true);
CPUID::setEnableSSE2(true);
CPUID::setEnableSSE(true);
}
// Worker threads are created lazily: only on a non-initial call, and only
// while worker[0] is still null.
if(!initialUpdate && !worker[0])
{
initializeThreads();
}
}
}
......@@ -19,11 +19,15 @@
#include "PixelProcessor.hpp"
#include "SetupProcessor.hpp"
#include "Plane.hpp"
#include "Primitive.hpp"
#include "Blitter.hpp"
#include "Device/Config.hpp"
#include "System/Synchronization.hpp"
#include "Vulkan/VkDescriptorSet.hpp"
#include "Yarn/Pool.hpp"
#include "Yarn/Finally.hpp"
#include "Yarn/Ticket.hpp"
#include <atomic>
#include <list>
#include <mutex>
......@@ -46,6 +50,14 @@ namespace sw
class Resource;
struct Constants;
// Compile-time limits used by DrawData, DrawCall and Renderer.
// Number of entries in a TriangleBatch / PrimitiveBatch. Renderer::draw()
// divides it by the sample count to get the number of primitives per batch.
static constexpr int MaxBatchSize = 128;
// Size argument of the yarn::BoundedPool that holds DrawCall::BatchData objects.
static constexpr int MaxBatchCount = 16;
// Number of pixel clusters: sizes DrawData::occlusion, BatchData::clusterTickets
// and Renderer::clusterQueues, and bounds the per-cluster loops.
static constexpr int MaxClusterCount = 16;
// Size argument of the yarn::BoundedPool that holds DrawCall objects.
static constexpr int MaxDrawCount = 16;
// Fixed-size storage for the triangles and primitives of a single batch.
using TriangleBatch = std::array<Triangle, MaxBatchSize>;
using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
struct DrawData
{
const Constants *constants;
......@@ -64,7 +76,7 @@ namespace sw
PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise
PixelProcessor::Factor factor;
unsigned int occlusion[16]; // Number of pixels passing depth test
unsigned int occlusion[MaxClusterCount]; // Number of pixels passing depth test
float4 Wx16;
float4 Hx16;
......@@ -100,71 +112,88 @@ namespace sw
PushConstantStorage pushConstants;
};
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
struct DrawCall
{
struct Task
struct BatchData
{
enum Type
{
PRIMITIVES,
PIXELS,
RESUME,
SUSPEND
};
void operator=(const Task& task)
{
type = task.type.load();
primitiveUnit = task.primitiveUnit.load();
pixelCluster = task.pixelCluster.load();
}
std::atomic<int> type;
std::atomic<int> primitiveUnit;
std::atomic<int> pixelCluster;
using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>;
TriangleBatch triangles;
PrimitiveBatch primitives;
VertexTask vertexTask;
unsigned int id;
unsigned int firstPrimitive;
unsigned int numPrimitives;
int numVisible;
yarn::Ticket clusterTickets[MaxClusterCount];
};
struct PrimitiveProgress
{
void init()
{
drawCall = 0;
firstPrimitive = 0;
primitiveCount = 0;
visible = 0;
references = 0;
}
std::atomic<int> drawCall;
std::atomic<int> firstPrimitive;
std::atomic<int> primitiveCount;
std::atomic<int> visible;
std::atomic<int> references;
};
using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>;
using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
struct PixelProgress
{
void init()
{
drawCall = 0;
processedPrimitives = 0;
executing = false;
}
std::atomic<int> drawCall;
std::atomic<int> processedPrimitives;
std::atomic<int> executing;
};
DrawCall();
~DrawCall();
static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]);
static void processVertices(DrawCall* draw, BatchData* batch);
static void processPrimitives(DrawCall* draw, BatchData* batch);
static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally);
void setup();
void teardown();
int id;
BatchData::Pool *batchDataPool;
unsigned int numPrimitives;
unsigned int numPrimitivesPerBatch;
unsigned int numBatches;
VkPrimitiveTopology topology;
VkIndexType indexType;
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
SetupFunction setupPrimitives;
SetupProcessor::State setupState;
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query* occlusionQuery;
DrawData *data;
static void processPrimitiveVertices(
unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
const void *primitiveIndices,
VkIndexType indexType,
unsigned int start,
unsigned int triangleCount,
VkPrimitiveTopology topology);
static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
};
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
{
public:
Renderer(vk::Device* device);
virtual ~Renderer();
void *operator new(size_t size);
void operator delete(void * mem);
bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
......@@ -182,74 +211,18 @@ namespace sw
void synchronize();
static int getClusterCount() { return clusterCount; }
private:
static void threadFunction(void *parameters);
void threadLoop(int threadIndex);
void taskLoop(int threadIndex);
void findAvailableTasks();
void scheduleTask(int threadIndex);
void executeTask(int threadIndex);
void finishRendering(Task &pixelTask);
void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
int setupTriangles(int batch, int count);
int setupLines(int batch, int count);
int setupPoints(int batch, int count);
bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
void updateConfiguration(bool initialUpdate = false);
void initializeThreads();
void terminateThreads();
VkViewport viewport;
VkRect2D scissor;
Triangle *triangleBatch[16];
Primitive *primitiveBatch[16];
std::atomic<int> exitThreads;
std::atomic<int> threadsAwake;
std::thread *worker[16];
Event *resume[16]; // Events for resuming threads
Event *suspend[16]; // Events for suspending threads
Event *resumeApp; // Event for resuming the application thread
DrawCall::Pool drawCallPool;
DrawCall::BatchData::Pool batchDataPool;
PrimitiveProgress primitiveProgress[16];
PixelProgress pixelProgress[16];
Task task[16]; // Current tasks for threads
enum {
DRAW_COUNT = 16, // Number of draw calls buffered (must be power of 2)
DRAW_COUNT_BITS = DRAW_COUNT - 1,
};
DrawCall *drawCall[DRAW_COUNT];
DrawCall *drawList[DRAW_COUNT];
std::atomic<int> currentDraw;
std::atomic<int> nextDraw;
enum {
TASK_COUNT = 32, // Size of the task queue (must be power of 2)
TASK_COUNT_BITS = TASK_COUNT - 1,
};
Task taskQueue[TASK_COUNT];
std::atomic<int> qHead;
std::atomic<int> qSize;
static std::atomic<int> unitCount;
static std::atomic<int> clusterCount;
std::mutex schedulerMutex;
VertexTask *vertexTask[16];
std::atomic<int> nextDrawID = {0};
vk::Query *occlusionQuery;
WaitGroup sync;
yarn::Ticket::Queue drawTickets;
yarn::Ticket::Queue clusterQueues[MaxClusterCount];
VertexProcessor::State vertexState;
SetupProcessor::State setupState;
......@@ -262,40 +235,6 @@ namespace sw
vk::Device* device;
};
// State of a single draw call as tracked by the Renderer's task scheduler:
// the routines to run, the bound attachments, and atomic progress counters.
struct DrawCall
{
DrawCall();
~DrawCall();
// Stored as atomic ints; readers cast or load them back (see
// Renderer::processPrimitiveVertices()).
std::atomic<int> topology;
std::atomic<int> indexType;
std::atomic<int> batchSize;
// Routine objects; Renderer::finishRendering() resets them once the draw
// call's last reference is released.
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
// Renderer member function that sets up a batch of primitives; Renderer::draw()
// selects setupTriangles, setupLines or setupPoints.
int (Renderer::*setupPrimitives)(int batch, int count);
SetupProcessor::State setupState;
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query *occlusionQuery;
std::atomic<int> primitive; // Current primitive to enter pipeline
std::atomic<int> count; // Number of primitives to render
std::atomic<int> references; // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
// Allocated in the DrawCall constructor via allocate().
DrawData *data;
};
}
#endif // sw_Renderer_hpp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment