Device: Migrate Renderer to Yarn

Drop the complex task scheduling logic in favor of Yarn. Performance gains of up to ~30% FPS seen. Bug: b/139142453 Change-Id: I264fee36323425a791088565d99dc586670a948a Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572 Tested-by: Ben Clayton <bclayton@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com>

Device: Migrate Renderer to Yarn
cde4dd96 · Ben Clayton · b5f0a4be · cde4dd96 · cde4dd96
Commit cde4dd96 authored Aug 27, 2019 by Ben Clayton
Hide whitespace changes
Inline Side-by-side

Showing with 262 additions and 695 deletions

Renderer.cpp src/Device/Renderer.cpp +172 -544

Renderer.hpp src/Device/Renderer.hpp +90 -151

No files found.
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
@@ -19,7 +19,6 @@
 #include "Polygon.hpp"
 #include "Reactor/Reactor.hpp"
 #include "Pipeline/Constants.hpp"
-#include "System/CPUID.hpp"
 #include "System/Memory.hpp"
 #include "System/Half.hpp"
 #include "System/Math.hpp"
@@ -33,6 +32,10 @@
 #include "Pipeline/SpirvShader.hpp"
 #include "Vertex.hpp"
+#include "Yarn/Containers.hpp"
+#include "Yarn/Defer.hpp"
+#include "Yarn/Trace.hpp"
 #undef max
 #ifndef NDEBUG
@@ -42,11 +45,6 @@ unsigned int maxPrimitives = 1 << 21;
 namespace sw
 {
-	static const int batchSize = 128;
-	std::atomic<int> threadCount(1);
-	std::atomic<int> Renderer::unitCount(1);
-	std::atomic<int> Renderer::clusterCount(1);
 	template<typename T>
 	inline bool setBatchIndices(unsigned int batch[128][3], VkPrimitiveTopology topology, T indices, unsigned int start, unsigned int triangleCount)
 	{
@@ -138,20 +136,8 @@ namespace sw
 		return true;
 	}
-	struct Parameters
-	{
-		Renderer *renderer;
-		int threadIndex;
-	};
 	DrawCall::DrawCall()
 	{
-		occlusionQuery = nullptr;
-		references = -1;
-		events = nullptr;
 		data = (DrawData*)allocate(sizeof(DrawData));
 		data->constants = &constants;
 	}
@@ -163,74 +149,14 @@ namespace sw
 	Renderer::Renderer(vk::Device* device) : device(device)
 	{
-		for(int i = 0; i < 16; i++)
+		VertexProcessor::setRoutineCacheSize(1024);
-		{
+		PixelProcessor::setRoutineCacheSize(1024);
-			vertexTask[i] = nullptr;
+		SetupProcessor::setRoutineCacheSize(1024);
-			worker[i] = nullptr;
-			resume[i] = nullptr;
-			suspend[i] = nullptr;
-		}
-		threadsAwake = 0;
-		resumeApp = new Event();
-		currentDraw = 0;
-		nextDraw = 0;
-		qHead = 0;
-		qSize = 0;
-		for(int i = 0; i < 16; i++)
-		{
-			triangleBatch[i] = nullptr;
-			primitiveBatch[i] = nullptr;
-		}
-		for(int draw = 0; draw < DRAW_COUNT; draw++)
-		{
-			drawCall[draw] = new DrawCall();
-			drawList[draw] = drawCall[draw];
-		}
-		for(int unit = 0; unit < 16; unit++)
-		{
-			primitiveProgress[unit].init();
-		}
-		for(int cluster = 0; cluster < 16; cluster++)
-		{
-			pixelProgress[cluster].init();
-		}
-		updateConfiguration(true);
 	}
 	Renderer::~Renderer()
 	{
-		sync.wait();
+		drawTickets.take().wait();
-		terminateThreads();
-		delete resumeApp;
-		resumeApp = nullptr;
-		for(int draw = 0; draw < DRAW_COUNT; draw++)
-		{
-			delete drawCall[draw];
-			drawCall[draw] = nullptr;
-		}
-	}
-	// This object has to be mem aligned
-	void* Renderer::operator new(size_t size)
-	{
-		ASSERT(size == sizeof(Renderer)); // This operator can't be called from a derived class
-		return sw::allocate(sizeof(Renderer), 16);
-	}
-	void Renderer::operator delete(void * mem)
-	{
-		sw::deallocate(mem);
 	}
 	void Renderer::draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
@@ -239,6 +165,9 @@ namespace sw
 	{
 		if(count == 0) { return; }
+		auto id = nextDrawID++;
+		YARN_SCOPED_EVENT("draw %d", id);
 		#ifndef NDEBUG
 		{
 			unsigned int minPrimitives = 1;
@@ -250,8 +179,6 @@ namespace sw
 		}
 		#endif
-		updateConfiguration();
 		int ms = context->sampleCount;
 		if(!context->multiSampleMask)
@@ -259,10 +186,16 @@ namespace sw
 			return;
 		}
-		sync.add();
+		yarn::Pool<sw::DrawCall>::Loan draw;
+		{
+			YARN_SCOPED_EVENT("drawCallPool.borrow()");
+			draw = drawCallPool.borrow();
+		}
+		draw->id = id;
 		if(update)
 		{
+			YARN_SCOPED_EVENT("update");
 			vertexState = VertexProcessor::update(context);
 			setupState = SetupProcessor::update(context);
 			pixelState = PixelProcessor::update(context);
@@ -272,56 +205,29 @@ namespace sw
 			pixelRoutine = PixelProcessor::routine(pixelState, context->pipelineLayout, context->pixelShader, context->descriptorSets);
 		}
-		int batch = batchSize / ms;
+		DrawCall::SetupFunction setupPrimitives = nullptr;
-		int (Renderer::*setupPrimitives)(int batch, int count);
 		if(context->isDrawTriangle())
 		{
-			setupPrimitives = &Renderer::setupTriangles;
+			setupPrimitives = &DrawCall::setupTriangles;
 		}
 		else if(context->isDrawLine())
 		{
-			setupPrimitives = &Renderer::setupLines;
+			setupPrimitives = &DrawCall::setupLines;
 		}
 		else   // Point draw
 		{
-			setupPrimitives = &Renderer::setupPoints;
+			setupPrimitives = &DrawCall::setupPoints;
 		}
-		DrawCall *draw = nullptr;
-		do
-		{
-			for(int i = 0; i < DRAW_COUNT; i++)
-			{
-				if(drawCall[i]->references == -1)
-				{
-					draw = drawCall[i];
-					drawList[nextDraw & DRAW_COUNT_BITS] = draw;
-					break;
-				}
-			}
-			if(!draw)
-			{
-				resumeApp->wait();
-			}
-		}
-		while(!draw);
 		DrawData *data = draw->data;
-		if (occlusionQuery)
-		{
-			occlusionQuery->start();
-		}
 		draw->occlusionQuery = occlusionQuery;
+		draw->batchDataPool = &batchDataPool;
+		draw->numPrimitives = count;
+		draw->numPrimitivesPerBatch = MaxBatchSize / ms;
+		draw->numBatches = (count + draw->numPrimitivesPerBatch - 1) / draw->numPrimitivesPerBatch;
 		draw->topology = context->topology;
 		draw->indexType = indexType;
-		draw->batchSize = batch;
 		draw->vertexRoutine = vertexRoutine;
 		draw->setupRoutine = setupRoutine;
@@ -335,14 +241,6 @@ namespace sw
 		data->descriptorSets = context->descriptorSets;
 		data->descriptorDynamicOffsets = context->descriptorDynamicOffsets;
-		if(events)
-		{
-			events->start();
-		}
-		ASSERT(!draw->events);
-		draw->events = events;
 		for(int i = 0; i < MAX_INTERFACE_COMPONENTS/4; i++)
 		{
 			data->input[i] = context->input[i].buffer;
@@ -383,7 +281,7 @@ namespace sw
 		if(pixelState.occlusionEnabled)
 		{
-			for(int cluster = 0; cluster < clusterCount; cluster++)
+			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
 			{
 				data->occlusion[cluster] = 0;
 			}
@@ -461,357 +359,197 @@ namespace sw
 			data->pushConstants = pushConstants;
 		}
-		draw->primitive = 0;
+		draw->events = events;
-		draw->count = count;
-		draw->references = (count + batch - 1) / batch;
-		schedulerMutex.lock();
-		++nextDraw; // Atomic
-		schedulerMutex.unlock();
-		#ifndef NDEBUG
-		if(threadCount == 1)   // Use main thread for draw execution
-		{
-			threadsAwake = 1;
-			task[0].type = Task::RESUME;
-			taskLoop(0);
-		}
-		else
-		#endif
-		{
-			if(!threadsAwake)
-			{
-				suspend[0]->wait();
-				threadsAwake = 1;
-				task[0].type = Task::RESUME;
-				resume[0]->signal();
-			}
-		}
-	}
-	void Renderer::threadFunction(void *parameters)
-	{
-		Renderer *renderer = static_cast<Parameters*>(parameters)->renderer;
-		int threadIndex = static_cast<Parameters*>(parameters)->threadIndex;
-		CPUID::setFlushToZero(true);
-		CPUID::setDenormalsAreZero(true);
-		renderer->threadLoop(threadIndex);
+		DrawCall::run(draw, &drawTickets, clusterQueues);
 	}
-	void Renderer::threadLoop(int threadIndex)
+	void DrawCall::setup()
 	{
-		while(!exitThreads)
+		if(occlusionQuery != nullptr)
 		{
-			taskLoop(threadIndex);
+			occlusionQuery->start();
-			suspend[threadIndex]->signal();
-			resume[threadIndex]->wait();
 		}
-	}
-	void Renderer::taskLoop(int threadIndex)
+		if(events)
-	{
-		while(task[threadIndex].type != Task::SUSPEND)
 		{
-			scheduleTask(threadIndex);
+			events->start();
-			executeTask(threadIndex);
 		}
 	}
-	void Renderer::findAvailableTasks()
+	void DrawCall::teardown()
 	{
-		// Find pixel tasks
+		if(events)
-		for(int cluster = 0; cluster < clusterCount; cluster++)
-		{
-			if(!pixelProgress[cluster].executing)
-			{
-				for(int unit = 0; unit < unitCount; unit++)
-				{
-					if(primitiveProgress[unit].references > 0)   // Contains processed primitives
-					{
-						if(pixelProgress[cluster].drawCall == primitiveProgress[unit].drawCall)
-						{
-							if(pixelProgress[cluster].processedPrimitives == primitiveProgress[unit].firstPrimitive)   // Previous primitives have been rendered
-							{
-								Task &task = taskQueue[qHead];
-								task.type = Task::PIXELS;
-								task.primitiveUnit = unit;
-								task.pixelCluster = cluster;
-								pixelProgress[cluster].executing = true;
-								// Commit to the task queue
-								qHead = (qHead + 1) & TASK_COUNT_BITS;
-								++qSize; // Atomic
-								break;
-							}
-						}
-					}
-				}
-			}
-		}
-		// Find primitive tasks
-		if(currentDraw == nextDraw)
 		{
-			return;   // No more primitives to process
+			events->finish();
+			events = nullptr;
 		}
-		for(int unit = 0; unit < unitCount; unit++)
+		if (occlusionQuery != nullptr)
 		{
-			DrawCall *draw = drawList[currentDraw & DRAW_COUNT_BITS];
+			for(int cluster = 0; cluster < MaxClusterCount; cluster++)
-			int primitive = draw->primitive;
-			int count = draw->count;
-			if(primitive >= count)
-			{
-				++currentDraw; // Atomic
-				if(currentDraw == nextDraw)
-				{
-					return;   // No more primitives to process
-				}
-				draw = drawList[currentDraw & DRAW_COUNT_BITS];
-			}
-			if(!primitiveProgress[unit].references)   // Task not already being executed and not still in use by a pixel unit
 			{
-				primitive = draw->primitive;
+				occlusionQuery->add(data->occlusion[cluster]);
-				count = draw->count;
-				int batch = draw->batchSize;
-				primitiveProgress[unit].drawCall = currentDraw.load();
-				primitiveProgress[unit].firstPrimitive = primitive;
-				primitiveProgress[unit].primitiveCount = count - primitive >= batch ? batch : count - primitive;
-				draw->primitive += batch;
-				Task &task = taskQueue[qHead];
-				task.type = Task::PRIMITIVES;
-				task.primitiveUnit = unit;
-				primitiveProgress[unit].references = -1;
-				// Commit to the task queue
-				qHead = (qHead + 1) & TASK_COUNT_BITS;
-				++qSize; // Atomic
 			}
+			occlusionQuery->finish();
 		}
+		vertexRoutine.reset();
+		setupRoutine.reset();
+		pixelRoutine.reset();
 	}
-	void Renderer::scheduleTask(int threadIndex)
+	void DrawCall::run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount])
 	{
-		schedulerMutex.lock();
+		draw->setup();
-		int curThreadsAwake = threadsAwake;
+		auto const numPrimitives = draw->numPrimitives;
+		auto const numPrimitivesPerBatch = draw->numPrimitivesPerBatch;
+		auto const numBatches = draw->numBatches;
-		if((int)qSize < threadCount - curThreadsAwake + 1)
+		auto ticket = tickets->take();
-		{
+		auto finally = yarn::make_shared_finally([draw, ticket] {
-			findAvailableTasks();
+			YARN_SCOPED_EVENT("FINISH draw %d", draw->id);
-		}
+			draw->teardown();
+			ticket.done();
+		});
-		if(qSize != 0)
+		for (unsigned int batchId = 0; batchId < numBatches; batchId++)
 		{
-			task[threadIndex] = taskQueue[(qHead - qSize) & TASK_COUNT_BITS];
+			auto batch = draw->batchDataPool->borrow();
-			--qSize; // Atomic
+			batch->id = batchId;
+			batch->firstPrimitive = batch->id * numPrimitivesPerBatch;
+			batch->numPrimitives = std::min(batch->firstPrimitive + numPrimitivesPerBatch, numPrimitives) - batch->firstPrimitive;
-			if(curThreadsAwake != threadCount)
+			for (int cluster = 0; cluster < MaxClusterCount; cluster++)
 			{
-				int wakeup = qSize - curThreadsAwake + 1;
+				batch->clusterTickets[cluster] = std::move(clusterQueues[cluster].take());
-				for(int i = 0; i < threadCount && wakeup > 0; i++)
-				{
-					if(task[i].type == Task::SUSPEND)
-					{
-						suspend[i]->wait();
-						task[i].type = Task::RESUME;
-						resume[i]->signal();
-						++threadsAwake; // Atomic
-						wakeup--;
-					}
-				}
 			}
-		}
-		else
-		{
-			task[threadIndex].type = Task::SUSPEND;
-			--threadsAwake; // Atomic
+			yarn::schedule([draw, batch, finally] {
-		}
-		schedulerMutex.unlock();
+				processVertices(draw.get(), batch.get());
-	}
-	void Renderer::executeTask(int threadIndex)
+				if (!draw->setupState.rasterizerDiscard)
-	{
-		switch(task[threadIndex].type.load())
-		{
-		case Task::PRIMITIVES:
-			{
-				int unit = task[threadIndex].primitiveUnit;
-				int input = primitiveProgress[unit].firstPrimitive;
-				int count = primitiveProgress[unit].primitiveCount;
-				DrawCall *draw = drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
-				int (Renderer::*setupPrimitives)(int batch, int count) = draw->setupPrimitives;
-				processPrimitiveVertices(unit, input, count, draw->count, threadIndex);
-				int visible = 0;
-				if(!draw->setupState.rasterizerDiscard)
 				{
-					visible = (this->*setupPrimitives)(unit, count);
+					processPrimitives(draw.get(), batch.get());
-				}
-				primitiveProgress[unit].visible = visible;
+					if (batch->numVisible > 0)
-				primitiveProgress[unit].references = clusterCount.load();
+					{
-			}
+						processPixels(draw, batch, finally);
-			break;
+						return;
-		case Task::PIXELS:
+					}
-			{
+				}
-				int unit = task[threadIndex].primitiveUnit;
-				int visible = primitiveProgress[unit].visible;
-				if(visible > 0)
+				for (int cluster = 0; cluster < MaxClusterCount; cluster++)
 				{
-					int cluster = task[threadIndex].pixelCluster;
+					batch->clusterTickets[cluster].done();
-					Primitive *primitive = primitiveBatch[unit];
-					DrawCall *draw = drawList[pixelProgress[cluster].drawCall & DRAW_COUNT_BITS];
-					DrawData *data = draw->data;
-					PixelProcessor::RoutinePointer pixelRoutine = draw->pixelPointer;
-					pixelRoutine(primitive, visible, cluster, clusterCount, data);
 				}
+			});
-				finishRendering(task[threadIndex]);
-			}
-			break;
-		case Task::RESUME:
-			break;
-		case Task::SUSPEND:
-			break;
-		default:
-			ASSERT(false);
 		}
 	}
-	void Renderer::synchronize()
+	void DrawCall::processVertices(DrawCall* draw, BatchData* batch)
 	{
-		sync.wait();
+		YARN_SCOPED_EVENT("VERTEX draw %d, batch %d", draw->id, batch->id);
-		device->updateSamplingRoutineConstCache();
-	}
-	void Renderer::finishRendering(Task &pixelTask)
-	{
-		int unit = pixelTask.primitiveUnit;
-		int cluster = pixelTask.pixelCluster;
-		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
-		DrawData &data = *draw.data;
-		int primitive = primitiveProgress[unit].firstPrimitive;
-		int count = primitiveProgress[unit].primitiveCount;
-		int processedPrimitives = primitive + count;
-		pixelProgress[cluster].processedPrimitives = processedPrimitives;
+		unsigned int triangleIndices[MaxBatchSize + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
-		if(pixelProgress[cluster].processedPrimitives >= draw.count)
 		{
-			++pixelProgress[cluster].drawCall; // Atomic
+			YARN_SCOPED_EVENT("processPrimitiveVertices");
-			pixelProgress[cluster].processedPrimitives = 0;
+			processPrimitiveVertices(
+				triangleIndices,
+				draw->data->indices,
+				draw->indexType,
+				batch->firstPrimitive,
+				batch->numPrimitives,
+				draw->topology);
 		}
-		int ref = --primitiveProgress[unit].references; // Atomic
+		auto& vertexTask = batch->vertexTask;
+		vertexTask.primitiveStart = batch->firstPrimitive;
-		if(ref == 0)
+		vertexTask.vertexCount = batch->numPrimitives * 3;
+		if (vertexTask.vertexCache.drawCall != draw->id)
 		{
-			ref = --draw.references; // Atomic
+			vertexTask.vertexCache.clear();
+			vertexTask.vertexCache.drawCall = draw->id;
-			if(ref == 0)
-			{
-				if (draw.occlusionQuery)
-				{
-					for(int cluster = 0; cluster < clusterCount; cluster++)
-					{
-						draw.occlusionQuery->add(data.occlusion[cluster]);
-					}
-					draw.occlusionQuery->finish();
-				}
-				draw.vertexRoutine.reset();
-				draw.setupRoutine.reset();
-				draw.pixelRoutine.reset();
-				if(draw.events)
-				{
-					draw.events->finish();
-					draw.events = nullptr;
-				}
-				sync.done();
-				draw.references = -1;
-				resumeApp->signal();
-			}
 		}
-		pixelProgress[cluster].executing = false;
+		draw->vertexPointer(&batch->triangles.front().v0, &triangleIndices[0][0], &vertexTask, draw->data);
 	}
-	void Renderer::processPrimitiveVertices(int unit, unsigned int start, unsigned int triangleCount, unsigned int loop, int thread)
+	void DrawCall::processPrimitives(DrawCall* draw, BatchData* batch)
 	{
-		Triangle *triangle = triangleBatch[unit];
+		YARN_SCOPED_EVENT("PRIMITIVES draw %d batch %d", draw->id, batch->id);
-		int primitiveDrawCall = primitiveProgress[unit].drawCall;
+		auto triangles = &batch->triangles[0];
-		DrawCall *draw = drawList[primitiveDrawCall & DRAW_COUNT_BITS];
+		auto primitives = &batch->primitives[0];
-		DrawData *data = draw->data;
+		batch->numVisible = draw->setupPrimitives(triangles, primitives, draw, batch->numPrimitives);
-		VertexTask *task = vertexTask[thread];
+	}
-		const void *indices = data->indices;
-		VertexProcessor::RoutinePointer vertexRoutine = draw->vertexPointer;
-		if(task->vertexCache.drawCall != primitiveDrawCall)
+	void DrawCall::processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
-		{
+	{
-			task->vertexCache.clear();
+		struct Data
-			task->vertexCache.drawCall = primitiveDrawCall;
+		{
+			Data(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally)
+				: draw(draw), batch(batch), finally(finally) {}
+			yarn::Loan<DrawCall> draw;
+			yarn::Loan<BatchData> batch;
+			std::shared_ptr<yarn::Finally> finally;
+		};
+		auto data = std::make_shared<Data>(draw, batch, finally);
+		for (int cluster = 0; cluster < MaxClusterCount; cluster++)
+		{
+			batch->clusterTickets[cluster].onCall([data, cluster]
+			{
+				auto& draw = data->draw;
+				auto& batch = data->batch;
+				YARN_SCOPED_EVENT("PIXEL draw %d, batch %d, cluster %d", draw->id, batch->id, cluster);
+				draw->pixelPointer(&batch->primitives.front(), batch->numVisible, cluster, MaxClusterCount, draw->data);
+				batch->clusterTickets[cluster].done();
+			});
 		}
+	}
-		unsigned int batch[128 + 1][3];  // One extra for SIMD width overrun. TODO: Adjust to dynamic batch size.
+	void Renderer::synchronize()
-		VkPrimitiveTopology topology = static_cast<VkPrimitiveTopology>(static_cast<int>(draw->topology));
+	{
+		YARN_SCOPED_EVENT("synchronize");
+		auto ticket = drawTickets.take();
+		ticket.wait();
+		device->updateSamplingRoutineConstCache();
+		ticket.done();
+	}
-		if(!indices)
+	void DrawCall::processPrimitiveVertices(
+		unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+		const void *primitiveIndices,
+		VkIndexType indexType,
+		unsigned int start,
+		unsigned int triangleCount,
+		VkPrimitiveTopology topology)
+	{
+		if(!primitiveIndices)
 		{
 			struct LinearIndex
 			{
 				unsigned int operator[](unsigned int i) { return i; }
 			};
-			if(!setBatchIndices(batch, topology, LinearIndex(), start, triangleCount))
+			if(!setBatchIndices(triangleIndicesOut, topology, LinearIndex(), start, triangleCount))
 			{
 				return;
 			}
 		}
 		else
 		{
-			switch(draw->indexType.load())
+			switch(indexType)
 			{
 			case VK_INDEX_TYPE_UINT16:
-				if(!setBatchIndices(batch, topology, static_cast<const uint16_t*>(indices), start, triangleCount))
+				if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint16_t*>(primitiveIndices), start, triangleCount))
 				{
 					return;
 				}
 				break;
 			case VK_INDEX_TYPE_UINT32:
-				if(!setBatchIndices(batch, topology, static_cast<const uint32_t*>(indices), start, triangleCount))
+				if(!setBatchIndices(triangleIndicesOut, topology, static_cast<const uint32_t*>(primitiveIndices), start, triangleCount))
 				{
 					return;
 				}
@@ -824,33 +562,25 @@ namespace sw
 		}
 		// Repeat the last index to allow for SIMD width overrun.
-		batch[triangleCount][0] = batch[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][0] = triangleIndicesOut[triangleCount - 1][2];
-		batch[triangleCount][1] = batch[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][1] = triangleIndicesOut[triangleCount - 1][2];
-		batch[triangleCount][2] = batch[triangleCount - 1][2];
+		triangleIndicesOut[triangleCount][2] = triangleIndicesOut[triangleCount - 1][2];
-		task->primitiveStart = start;
-		task->vertexCount = triangleCount * 3;
-		vertexRoutine(&triangle->v0, (unsigned int*)&batch, task, data);
 	}
-	int Renderer::setupTriangles(int unit, int count)
+	int DrawCall::setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
 	{
-		Triangle *triangle = triangleBatch[unit];
+		auto &state = drawCall->setupState;
-		Primitive *primitive = primitiveBatch[unit];
+		auto setupRoutine = drawCall->setupPointer;
-		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
-		SetupProcessor::State &state = draw.setupState;
-		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
 		int ms = state.multiSample;
-		const DrawData *data = draw.data;
+		const DrawData *data = drawCall->data;
 		int visible = 0;
-		for(int i = 0; i < count; i++, triangle++)
+		for(int i = 0; i < count; i++, triangles++)
 		{
-			Vertex &v0 = triangle->v0;
+			Vertex &v0 = triangles->v0;
-			Vertex &v1 = triangle->v1;
+			Vertex &v1 = triangles->v1;
-			Vertex &v2 = triangle->v2;
+			Vertex &v2 = triangles->v2;
 			if((v0.clipFlags & v1.clipFlags & v2.clipFlags) == Clipper::CLIP_FINITE)
 			{
@@ -860,15 +590,15 @@ namespace sw
 				if(clipFlagsOr != Clipper::CLIP_FINITE)
 				{
-					if(!Clipper::Clip(polygon, clipFlagsOr, draw))
+					if(!Clipper::Clip(polygon, clipFlagsOr, *drawCall))
 					{
 						continue;
 					}
 				}
-				if(setupRoutine(primitive, triangle, &polygon, data))
+				if(setupRoutine(primitives, triangles, &polygon, data))
 				{
-					primitive += ms;
+					primitives += ms;
 					visible++;
 				}
 			}
@@ -877,57 +607,49 @@ namespace sw
 		return visible;
 	}
-	int Renderer::setupLines(int unit, int count)
+	int DrawCall::setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
 	{
-		Triangle *triangle = triangleBatch[unit];
+		auto &state = drawCall->setupState;
-		Primitive *primitive = primitiveBatch[unit];
-		int visible = 0;
-		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
-		SetupProcessor::State &state = draw.setupState;
+		int visible = 0;
 		int ms = state.multiSample;
 		for(int i = 0; i < count; i++)
 		{
-			if(setupLine(*primitive, *triangle, draw))
+			if(setupLine(*primitives, *triangles, *drawCall))
 			{
-				primitive += ms;
+				primitives += ms;
 				visible++;
 			}
-			triangle++;
+			triangles++;
 		}
 		return visible;
 	}
-	int Renderer::setupPoints(int unit, int count)
+	int DrawCall::setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count)
 	{
-		Triangle *triangle = triangleBatch[unit];
+		auto &state = drawCall->setupState;
-		Primitive *primitive = primitiveBatch[unit];
-		int visible = 0;
-		DrawCall &draw = *drawList[primitiveProgress[unit].drawCall & DRAW_COUNT_BITS];
-		SetupProcessor::State &state = draw.setupState;
+		int visible = 0;
 		int ms = state.multiSample;
 		for(int i = 0; i < count; i++)
 		{
-			if(setupPoint(*primitive, *triangle, draw))
+			if(setupPoint(*primitives, *triangles, *drawCall))
 			{
-				primitive += ms;
+				primitives += ms;
 				visible++;
 			}
-			triangle++;
+			triangles++;
 		}
 		return visible;
 	}
-	bool Renderer::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	bool DrawCall::setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
 	{
 		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
 		const DrawData &data = *draw.data;
@@ -1120,7 +842,7 @@ namespace sw
 		return false;
 	}
-	bool Renderer::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
+	bool DrawCall::setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw)
 	{
 		const SetupProcessor::RoutinePointer &setupRoutine = draw.setupPointer;
 		const DrawData &data = *draw.data;
@@ -1183,76 +905,6 @@ namespace sw
 		return false;
 	}
-	void Renderer::initializeThreads()
-	{
-		unitCount = ceilPow2(threadCount);
-		clusterCount = ceilPow2(threadCount);
-		for(int i = 0; i < unitCount; i++)
-		{
-			triangleBatch[i] = (Triangle*)allocate(batchSize * sizeof(Triangle));
-			primitiveBatch[i] = (Primitive*)allocate(batchSize * sizeof(Primitive));
-		}
-		for(int i = 0; i < threadCount; i++)
-		{
-			vertexTask[i] = (VertexTask*)allocate(sizeof(VertexTask));
-			vertexTask[i]->vertexCache.drawCall = -1;
-			task[i].type = Task::SUSPEND;
-			resume[i] = new Event();
-			suspend[i] = new Event();
-			Parameters parameters;
-			parameters.threadIndex = i;
-			parameters.renderer = this;
-			exitThreads = false;
-			worker[i] = new std::thread(threadFunction, &parameters);
-			suspend[i]->wait();
-			suspend[i]->signal();
-		}
-	}
-	void Renderer::terminateThreads()
-	{
-		while(threadsAwake != 0)
-		{
-			std::this_thread::yield();
-		}
-		for(int thread = 0; thread < threadCount; thread++)
-		{
-			if(worker[thread])
-			{
-				exitThreads = true;
-				resume[thread]->signal();
-				worker[thread]->join();
-				delete worker[thread];
-				worker[thread] = 0;
-				delete resume[thread];
-				resume[thread] = 0;
-				delete suspend[thread];
-				suspend[thread] = 0;
-			}
-			deallocate(vertexTask[thread]);
-			vertexTask[thread] = 0;
-		}
-		for(int i = 0; i < 16; i++)
-		{
-			deallocate(triangleBatch[i]);
-			triangleBatch[i] = 0;
-			deallocate(primitiveBatch[i]);
-			primitiveBatch[i] = 0;
-		}
-	}
 	void Renderer::addQuery(vk::Query *query)
 	{
 		ASSERT(query->getType() == VK_QUERY_TYPE_OCCLUSION);
@@ -1292,28 +944,4 @@ namespace sw
 		this->scissor = scissor;
 	}
-	void Renderer::updateConfiguration(bool initialUpdate)
-	{
-		if(initialUpdate)
-		{
-			terminateThreads();
-			VertexProcessor::setRoutineCacheSize(1024);
-			PixelProcessor::setRoutineCacheSize(1024);
-			SetupProcessor::setRoutineCacheSize(1024);
-			threadCount = CPUID::processAffinity();
-			CPUID::setEnableSSE4_1(true);
-			CPUID::setEnableSSSE3(true);
-			CPUID::setEnableSSE3(true);
-			CPUID::setEnableSSE2(true);
-			CPUID::setEnableSSE(true);
-		}
-		if(!initialUpdate && !worker[0])
-		{
-			initializeThreads();
-		}
-	}
 }
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -19,11 +19,15 @@
 #include "PixelProcessor.hpp"
 #include "SetupProcessor.hpp"
 #include "Plane.hpp"
+#include "Primitive.hpp"
 #include "Blitter.hpp"
 #include "Device/Config.hpp"
-#include "System/Synchronization.hpp"
 #include "Vulkan/VkDescriptorSet.hpp"
+#include "Yarn/Pool.hpp"
+#include "Yarn/Finally.hpp"
+#include "Yarn/Ticket.hpp"
 #include <atomic>
 #include <list>
 #include <mutex>
@@ -46,6 +50,14 @@ namespace sw
 	class Resource;
 	struct Constants;
+	static constexpr int MaxBatchSize = 128;
+	static constexpr int MaxBatchCount = 16;
+	static constexpr int MaxClusterCount = 16;
+	static constexpr int MaxDrawCount = 16;
+	using TriangleBatch = std::array<Triangle, MaxBatchSize>;
+	using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
 	struct DrawData
 	{
 		const Constants *constants;
@@ -64,7 +76,7 @@ namespace sw
 		PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
 		PixelProcessor::Factor factor;
-		unsigned int occlusion[16];   // Number of pixels passing depth test
+		unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
 		float4 Wx16;
 		float4 Hx16;
@@ -100,71 +112,88 @@ namespace sw
 		PushConstantStorage pushConstants;
 	};
-	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+	struct DrawCall
 	{
-		struct Task
+		struct BatchData
 		{
-			enum Type
+			using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>;
-			{
-				PRIMITIVES,
+			TriangleBatch triangles;
-				PIXELS,
+			PrimitiveBatch primitives;
+			VertexTask vertexTask;
-				RESUME,
+			unsigned int id;
-				SUSPEND
+			unsigned int firstPrimitive;
-			};
+			unsigned int numPrimitives;
+			int numVisible;
-			void operator=(const Task& task)
+			yarn::Ticket clusterTickets[MaxClusterCount];
-			{
-				type = task.type.load();
-				primitiveUnit = task.primitiveUnit.load();
-				pixelCluster = task.pixelCluster.load();
-			}
-			std::atomic<int> type;
-			std::atomic<int> primitiveUnit;
-			std::atomic<int> pixelCluster;
 		};
-		struct PrimitiveProgress
+		using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>;
-		{
+		using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-			void init()
-			{
-				drawCall = 0;
-				firstPrimitive = 0;
-				primitiveCount = 0;
-				visible = 0;
-				references = 0;
-			}
-			std::atomic<int> drawCall;
-			std::atomic<int> firstPrimitive;
-			std::atomic<int> primitiveCount;
-			std::atomic<int> visible;
-			std::atomic<int> references;
-		};
-		struct PixelProgress
+		DrawCall();
-		{
+		~DrawCall();
-			void init()
-			{
-				drawCall = 0;
-				processedPrimitives = 0;
-				executing = false;
-			}
-			std::atomic<int> drawCall;
-			std::atomic<int> processedPrimitives;
-			std::atomic<int> executing;
-		};
+		static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]);
+		static void processVertices(DrawCall* draw, BatchData* batch);
+		static void processPrimitives(DrawCall* draw, BatchData* batch);
+		static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally);
+		void setup();
+		void teardown();
+		int id;
+		BatchData::Pool *batchDataPool;
+		unsigned int numPrimitives;
+		unsigned int numPrimitivesPerBatch;
+		unsigned int numBatches;
+		VkPrimitiveTopology topology;
+		VkIndexType indexType;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
+		VertexProcessor::RoutinePointer vertexPointer;
+		SetupProcessor::RoutinePointer setupPointer;
+		PixelProcessor::RoutinePointer pixelPointer;
+		SetupFunction setupPrimitives;
+		SetupProcessor::State setupState;
+		vk::ImageView *renderTarget[RENDERTARGETS];
+		vk::ImageView *depthBuffer;
+		vk::ImageView *stencilBuffer;
+		TaskEvents *events;
+		vk::Query* occlusionQuery;
+		DrawData *data;
+		static void processPrimitiveVertices(
+				unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+				const void *primitiveIndices,
+				VkIndexType indexType,
+				unsigned int start,
+				unsigned int triangleCount,
+				VkPrimitiveTopology topology);
+		static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+		static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+	};
+	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+	{
 	public:
 		Renderer(vk::Device* device);
 		virtual ~Renderer();
-		void *operator new(size_t size);
-		void operator delete(void * mem);
 		bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
 		void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
@@ -182,74 +211,18 @@ namespace sw
 		void synchronize();
-		static int getClusterCount() { return clusterCount; }
 	private:
-		static void threadFunction(void *parameters);
-		void threadLoop(int threadIndex);
-		void taskLoop(int threadIndex);
-		void findAvailableTasks();
-		void scheduleTask(int threadIndex);
-		void executeTask(int threadIndex);
-		void finishRendering(Task &pixelTask);
-		void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
-		int setupTriangles(int batch, int count);
-		int setupLines(int batch, int count);
-		int setupPoints(int batch, int count);
-		bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		void updateConfiguration(bool initialUpdate = false);
-		void initializeThreads();
-		void terminateThreads();
 		VkViewport viewport;
 		VkRect2D scissor;
-		Triangle *triangleBatch[16];
+		DrawCall::Pool drawCallPool;
-		Primitive *primitiveBatch[16];
+		DrawCall::BatchData::Pool batchDataPool;
-		std::atomic<int> exitThreads;
-		std::atomic<int> threadsAwake;
-		std::thread *worker[16];
-		Event *resume[16];         // Events for resuming threads
-		Event *suspend[16];        // Events for suspending threads
-		Event *resumeApp;          // Event for resuming the application thread
-		PrimitiveProgress primitiveProgress[16];
+		std::atomic<int> nextDrawID = {0};
-		PixelProgress pixelProgress[16];
-		Task task[16];   // Current tasks for threads
-		enum {
-			DRAW_COUNT = 16,   // Number of draw calls buffered (must be power of 2)
-			DRAW_COUNT_BITS = DRAW_COUNT - 1,
-		};
-		DrawCall *drawCall[DRAW_COUNT];
-		DrawCall *drawList[DRAW_COUNT];
-		std::atomic<int> currentDraw;
-		std::atomic<int> nextDraw;
-		enum {
-			TASK_COUNT = 32,   // Size of the task queue (must be power of 2)
-			TASK_COUNT_BITS = TASK_COUNT - 1,
-		};
-		Task taskQueue[TASK_COUNT];
-		std::atomic<int> qHead;
-		std::atomic<int> qSize;
-		static std::atomic<int> unitCount;
-		static std::atomic<int> clusterCount;
-		std::mutex schedulerMutex;
-		VertexTask *vertexTask[16];
 		vk::Query *occlusionQuery;
-		WaitGroup sync;
+		yarn::Ticket::Queue drawTickets;
+		yarn::Ticket::Queue clusterQueues[MaxClusterCount];
 		VertexProcessor::State vertexState;
 		SetupProcessor::State setupState;
@@ -262,40 +235,6 @@ namespace sw
 		vk::Device* device;
 	};
-	struct DrawCall
-	{
-		DrawCall();
-		~DrawCall();
-		std::atomic<int> topology;
-		std::atomic<int> indexType;
-		std::atomic<int> batchSize;
-		std::shared_ptr<Routine> vertexRoutine;
-		std::shared_ptr<Routine> setupRoutine;
-		std::shared_ptr<Routine> pixelRoutine;
-		VertexProcessor::RoutinePointer vertexPointer;
-		SetupProcessor::RoutinePointer setupPointer;
-		PixelProcessor::RoutinePointer pixelPointer;
-		int (Renderer::*setupPrimitives)(int batch, int count);
-		SetupProcessor::State setupState;
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
-		TaskEvents *events;
-		vk::Query *occlusionQuery;
-		std::atomic<int> primitive;    // Current primitive to enter pipeline
-		std::atomic<int> count;        // Number of primitives to render
-		std::atomic<int> references;   // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
-		DrawData *data;
-	};
 }
 #endif   // sw_Renderer_hpp