Device: Migrate Renderer to Yarn

Drop the complex task scheduling logic in favor of Yarn. Performance gains of up to ~30% FPS were observed.

Bug: b/139142453
Change-Id: I264fee36323425a791088565d99dc586670a948a
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572
Tested-by: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>

Device: Migrate Renderer to Yarn
cde4dd96 · Ben Clayton · b5f0a4be · cde4dd96 · cde4dd96
Commit cde4dd96 authored Aug 27, 2019 by Ben Clayton
Expand all Show whitespace changes
Inline Side-by-side

Showing with 88 additions and 149 deletions

Renderer.cpp src/Device/Renderer.cpp +0 -0

Renderer.hpp src/Device/Renderer.hpp +88 -149

No files found.
--- a/src/Device/Renderer.cpp
+++ b/src/Device/Renderer.cpp
--- a/src/Device/Renderer.hpp
+++ b/src/Device/Renderer.hpp
@@ -19,11 +19,15 @@
 #include "PixelProcessor.hpp"
 #include "SetupProcessor.hpp"
 #include "Plane.hpp"
+#include "Primitive.hpp"
 #include "Blitter.hpp"
 #include "Device/Config.hpp"
-#include "System/Synchronization.hpp"
 #include "Vulkan/VkDescriptorSet.hpp"
+#include "Yarn/Pool.hpp"
+#include "Yarn/Finally.hpp"
+#include "Yarn/Ticket.hpp"
 #include <atomic>
 #include <list>
 #include <mutex>
@@ -46,6 +50,14 @@ namespace sw
 	class Resource;
 	struct Constants;
+	static constexpr int MaxBatchSize = 128;
+	static constexpr int MaxBatchCount = 16;
+	static constexpr int MaxClusterCount = 16;
+	static constexpr int MaxDrawCount = 16;
+	using TriangleBatch = std::array<Triangle, MaxBatchSize>;
+	using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
 	struct DrawData
 	{
 		const Constants *constants;
@@ -64,7 +76,7 @@ namespace sw
 		PixelProcessor::Stencil stencil[2];   // clockwise, counterclockwise
 		PixelProcessor::Factor factor;
-		unsigned int occlusion[16];   // Number of pixels passing depth test
+		unsigned int occlusion[MaxClusterCount];   // Number of pixels passing depth test
 		float4 Wx16;
 		float4 Hx16;
@@ -100,71 +112,88 @@ namespace sw
 		PushConstantStorage pushConstants;
 	};
-	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+	struct DrawCall
-	{
-		struct Task
 	{
-			enum Type
+		struct BatchData
 		{
-				PRIMITIVES,
+			using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>;
-				PIXELS,
+			TriangleBatch triangles;
-				RESUME,
+			PrimitiveBatch primitives;
-				SUSPEND
+			VertexTask vertexTask;
+			unsigned int id;
+			unsigned int firstPrimitive;
+			unsigned int numPrimitives;
+			int numVisible;
+			yarn::Ticket clusterTickets[MaxClusterCount];
 		};
-			void operator=(const Task& task)
+		using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>;
-			{
+		using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
-				type = task.type.load();
-				primitiveUnit = task.primitiveUnit.load();
-				pixelCluster = task.pixelCluster.load();
-			}
-			std::atomic<int> type;
-			std::atomic<int> primitiveUnit;
-			std::atomic<int> pixelCluster;
-		};
-		struct PrimitiveProgress
+		DrawCall();
-		{
+		~DrawCall();
-			void init()
-			{
-				drawCall = 0;
-				firstPrimitive = 0;
-				primitiveCount = 0;
-				visible = 0;
-				references = 0;
-			}
-			std::atomic<int> drawCall;
-			std::atomic<int> firstPrimitive;
-			std::atomic<int> primitiveCount;
-			std::atomic<int> visible;
-			std::atomic<int> references;
-		};
-		struct PixelProgress
+		static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]);
-		{
+		static void processVertices(DrawCall* draw, BatchData* batch);
-			void init()
+		static void processPrimitives(DrawCall* draw, BatchData* batch);
-			{
+		static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally);
-				drawCall = 0;
+		void setup();
-				processedPrimitives = 0;
+		void teardown();
-				executing = false;
-			}
+		int id;
-			std::atomic<int> drawCall;
+		BatchData::Pool *batchDataPool;
-			std::atomic<int> processedPrimitives;
+		unsigned int numPrimitives;
-			std::atomic<int> executing;
+		unsigned int numPrimitivesPerBatch;
+		unsigned int numBatches;
+		VkPrimitiveTopology topology;
+		VkIndexType indexType;
+		std::shared_ptr<Routine> vertexRoutine;
+		std::shared_ptr<Routine> setupRoutine;
+		std::shared_ptr<Routine> pixelRoutine;
+		VertexProcessor::RoutinePointer vertexPointer;
+		SetupProcessor::RoutinePointer setupPointer;
+		PixelProcessor::RoutinePointer pixelPointer;
+		SetupFunction setupPrimitives;
+		SetupProcessor::State setupState;
+		vk::ImageView *renderTarget[RENDERTARGETS];
+		vk::ImageView *depthBuffer;
+		vk::ImageView *stencilBuffer;
+		TaskEvents *events;
+		vk::Query* occlusionQuery;
+		DrawData *data;
+		static void processPrimitiveVertices(
+				unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
+				const void *primitiveIndices,
+				VkIndexType indexType,
+				unsigned int start,
+				unsigned int triangleCount,
+				VkPrimitiveTopology topology);
+		static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
+		static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
+		static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
 	};
+	class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
+	{
 	public:
 		Renderer(vk::Device* device);
 		virtual ~Renderer();
-		void *operator new(size_t size);
-		void operator delete(void * mem);
 		bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
 		void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
@@ -182,74 +211,18 @@ namespace sw
 		void synchronize();
-		static int getClusterCount() { return clusterCount; }
 	private:
-		static void threadFunction(void *parameters);
-		void threadLoop(int threadIndex);
-		void taskLoop(int threadIndex);
-		void findAvailableTasks();
-		void scheduleTask(int threadIndex);
-		void executeTask(int threadIndex);
-		void finishRendering(Task &pixelTask);
-		void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
-		int setupTriangles(int batch, int count);
-		int setupLines(int batch, int count);
-		int setupPoints(int batch, int count);
-		bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
-		void updateConfiguration(bool initialUpdate = false);
-		void initializeThreads();
-		void terminateThreads();
 		VkViewport viewport;
 		VkRect2D scissor;
-		Triangle *triangleBatch[16];
+		DrawCall::Pool drawCallPool;
-		Primitive *primitiveBatch[16];
+		DrawCall::BatchData::Pool batchDataPool;
-		std::atomic<int> exitThreads;
+		std::atomic<int> nextDrawID = {0};
-		std::atomic<int> threadsAwake;
-		std::thread *worker[16];
-		Event *resume[16];         // Events for resuming threads
-		Event *suspend[16];        // Events for suspending threads
-		Event *resumeApp;          // Event for resuming the application thread
-		PrimitiveProgress primitiveProgress[16];
-		PixelProgress pixelProgress[16];
-		Task task[16];   // Current tasks for threads
-		enum {
-			DRAW_COUNT = 16,   // Number of draw calls buffered (must be power of 2)
-			DRAW_COUNT_BITS = DRAW_COUNT - 1,
-		};
-		DrawCall *drawCall[DRAW_COUNT];
-		DrawCall *drawList[DRAW_COUNT];
-		std::atomic<int> currentDraw;
-		std::atomic<int> nextDraw;
-		enum {
-			TASK_COUNT = 32,   // Size of the task queue (must be power of 2)
-			TASK_COUNT_BITS = TASK_COUNT - 1,
-		};
-		Task taskQueue[TASK_COUNT];
-		std::atomic<int> qHead;
-		std::atomic<int> qSize;
-		static std::atomic<int> unitCount;
-		static std::atomic<int> clusterCount;
-		std::mutex schedulerMutex;
-		VertexTask *vertexTask[16];
 		vk::Query *occlusionQuery;
-		WaitGroup sync;
+		yarn::Ticket::Queue drawTickets;
+		yarn::Ticket::Queue clusterQueues[MaxClusterCount];
 		VertexProcessor::State vertexState;
 		SetupProcessor::State setupState;
@@ -262,40 +235,6 @@ namespace sw
 		vk::Device* device;
 	};
-	struct DrawCall
-	{
-		DrawCall();
-		~DrawCall();
-		std::atomic<int> topology;
-		std::atomic<int> indexType;
-		std::atomic<int> batchSize;
-		std::shared_ptr<Routine> vertexRoutine;
-		std::shared_ptr<Routine> setupRoutine;
-		std::shared_ptr<Routine> pixelRoutine;
-		VertexProcessor::RoutinePointer vertexPointer;
-		SetupProcessor::RoutinePointer setupPointer;
-		PixelProcessor::RoutinePointer pixelPointer;
-		int (Renderer::*setupPrimitives)(int batch, int count);
-		SetupProcessor::State setupState;
-		vk::ImageView *renderTarget[RENDERTARGETS];
-		vk::ImageView *depthBuffer;
-		vk::ImageView *stencilBuffer;
-		TaskEvents *events;
-		vk::Query *occlusionQuery;
-		std::atomic<int> primitive;    // Current primitive to enter pipeline
-		std::atomic<int> count;        // Number of primitives to render
-		std::atomic<int> references;   // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
-		DrawData *data;
-	};
 }
 #endif   // sw_Renderer_hpp