Commit cde4dd96 by Ben Clayton

Device: Migrate Renderer to Yarn

Drop the complex task scheduling logic in favor of yarn. Performance gains of up to ~30% FPS seen. Bug: b/139142453 Change-Id: I264fee36323425a791088565d99dc586670a948a Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35572 Tested-by: Ben Clayton <bclayton@google.com> Kokoro-Presubmit: kokoro <noreply+kokoro@google.com> Reviewed-by: Nicolas Capens <nicolascapens@google.com>
parent b5f0a4be
...@@ -19,11 +19,15 @@ ...@@ -19,11 +19,15 @@
#include "PixelProcessor.hpp" #include "PixelProcessor.hpp"
#include "SetupProcessor.hpp" #include "SetupProcessor.hpp"
#include "Plane.hpp" #include "Plane.hpp"
#include "Primitive.hpp"
#include "Blitter.hpp" #include "Blitter.hpp"
#include "Device/Config.hpp" #include "Device/Config.hpp"
#include "System/Synchronization.hpp"
#include "Vulkan/VkDescriptorSet.hpp" #include "Vulkan/VkDescriptorSet.hpp"
#include "Yarn/Pool.hpp"
#include "Yarn/Finally.hpp"
#include "Yarn/Ticket.hpp"
#include <atomic> #include <atomic>
#include <list> #include <list>
#include <mutex> #include <mutex>
...@@ -46,6 +50,14 @@ namespace sw ...@@ -46,6 +50,14 @@ namespace sw
class Resource; class Resource;
struct Constants; struct Constants;
static constexpr int MaxBatchSize = 128;
static constexpr int MaxBatchCount = 16;
static constexpr int MaxClusterCount = 16;
static constexpr int MaxDrawCount = 16;
using TriangleBatch = std::array<Triangle, MaxBatchSize>;
using PrimitiveBatch = std::array<Primitive, MaxBatchSize>;
struct DrawData struct DrawData
{ {
const Constants *constants; const Constants *constants;
...@@ -64,7 +76,7 @@ namespace sw ...@@ -64,7 +76,7 @@ namespace sw
PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise PixelProcessor::Stencil stencil[2]; // clockwise, counterclockwise
PixelProcessor::Factor factor; PixelProcessor::Factor factor;
unsigned int occlusion[16]; // Number of pixels passing depth test unsigned int occlusion[MaxClusterCount]; // Number of pixels passing depth test
float4 Wx16; float4 Wx16;
float4 Hx16; float4 Hx16;
...@@ -100,71 +112,88 @@ namespace sw ...@@ -100,71 +112,88 @@ namespace sw
PushConstantStorage pushConstants; PushConstantStorage pushConstants;
}; };
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor struct DrawCall
{
struct Task
{ {
enum Type struct BatchData
{ {
PRIMITIVES, using Pool = yarn::BoundedPool<BatchData, MaxBatchCount, yarn::PoolPolicy::Preserve>;
PIXELS,
TriangleBatch triangles;
RESUME, PrimitiveBatch primitives;
SUSPEND VertexTask vertexTask;
unsigned int id;
unsigned int firstPrimitive;
unsigned int numPrimitives;
int numVisible;
yarn::Ticket clusterTickets[MaxClusterCount];
}; };
void operator=(const Task& task) using Pool = yarn::BoundedPool<DrawCall, MaxDrawCount, yarn::PoolPolicy::Preserve>;
{ using SetupFunction = int(*)(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
type = task.type.load();
primitiveUnit = task.primitiveUnit.load();
pixelCluster = task.pixelCluster.load();
}
std::atomic<int> type;
std::atomic<int> primitiveUnit;
std::atomic<int> pixelCluster;
};
struct PrimitiveProgress DrawCall();
{ ~DrawCall();
void init()
{
drawCall = 0;
firstPrimitive = 0;
primitiveCount = 0;
visible = 0;
references = 0;
}
std::atomic<int> drawCall;
std::atomic<int> firstPrimitive;
std::atomic<int> primitiveCount;
std::atomic<int> visible;
std::atomic<int> references;
};
struct PixelProgress static void run(const yarn::Loan<DrawCall>& draw, yarn::Ticket::Queue* tickets, yarn::Ticket::Queue clusterQueues[MaxClusterCount]);
{ static void processVertices(DrawCall* draw, BatchData* batch);
void init() static void processPrimitives(DrawCall* draw, BatchData* batch);
{ static void processPixels(const yarn::Loan<DrawCall>& draw, const yarn::Loan<BatchData>& batch, const std::shared_ptr<yarn::Finally>& finally);
drawCall = 0; void setup();
processedPrimitives = 0; void teardown();
executing = false;
} int id;
std::atomic<int> drawCall; BatchData::Pool *batchDataPool;
std::atomic<int> processedPrimitives; unsigned int numPrimitives;
std::atomic<int> executing; unsigned int numPrimitivesPerBatch;
unsigned int numBatches;
VkPrimitiveTopology topology;
VkIndexType indexType;
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
SetupFunction setupPrimitives;
SetupProcessor::State setupState;
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query* occlusionQuery;
DrawData *data;
static void processPrimitiveVertices(
unsigned int triangleIndicesOut[MaxBatchSize + 1][3],
const void *primitiveIndices,
VkIndexType indexType,
unsigned int start,
unsigned int triangleCount,
VkPrimitiveTopology topology);
static int setupTriangles(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupLines(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static int setupPoints(Triangle *triangles, Primitive *primitives, const DrawCall *drawCall, int count);
static bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
static bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
}; };
class Renderer : public VertexProcessor, public PixelProcessor, public SetupProcessor
{
public: public:
Renderer(vk::Device* device); Renderer(vk::Device* device);
virtual ~Renderer(); virtual ~Renderer();
void *operator new(size_t size);
void operator delete(void * mem);
bool hasOcclusionQuery() const { return occlusionQuery != nullptr; } bool hasOcclusionQuery() const { return occlusionQuery != nullptr; }
void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex, void draw(const sw::Context* context, VkIndexType indexType, unsigned int count, int baseVertex,
...@@ -182,74 +211,18 @@ namespace sw ...@@ -182,74 +211,18 @@ namespace sw
void synchronize(); void synchronize();
static int getClusterCount() { return clusterCount; }
private: private:
static void threadFunction(void *parameters);
void threadLoop(int threadIndex);
void taskLoop(int threadIndex);
void findAvailableTasks();
void scheduleTask(int threadIndex);
void executeTask(int threadIndex);
void finishRendering(Task &pixelTask);
void processPrimitiveVertices(int unit, unsigned int start, unsigned int count, unsigned int loop, int thread);
int setupTriangles(int batch, int count);
int setupLines(int batch, int count);
int setupPoints(int batch, int count);
bool setupLine(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
bool setupPoint(Primitive &primitive, Triangle &triangle, const DrawCall &draw);
void updateConfiguration(bool initialUpdate = false);
void initializeThreads();
void terminateThreads();
VkViewport viewport; VkViewport viewport;
VkRect2D scissor; VkRect2D scissor;
Triangle *triangleBatch[16]; DrawCall::Pool drawCallPool;
Primitive *primitiveBatch[16]; DrawCall::BatchData::Pool batchDataPool;
std::atomic<int> exitThreads; std::atomic<int> nextDrawID = {0};
std::atomic<int> threadsAwake;
std::thread *worker[16];
Event *resume[16]; // Events for resuming threads
Event *suspend[16]; // Events for suspending threads
Event *resumeApp; // Event for resuming the application thread
PrimitiveProgress primitiveProgress[16];
PixelProgress pixelProgress[16];
Task task[16]; // Current tasks for threads
enum {
DRAW_COUNT = 16, // Number of draw calls buffered (must be power of 2)
DRAW_COUNT_BITS = DRAW_COUNT - 1,
};
DrawCall *drawCall[DRAW_COUNT];
DrawCall *drawList[DRAW_COUNT];
std::atomic<int> currentDraw;
std::atomic<int> nextDraw;
enum {
TASK_COUNT = 32, // Size of the task queue (must be power of 2)
TASK_COUNT_BITS = TASK_COUNT - 1,
};
Task taskQueue[TASK_COUNT];
std::atomic<int> qHead;
std::atomic<int> qSize;
static std::atomic<int> unitCount;
static std::atomic<int> clusterCount;
std::mutex schedulerMutex;
VertexTask *vertexTask[16];
vk::Query *occlusionQuery; vk::Query *occlusionQuery;
WaitGroup sync; yarn::Ticket::Queue drawTickets;
yarn::Ticket::Queue clusterQueues[MaxClusterCount];
VertexProcessor::State vertexState; VertexProcessor::State vertexState;
SetupProcessor::State setupState; SetupProcessor::State setupState;
...@@ -262,40 +235,6 @@ namespace sw ...@@ -262,40 +235,6 @@ namespace sw
vk::Device* device; vk::Device* device;
}; };
struct DrawCall
{
DrawCall();
~DrawCall();
std::atomic<int> topology;
std::atomic<int> indexType;
std::atomic<int> batchSize;
std::shared_ptr<Routine> vertexRoutine;
std::shared_ptr<Routine> setupRoutine;
std::shared_ptr<Routine> pixelRoutine;
VertexProcessor::RoutinePointer vertexPointer;
SetupProcessor::RoutinePointer setupPointer;
PixelProcessor::RoutinePointer pixelPointer;
int (Renderer::*setupPrimitives)(int batch, int count);
SetupProcessor::State setupState;
vk::ImageView *renderTarget[RENDERTARGETS];
vk::ImageView *depthBuffer;
vk::ImageView *stencilBuffer;
TaskEvents *events;
vk::Query *occlusionQuery;
std::atomic<int> primitive; // Current primitive to enter pipeline
std::atomic<int> count; // Number of primitives to render
std::atomic<int> references; // Remaining references to this draw call, 0 when done drawing, -1 when resources unlocked and slot is free
DrawData *data;
};
} }
#endif // sw_Renderer_hpp #endif // sw_Renderer_hpp
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment