Commit f7b7b706 by Ben Clayton

Pipeline: Use Yarn to make compute multi-threaded.

Bug: b/139142453
Change-Id: I466b7c935db03104cb4df90735fafe10905bef9e
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/35568
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>
Reviewed-by: Nicolas Capens <nicolascapens@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
parent d6c61361
@@ -18,6 +18,10 @@
#include "Vulkan/VkDebug.hpp" #include "Vulkan/VkDebug.hpp"
#include "Vulkan/VkPipelineLayout.hpp" #include "Vulkan/VkPipelineLayout.hpp"
#include "Yarn/Defer.hpp"
#include "Yarn/Trace.hpp"
#include "Yarn/WaitGroup.hpp"
#include <queue> #include <queue>
namespace namespace
@@ -40,6 +44,8 @@ namespace sw
void ComputeProgram::generate() void ComputeProgram::generate()
{ {
YARN_SCOPED_EVENT("ComputeProgram::generate");
SpirvRoutine routine(pipelineLayout); SpirvRoutine routine(pipelineLayout);
shader->emitProlog(&routine); shader->emitProlog(&routine);
emit(&routine); emit(&routine);
@@ -199,11 +205,6 @@ namespace sw
auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ; auto invocationsPerWorkgroup = modes.WorkgroupSizeX * modes.WorkgroupSizeY * modes.WorkgroupSizeZ;
auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup; auto subgroupsPerWorkgroup = (invocationsPerWorkgroup + invocationsPerSubgroup - 1) / invocationsPerSubgroup;
// We're sharing a buffer here across all workgroups.
// We can only do this because we know a single workgroup is in flight
// at any time.
std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
Data data; Data data;
data.descriptorSets = descriptorSets; data.descriptorSets = descriptorSets;
data.descriptorDynamicOffsets = descriptorDynamicOffsets; data.descriptorDynamicOffsets = descriptorDynamicOffsets;
@@ -221,14 +222,33 @@ namespace sw
data.pushConstants = pushConstants; data.pushConstants = pushConstants;
data.constants = &sw::constants; data.constants = &sw::constants;
for (uint32_t groupZ = baseGroupZ; groupZ < baseGroupZ + groupCountZ; groupZ++) yarn::WaitGroup wg;
const uint32_t batchCount = 16;
auto groupCount = groupCountX * groupCountY * groupCountZ;
for (uint32_t batchID = 0; batchID < batchCount && batchID < groupCount; batchID++)
{ {
for (uint32_t groupY = baseGroupY; groupY < baseGroupY + groupCountY; groupY++) wg.add(1);
yarn::schedule([=, &data]
{ {
for (uint32_t groupX = baseGroupX; groupX < baseGroupX + groupCountX; groupX++) defer(wg.done());
std::vector<uint8_t> workgroupMemory(shader->workgroupMemory.size());
for (uint32_t groupIndex = batchID; groupIndex < groupCount; groupIndex += batchCount)
{ {
auto modulo = groupIndex;
auto groupOffsetZ = modulo / (groupCountX * groupCountY);
modulo -= groupOffsetZ * (groupCountX * groupCountY);
auto groupOffsetY = modulo / groupCountX;
modulo -= groupOffsetY * groupCountX;
auto groupOffsetX = modulo;
auto groupZ = baseGroupZ + groupOffsetZ;
auto groupY = baseGroupY + groupOffsetY;
auto groupX = baseGroupX + groupOffsetX;
YARN_SCOPED_EVENT("groupX: %d, groupY: %d, groupZ: %d", groupX, groupY, groupZ);
// TODO(bclayton): Split work across threads.
using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>; using Coroutine = std::unique_ptr<rr::Stream<SpirvShader::YieldResult>>;
std::queue<Coroutine> coroutines; std::queue<Coroutine> coroutines;
@@ -261,10 +281,11 @@ namespace sw
coroutines.push(std::move(coroutine)); coroutines.push(std::move(coroutine));
} }
} }
}
});
}
} // groupX wg.wait();
} // groupY
} // groupZ
} }
} // namespace sw } // namespace sw
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment