SpirvRoutine: Add activeLaneMask to conditionally load / store per lane.

Use this to disable reads and writes on compute shader lanes that are not part of the subgroup.

Bug: b/126871859
Bug: b/128527271
Change-Id: Idd7ad240a8f09e6e47db34b6ed5b0ec7ba959d39
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/27009
Presubmit-Ready: Ben Clayton <bclayton@google.com>
Kokoro-Presubmit: kokoro <noreply+kokoro@google.com>
Tested-by: Ben Clayton <bclayton@google.com>
Reviewed-by: Chris Forbes <chrisforbes@google.com>

SpirvRoutine: Add activeLaneMask to conditionally load / store per lane.
49d8158b · Ben Clayton · 35e90e22 · 49d8158b · 49d8158b · 49d8158b
Commit 49d8158b authored Mar 12, 2019 by Ben Clayton
4 changed files
--- a/src/Pipeline/ComputeProgram.cpp
+++ b/src/Pipeline/ComputeProgram.cpp
@@ -80,6 +80,9 @@ namespace sw
 		{
 			Int4 localInvocationIndex = Int4(invocationIndex) + Int4(0, 1, 2, 3);
+			// Disable lanes where (localInvocationIndex >= numInvocations)
+			routine.activeLaneMask = CmpLT(localInvocationIndex, Int4(numInvocations));
 			Int4 localInvocationID[3];
 			{
 				Int4 idx = localInvocationIndex;
@@ -113,13 +116,10 @@ namespace sw
 						Int4(Extract(workgroupSize, component)) +
 						localInvocationID[component];
 					value[builtin.FirstComponent + component] = As<Float4>(globalInvocationID);
-					// RR_WATCH(component, globalInvocationID);
+					// RR_WATCH(component, globalInvocationID, routine.activeLaneMask);
 				}
 			});
-			// TODO(bclayton): Disable lanes where (invocationIDs >= numInvocations)
-			// Int4 enabledLanes = invocationIDs < Int4(numInvocations);
 			// Process numLanes of the workgroup.
 			shader->emit(&routine);
 		}

--- a/src/Pipeline/SpirvShader.cpp
+++ b/src/Pipeline/SpirvShader.cpp
@@ -1269,43 +1269,57 @@ namespace sw
 		}
 		bool interleavedByLane = IsStorageInterleavedByLane(pointerBaseTy.storageClass);
+		auto anyInactiveLanes = SignMask(~routine->activeLaneMask) != 0;
-		auto &dst = routine->createIntermediate(objectId, objectTy.sizeInComponents);
+		auto load = SpirvRoutine::Value(objectTy.sizeInComponents);
-		if (pointer.kind == Object::Kind::Value)
+		If(pointer.kind == Object::Kind::Value || anyInactiveLanes)
 		{
-			// Divergent offsets.
+			// Divergent offsets or masked lanes.
-			auto offsets = routine->getIntermediate(pointerId).Int(0);
+			auto offsets = pointer.kind == Object::Kind::Value ?
+					As<SIMD::Int>(routine->getIntermediate(pointerId).Int(0)) :
+					RValue<SIMD::Int>(SIMD::Int(0));
 			for (auto i = 0u; i < objectTy.sizeInComponents; i++)
 			{
 				// i wish i had a Float,Float,Float,Float constructor here..
-				SIMD::Float v;
 				for (int j = 0; j < SIMD::Width; j++)
 				{
-					Int offset = Int(i) + Extract(offsets, j);
+					If(Extract(routine->activeLaneMask, j) != 0)
-					if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+					{
-					v = Insert(v, ptrBase[offset], j);
+						Int offset = Int(i) + Extract(offsets, j);
+						if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+						load[i] = Insert(load[i], ptrBase[offset], j);
+					}
 				}
-				dst.emplace(i, v);
 			}
 		}
-		else if (interleavedByLane)
+		Else
 		{
-			// Lane-interleaved data. No divergent offsets.
+			// No divergent offsets or masked lanes.
-			Pointer<SIMD::Float> src = ptrBase;
+			if (interleavedByLane)
-			for (auto i = 0u; i < objectTy.sizeInComponents; i++)
 			{
-				dst.emplace(i, src[i]);
+				// Lane-interleaved data.
+				Pointer<SIMD::Float> src = ptrBase;
+				for (auto i = 0u; i < objectTy.sizeInComponents; i++)
+				{
+					load[i] = src[i];
+				}
 			}
-		}
+			else
-		else
-		{
-			// Non-interleaved data. No divergent offsets.
-			for (auto i = 0u; i < objectTy.sizeInComponents; i++)
 			{
-				dst.emplace(i, RValue<SIMD::Float>(ptrBase[i]));
+				// Non-interleaved data.
+				for (auto i = 0u; i < objectTy.sizeInComponents; i++)
+				{
+					load[i] = RValue<SIMD::Float>(ptrBase[i]);
+				}
 			}
 		}
+		auto &dst = routine->createIntermediate(objectId, objectTy.sizeInComponents);
+		for (auto i = 0u; i < objectTy.sizeInComponents; i++)
+		{
+			dst.emplace(i, load[i]);
+		}
 	}
 	void SpirvShader::EmitAccessChain(InsnIterator insn, SpirvRoutine *routine) const
@@ -1348,28 +1362,35 @@ namespace sw
 		}
 		bool interleavedByLane = IsStorageInterleavedByLane(pointerBaseTy.storageClass);
+		auto anyInactiveLanes = SignMask(~routine->activeLaneMask) != 0;
 		if (object.kind == Object::Kind::Constant)
 		{
+			// Constant source data.
 			auto src = reinterpret_cast<float *>(object.constantValue.get());
+			If(pointer.kind == Object::Kind::Value || anyInactiveLanes)
-			if (pointer.kind == Object::Kind::Value)
 			{
-				// Constant source data. Divergent offsets.
+				// Divergent offsets or masked lanes.
-				auto offsets = routine->getIntermediate(pointerId).Int(0);
+				auto offsets = pointer.kind == Object::Kind::Value ?
+						As<SIMD::Int>(routine->getIntermediate(pointerId).Int(0)) :
+						RValue<SIMD::Int>(SIMD::Int(0));
 				for (auto i = 0u; i < elementTy.sizeInComponents; i++)
 				{
 					for (int j = 0; j < SIMD::Width; j++)
 					{
-						Int offset = Int(i) + Extract(offsets, j);
+						If(Extract(routine->activeLaneMask, j) != 0)
-						if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+						{
-						ptrBase[offset] = RValue<Float>(src[i]);
+							Int offset = Int(i) + Extract(offsets, j);
+							if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+							ptrBase[offset] = RValue<Float>(src[i]);
+						}
 					}
 				}
 			}
-			else
+			Else
 			{
-				// Constant source data. No divergent offsets.
+				// Constant source data.
+				// No divergent offsets or masked lanes.
 				Pointer<SIMD::Float> dst = ptrBase;
 				for (auto i = 0u; i < elementTy.sizeInComponents; i++)
 				{
@@ -1379,38 +1400,47 @@ namespace sw
 		}
 		else
 		{
+			// Intermediate source data.
 			auto &src = routine->getIntermediate(objectId);
+			If(pointer.kind == Object::Kind::Value || anyInactiveLanes)
-			if (pointer.kind == Object::Kind::Value)
 			{
-				// Intermediate source data. Divergent offsets.
+				// Divergent offsets or masked lanes.
-				auto offsets = routine->getIntermediate(pointerId).Int(0);
+				auto offsets = pointer.kind == Object::Kind::Value ?
+						As<SIMD::Int>(routine->getIntermediate(pointerId).Int(0)) :
+						RValue<SIMD::Int>(SIMD::Int(0));
 				for (auto i = 0u; i < elementTy.sizeInComponents; i++)
 				{
 					for (int j = 0; j < SIMD::Width; j++)
 					{
-						Int offset = Int(i) + Extract(offsets, j);
+						If(Extract(routine->activeLaneMask, j) != 0)
-						if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+						{
-						ptrBase[offset] = Extract(src.Float(i), j);
+							Int offset = Int(i) + Extract(offsets, j);
+							if (interleavedByLane) { offset = offset * SIMD::Width + j; }
+							ptrBase[offset] = Extract(src.Float(i), j);
+						}
 					}
 				}
 			}
-			else if (interleavedByLane)
+			Else
 			{
-				// Intermediate source data. Lane-interleaved data. No divergent offsets.
+				// No divergent offsets or masked lanes.
-				Pointer<SIMD::Float> dst = ptrBase;
+				if (interleavedByLane)
-				for (auto i = 0u; i < elementTy.sizeInComponents; i++)
 				{
-					dst[i] = src.Float(i);
+					// Lane-interleaved data.
+					Pointer<SIMD::Float> dst = ptrBase;
+					for (auto i = 0u; i < elementTy.sizeInComponents; i++)
+					{
+						dst[i] = src.Float(i);
+					}
 				}
-			}
+				else
-			else
-			{
-				// Intermediate source data. Non-interleaved data. No divergent offsets.
-				Pointer<SIMD::Float> dst = ptrBase;
-				for (auto i = 0u; i < elementTy.sizeInComponents; i++)
 				{
-					dst[i] = SIMD::Float(src.Float(i));
+					// Non-interleaved data.
+					Pointer<SIMD::Float> dst = ptrBase;
+					for (auto i = 0u; i < elementTy.sizeInComponents; i++)
+					{
+						dst[i] = SIMD::Float(src.Float(i));
+					}
 				}
 			}
 		}

--- a/src/Pipeline/SpirvShader.hpp
+++ b/src/Pipeline/SpirvShader.hpp
@@ -23,6 +23,7 @@
 #include <array>
 #include <cstring>
+#include <functional>
 #include <string>
 #include <vector>
 #include <unordered_map>
@@ -518,6 +519,8 @@ namespace sw
 		Value inputs = Value{MAX_INTERFACE_COMPONENTS};
 		Value outputs = Value{MAX_INTERFACE_COMPONENTS};
+		SIMD::Int activeLaneMask = SIMD::Int(0xFFFFFFFF);
 		std::array<Pointer<Byte>, vk::MAX_BOUND_DESCRIPTOR_SETS> descriptorSets;
 		void createLvalue(SpirvShader::Object::ID id, uint32_t size)

--- a/tests/VulkanUnitTests/unittests.cpp
+++ b/tests/VulkanUnitTests/unittests.cpp
@@ -405,6 +405,65 @@ TEST_P(SwiftShaderVulkanBufferToBufferComputeTest, Memcpy)
    test(src.str(), [](uint32_t i) { return i; }, [](uint32_t i) { return i; });
 }
+TEST_P(SwiftShaderVulkanBufferToBufferComputeTest, GlobalInvocationId)
+{
+    std::stringstream src;
+    src <<
+              "OpCapability Shader\n"
+              "OpMemoryModel Logical GLSL450\n"
+              "OpEntryPoint GLCompute %1 \"main\" %2\n"
+              "OpExecutionMode %1 LocalSize " <<
+                GetParam().localSizeX << " " <<
+                GetParam().localSizeY << " " <<
+                GetParam().localSizeZ << "\n" <<
+              "OpDecorate %3 ArrayStride 4\n"
+              "OpMemberDecorate %4 0 Offset 0\n"
+              "OpDecorate %4 BufferBlock\n"
+              "OpDecorate %5 DescriptorSet 0\n"
+              "OpDecorate %5 Binding 1\n"
+              "OpDecorate %2 BuiltIn GlobalInvocationId\n"
+              "OpDecorate %6 DescriptorSet 0\n"
+              "OpDecorate %6 Binding 0\n"
+         "%7 = OpTypeVoid\n"
+         "%8 = OpTypeFunction %7\n"             // void()
+         "%9 = OpTypeInt 32 1\n"                // int32
+        "%10 = OpTypeInt 32 0\n"                // uint32
+         "%3 = OpTypeRuntimeArray %9\n"         // int32[]
+         "%4 = OpTypeStruct %3\n"               // struct{ int32[] }
+        "%11 = OpTypePointer Uniform %4\n"      // struct{ int32[] }*
+         "%5 = OpVariable %11 Uniform\n"        // struct{ int32[] }* out (binding 1, target of the OpStore below)
+        "%12 = OpConstant %9 0\n"               // int32(0)
+        "%13 = OpConstant %9 1\n"               // int32(1)
+        "%14 = OpConstant %10 0\n"              // uint32(0)
+        "%15 = OpConstant %10 1\n"              // uint32(1)
+        "%16 = OpConstant %10 2\n"              // uint32(2)
+        "%17 = OpTypeVector %10 3\n"            // vec3<uint32>
+        "%18 = OpTypePointer Input %17\n"       // vec3<uint32>*
+         "%2 = OpVariable %18 Input\n"          // gl_GlobalInvocationId
+        "%19 = OpTypePointer Input %10\n"       // uint32*
+         "%6 = OpVariable %11 Uniform\n"        // struct{ int32[] }* in (binding 0, source of the OpLoad below)
+        "%20 = OpTypePointer Uniform %9\n"      // int32*
+         "%1 = OpFunction %7 None %8\n"         // -- Function begin --
+        "%21 = OpLabel\n"
+        "%22 = OpAccessChain %19 %2 %14\n"      // &gl_GlobalInvocationId.x
+        "%23 = OpAccessChain %19 %2 %15\n"      // &gl_GlobalInvocationId.y
+        "%24 = OpAccessChain %19 %2 %16\n"      // &gl_GlobalInvocationId.z
+        "%25 = OpLoad %10 %22\n"                // gl_GlobalInvocationId.x
+        "%26 = OpLoad %10 %23\n"                // gl_GlobalInvocationId.y
+        "%27 = OpLoad %10 %24\n"                // gl_GlobalInvocationId.z
+        "%28 = OpAccessChain %20 %6 %12 %25\n"  // &in.arr[gl_GlobalInvocationId.x]
+        "%29 = OpLoad %9 %28\n"                 // in.arr[gl_GlobalInvocationId.x]
+        "%30 = OpIAdd %9 %29 %26\n"             // in.arr[gl_GlobalInvocationId.x] + gl_GlobalInvocationId.y
+        "%31 = OpIAdd %9 %30 %27\n"             // in.arr[gl_GlobalInvocationId.x] + gl_GlobalInvocationId.y + gl_GlobalInvocationId.z
+        "%32 = OpAccessChain %20 %5 %12 %25\n"  // &out.arr[gl_GlobalInvocationId.x]
+              "OpStore %32 %31\n"               // out.arr[gl_GlobalInvocationId.x] = in.arr[gl_GlobalInvocationId.x] + gl_GlobalInvocationId.y + gl_GlobalInvocationId.z
+              "OpReturn\n"
+              "OpFunctionEnd\n";
+    // gl_GlobalInvocationId.y and gl_GlobalInvocationId.z should both be zero, so the value stored equals the value loaded.
+    test(src.str(), [](uint32_t i) { return i; }, [](uint32_t i) { return i; });
+}
 TEST_P(SwiftShaderVulkanBufferToBufferComputeTest, BranchSimple)
 {
    std::stringstream src;