Commit c55aefe3 by Bryan Bernhart Committed by Commit Bot

ES31: Support atomic functions on D3D11 - Part III

This patch is the third one to support atomic functions on D3D11. In this patch we enable support for atomic function returns outside of assignments (e.g. as part of arithmetic operations or to index into arrays) and when used to directly initialize a variable. Note that we are still missing the functionality to tag loops with [allow_uav_condition] as required by InterlockedCompareExchange. BUG=angleproject:2682 TEST=angle_end2end_tests Change-Id: Ia409ebb10621fd5c514cf6c76f366a320a9d9fc1 Reviewed-on: https://chromium-review.googlesource.com/1208317 Reviewed-by: Bryan Bernhart <bryan.bernhart@intel.com> Reviewed-by: Corentin Wallez <cwallez@chromium.org> Commit-Queue: Corentin Wallez <cwallez@chromium.org>
parent 799da6d1
......@@ -129,7 +129,7 @@ void TranslatorHLSL::translate(TIntermBlock *root,
if (getShaderVersion() >= 310)
{
sh::RewriteAtomicFunctionExpressions(root, &getSymbolTable());
sh::RewriteAtomicFunctionExpressions(root, &getSymbolTable(), getShaderVersion());
}
sh::OutputHLSL outputHLSL(getShaderType(), getShaderVersion(), getExtensionBehavior(),
......
......@@ -18,45 +18,96 @@ namespace
{
// Traverser that simplifies all the atomic function expressions into the ones that can be directly
// translated into HLSL.
//
// case 1 (only for atomicExchange and atomicCompSwap):
// original:
// atomicExchange(counter, newValue);
// new:
// tempValue = atomicExchange(counter, newValue);
//
// case 2 (atomic function, temporary variable required):
// original:
// value = atomicAdd(counter, 1) * otherValue;
// someArray[atomicAdd(counter, 1)] = someOtherValue;
// new:
// value = ((tempValue = atomicAdd(counter, 1)), tempValue) * otherValue;
// someArray[((tempValue = atomicAdd(counter, 1)), tempValue)] = someOtherValue;
//
// case 3 (atomic function used to directly initialize a variable):
// original:
// int value = atomicAdd(counter, 1);
// new:
// tempValue = atomicAdd(counter, 1);
// int value = tempValue;
//
// Traverser that rewrites atomic built-in function calls into forms that OutputHLSL can
// translate directly to Interlocked* intrinsics (see the case list above).
// NOTE(review): this span is rendered from a diff; it shows BOTH the removed one-argument
// constructor/method names and the added two-argument ones on adjacent lines.
class RewriteAtomicFunctionExpressionsTraverser : public TIntermTraverser
{
public:
// Old (pre-change) constructor, followed by the new one that also records the shader
// version (needed when building comma expressions).
RewriteAtomicFunctionExpressionsTraverser(TSymbolTable *symbolTable);
RewriteAtomicFunctionExpressionsTraverser(TSymbolTable *symbolTable, int shaderVersion);
bool visitAggregate(Visit visit, TIntermAggregate *node) override;
bool visitBlock(Visit visit, TIntermBlock *node) override;
private:
// True when |node| is an atomicExchange/atomicCompSwap call whose return value is unused
// (i.e. the call is a direct child of a block).
static bool IsAtomicExchangeOrCompSwapNoReturnValue(TIntermAggregate *node,
TIntermNode *parentNode);
// True when |node| is an atomic function call embedded inside a larger expression.
static bool IsAtomicFunctionInsideExpression(TIntermAggregate *node, TIntermNode *parentNode);
// New rewrite entry point, followed by the removed old name (diff residue).
void rewriteAtomicFunctionCallNode(TIntermAggregate *oldAtomicFunctionNode);
void separateAtomicFunctionCallNode(TIntermAggregate *oldAtomicFunctionNode);
// Declares a temporary of |type| and records its declaration in mTempVariables.
const TVariable *getTempVariable(const TType *type);
int mShaderVersion;
// Declarations of the temporaries created so far; hoisted into the function body by
// visitBlock.
TIntermSequence mTempVariables;
};
// Constructor. NOTE(review): diff residue — the removed one-argument definition (pre-visit
// traversal) is interleaved with the added two-argument definition, which switches to
// post-order traversal (third flag true) and stores the shader version for later use by
// TIntermBinary::CreateComma.
RewriteAtomicFunctionExpressionsTraverser::RewriteAtomicFunctionExpressionsTraverser(
TSymbolTable *symbolTable)
: TIntermTraverser(true, false, false, symbolTable)
TSymbolTable *symbolTable,
int shaderVersion)
: TIntermTraverser(false, false, true, symbolTable), mShaderVersion(shaderVersion)
{
}
// Rewrites one atomic function call so its return value flows through a temporary variable.
// NOTE(review): diff residue — old (separateAtomicFunctionCallNode) and new
// (rewriteAtomicFunctionCallNode) lines are interleaved in this span.
void RewriteAtomicFunctionExpressionsTraverser::separateAtomicFunctionCallNode(
void RewriteAtomicFunctionExpressionsTraverser::rewriteAtomicFunctionCallNode(
TIntermAggregate *oldAtomicFunctionNode)
{
ASSERT(oldAtomicFunctionNode);
TIntermSequence insertions;
// Declare a temporary variable
TIntermDeclaration *returnVariableDeclaration;
TVariable *returnVariable = DeclareTempVariable(mSymbolTable, &oldAtomicFunctionNode->getType(),
EvqTemporary, &returnVariableDeclaration);
insertions.push_back(returnVariableDeclaration);
// New version: temporaries are created via getTempVariable so their declarations can be
// hoisted to the top of the function body in visitBlock.
const TVariable *returnVariable = getTempVariable(&oldAtomicFunctionNode->getType());
// Use this variable as the return value of the atomic function call.
TIntermBinary *atomicFunctionAssignment = new TIntermBinary(
TIntermBinary *rewrittenNode = new TIntermBinary(
TOperator::EOpAssign, CreateTempSymbolNode(returnVariable), oldAtomicFunctionNode);
insertStatementsInParentBlock(insertions);
queueReplacement(atomicFunctionAssignment, OriginalNode::IS_DROPPED);
auto *parentNode = getParentNode();
auto *parentBinary = parentNode->getAsBinaryNode();
// Case 3: the call directly initializes a variable — hoist the assignment to a statement
// before the declaration and replace the initializer with the temporary symbol.
if (parentBinary && parentBinary->getOp() == EOpInitialize)
{
insertStatementInParentBlock(rewrittenNode);
queueReplacement(CreateTempSymbolNode(returnVariable), OriginalNode::IS_DROPPED);
}
else
{
// Since every atomic function assignment becomes the last argument of an Interlocked*
// intrinsic, when the return value is needed inside an expression the assignment is
// wrapped with the comma operator and the temporary variable (case 2).
if (!parentNode->getAsBlock())
{
rewrittenNode = TIntermBinary::CreateComma(
rewrittenNode, new TIntermSymbol(returnVariable), mShaderVersion);
}
queueReplacement(rewrittenNode, OriginalNode::IS_DROPPED);
}
}
// Creates a fresh temporary variable of |type| and records its declaration so that
// visitBlock can later hoist it to the top of the enclosing function body.
const TVariable *RewriteAtomicFunctionExpressionsTraverser::getTempVariable(const TType *type)
{
    TIntermDeclaration *tempDeclaration = nullptr;
    TVariable *tempVariable =
        DeclareTempVariable(mSymbolTable, type, EvqTemporary, &tempDeclaration);
    mTempVariables.push_back(tempDeclaration);
    return tempVariable;
}
bool RewriteAtomicFunctionExpressionsTraverser::IsAtomicExchangeOrCompSwapNoReturnValue(
......@@ -68,21 +119,57 @@ bool RewriteAtomicFunctionExpressionsTraverser::IsAtomicExchangeOrCompSwapNoRetu
parentNode && parentNode->getAsBlock();
}
// Returns true when |node| is an atomic function call embedded inside a larger expression
// (and therefore needs rewriting). NOTE(review): diff residue — the first line below is the
// removed old visitAggregate signature, and two lines of the old body are interleaved.
bool RewriteAtomicFunctionExpressionsTraverser::visitAggregate(Visit visit, TIntermAggregate *node)
bool RewriteAtomicFunctionExpressionsTraverser::IsAtomicFunctionInsideExpression(
TIntermAggregate *node,
TIntermNode *parentNode)
{
if (IsAtomicExchangeOrCompSwapNoReturnValue(node, getParentNode()))
ASSERT(node);
// We only need to handle atomic functions whose parent is not a block node. If the
// parent node is a block, the atomic function is not inside an expression.
if (!IsAtomicFunction(node->getOp()) || parentNode->getAsBlock())
{
separateAtomicFunctionCallNode(node);
return false;
}
auto *parentAsBinary = parentNode->getAsBinaryNode();
// Plain assignments (value = atomicAdd(...)) are handled directly in OutputHLSL.
return !parentAsBinary || parentAsBinary->getOp() != EOpAssign;
}
// Post-visit hook for call nodes: rewrites atomic built-in calls whose return value is
// either unused (atomicExchange/atomicCompSwap as a statement) or consumed inside a
// larger expression.
bool RewriteAtomicFunctionExpressionsTraverser::visitAggregate(Visit visit, TIntermAggregate *node)
{
    ASSERT(visit == PostVisit);

    TIntermNode *parent = getParentNode();
    const bool needsRewrite = IsAtomicExchangeOrCompSwapNoReturnValue(node, parent) ||
                              IsAtomicFunctionInsideExpression(node, parent);
    if (needsRewrite)
    {
        rewriteAtomicFunctionCallNode(node);
    }
    return true;
}
// Post-visit hook for blocks: when leaving a function body, hoists all pending temporary
// variable declarations to the top of that body so every rewritten expression can refer
// to an already-declared temporary.
bool RewriteAtomicFunctionExpressionsTraverser::visitBlock(Visit visit, TIntermBlock *node)
{
    ASSERT(visit == PostVisit);

    if (mTempVariables.empty())
    {
        return true;
    }
    if (getParentNode()->getAsFunctionDefinition())
    {
        insertStatementsInBlockAtPosition(node, 0, mTempVariables, TIntermSequence());
        mTempVariables.clear();
    }
    return true;
}
} // anonymous namespace
// Public entry point: runs the traverser over |root| and applies the queued replacements.
// NOTE(review): diff residue — the removed two-argument signature and constructor call are
// interleaved with the added three-argument versions that thread |shaderVersion| through.
void RewriteAtomicFunctionExpressions(TIntermNode *root, TSymbolTable *symbolTable)
void RewriteAtomicFunctionExpressions(TIntermNode *root,
TSymbolTable *symbolTable,
int shaderVersion)
{
RewriteAtomicFunctionExpressionsTraverser traverser(symbolTable);
RewriteAtomicFunctionExpressionsTraverser traverser(symbolTable, shaderVersion);
traverser.traverse(root);
traverser.updateTree();
}
......
......@@ -30,7 +30,9 @@ namespace sh
class TIntermNode;
class TSymbolTable;
void RewriteAtomicFunctionExpressions(TIntermNode *root, TSymbolTable *symbolTable);
void RewriteAtomicFunctionExpressions(TIntermNode *root,
TSymbolTable *symbolTable,
int shaderVersion);
} // namespace sh
#endif // COMPILER_TRANSLATOR_TREEOPS_REWRITE_ATOMIC_FUNCTION_EXPRESSIONS_H_
\ No newline at end of file
......@@ -282,6 +282,18 @@ void TIntermTraverser::insertStatementInParentBlock(TIntermNode *statement)
insertStatementsInParentBlock(insertions);
}
// Queues |insertionsBefore| and |insertionsAfter| to be spliced into |parent| around the
// statement at |position| when updateTree() is called. Two insertions at the same position
// in the same block are not supported.
void TIntermTraverser::insertStatementsInBlockAtPosition(TIntermBlock *parent,
                                                         size_t position,
                                                         const TIntermSequence &insertionsBefore,
                                                         const TIntermSequence &insertionsAfter)
{
    ASSERT(parent);
    // |position| is unsigned, so a ">= 0" check would be tautological (and draws
    // -Wtautological-unsigned-zero-compare); only the upper bound needs validating.
    ASSERT(position < parent->getChildCount());
    mInsertions.emplace_back(parent, position, insertionsBefore, insertionsAfter);
}
void TLValueTrackingTraverser::setInFunctionCallOutParameter(bool inOutParameter)
{
mInFunctionCallOutParameter = inOutParameter;
......
......@@ -180,6 +180,14 @@ class TIntermTraverser : angle::NonCopyable
// Helper to insert a single statement.
void insertStatementInParentBlock(TIntermNode *statement);
// Explicitly specify where to insert statements. The statements are inserted before and after
// the specified position. The statements will be inserted once updateTree is called. Note that
// two insertions to the same position in the same block are not supported.
void insertStatementsInBlockAtPosition(TIntermBlock *parent,
size_t position,
const TIntermSequence &insertionsBefore,
const TIntermSequence &insertionsAfter);
enum class OriginalNode
{
BECOMES_CHILD,
......
......@@ -1681,6 +1681,158 @@ TEST_P(ComputeShaderTest, AtomicFunctionsInNonInitializerSingleAssignment)
runSharedMemoryTest<GLint, 9, 1>(kCSShader, GL_R32I, GL_INT, inputData, expectedValues);
}
// Verify that using atomic functions in initializers with unsigned int works correctly.
TEST_P(ComputeShaderTest, AtomicFunctionsInitializerWithUnsigned)
{
// Every atomic built-in's return value directly initializes a local ("case 3" of the
// D3D11 rewrite). The last statement deliberately shadows |sharedVariable| with a local
// initialized from atomicAdd on the shared variable of the same name.
constexpr char kCShader[] =
R"(#version 310 es
layout (local_size_x = 9, local_size_y = 1, local_size_z = 1) in;
layout (r32ui, binding = 0) readonly uniform highp uimage2D srcImage;
layout (r32ui, binding = 1) writeonly uniform highp uimage2D dstImage;
shared highp uint sharedVariable;
shared highp uint inputData[9];
shared highp uint outputData[9];
void main()
{
uint inputValue = imageLoad(srcImage, ivec2(gl_LocalInvocationID.xy)).x;
inputData[gl_LocalInvocationID.x] = inputValue;
memoryBarrierShared();
barrier();
if (gl_LocalInvocationID.x == 0u)
{
sharedVariable = 0u;
uint addValue = atomicAdd(sharedVariable, inputData[0]);
outputData[0] = addValue;
uint minValue = atomicMin(sharedVariable, inputData[1]);
outputData[1] = minValue;
uint maxValue = atomicMax(sharedVariable, inputData[2]);
outputData[2] = maxValue;
uint andValue = atomicAnd(sharedVariable, inputData[3]);
outputData[3] = andValue;
uint orValue = atomicOr(sharedVariable, inputData[4]);
outputData[4] = orValue;
uint xorValue = atomicXor(sharedVariable, inputData[5]);
outputData[5] = xorValue;
uint exchangeValue = atomicExchange(sharedVariable, inputData[6]);
outputData[6] = exchangeValue;
uint compSwapValue = atomicCompSwap(sharedVariable, 64u, inputData[7]);
outputData[7] = compSwapValue;
uint sharedVariable = atomicAdd(sharedVariable, inputData[8]);
outputData[8] = sharedVariable;
}
memoryBarrierShared();
barrier();
imageStore(dstImage, ivec2(gl_LocalInvocationID.xy),
uvec4(outputData[gl_LocalInvocationID.x]));
})";
// Each atomic op returns the PREVIOUS value of sharedVariable; tracing the sequence with
// these inputs: 0 (add), 1 (min), 1 (max), 4 (and), 0 (or), 16 (xor), 48 (exchange),
// 64 (compSwap succeeds since shared == 64u), 128 (final add).
constexpr std::array<GLuint, 9> kInputData = {{1, 2, 4, 8, 16, 32, 64, 128, 1}};
constexpr std::array<GLuint, 9> kExpectedValues = {{0, 1, 1, 4, 0, 16, 48, 64, 128}};
runSharedMemoryTest<GLuint, 9, 1>(kCShader, GL_R32UI, GL_UNSIGNED_INT, kInputData,
kExpectedValues);
}
// Verify using atomic functions inside expressions as unsigned int.
TEST_P(ComputeShaderTest, AtomicFunctionsReturnWithUnsigned)
{
// Every atomic built-in's return value is consumed inside an arithmetic expression
// ("case 2" of the D3D11 rewrite: the call must be wrapped in a comma expression).
constexpr char kCShader[] =
R"(#version 310 es
layout (local_size_x = 9, local_size_y = 1, local_size_z = 1) in;
layout (r32ui, binding = 0) readonly uniform highp uimage2D srcImage;
layout (r32ui, binding = 1) writeonly uniform highp uimage2D dstImage;
shared highp uint sharedVariable;
shared highp uint inputData[9];
shared highp uint outputData[9];
void main()
{
uint inputValue = imageLoad(srcImage, ivec2(gl_LocalInvocationID.xy)).x;
inputData[gl_LocalInvocationID.x] = inputValue;
memoryBarrierShared();
barrier();
if (gl_LocalInvocationID.x == 0u)
{
sharedVariable = 0u;
outputData[0] = 1u + atomicAdd(sharedVariable, inputData[0]);
outputData[1] = 1u + atomicMin(sharedVariable, inputData[1]);
outputData[2] = 1u + atomicMax(sharedVariable, inputData[2]);
outputData[3] = 1u + atomicAnd(sharedVariable, inputData[3]);
outputData[4] = 1u + atomicOr(sharedVariable, inputData[4]);
outputData[5] = 1u + atomicXor(sharedVariable, inputData[5]);
outputData[6] = 1u + atomicExchange(sharedVariable, inputData[6]);
outputData[7] = 1u + atomicCompSwap(sharedVariable, 64u, inputData[7]);
outputData[8] = 1u + atomicAdd(sharedVariable, inputData[8]);
}
memoryBarrierShared();
barrier();
imageStore(dstImage, ivec2(gl_LocalInvocationID.xy),
uvec4(outputData[gl_LocalInvocationID.x]));
})";
// Same atomic-op trace as the initializer test (each op returns the previous shared
// value), with 1u added to every result.
constexpr std::array<GLuint, 9> kInputData = {{1, 2, 4, 8, 16, 32, 64, 128, 1}};
constexpr std::array<GLuint, 9> kExpectedValues = {{1, 2, 2, 5, 1, 17, 49, 65, 129}};
runSharedMemoryTest<GLuint, 9, 1>(kCShader, GL_R32UI, GL_UNSIGNED_INT, kInputData,
kExpectedValues);
}
// Verify using nested atomic functions in expressions.
TEST_P(ComputeShaderTest, AtomicFunctionsReturnWithMultipleTypes)
{
// Nested atomic calls with mixed types: an int atomicAdd on |indexVariable| (counting
// down 2, 1, 0) computes the array index fed to an outer uint atomicAdd.
constexpr char kCShader[] =
R"(#version 310 es
layout (local_size_x = 4, local_size_y = 1, local_size_z = 1) in;
layout (r32ui, binding = 0) readonly uniform highp uimage2D srcImage;
layout (r32ui, binding = 1) writeonly uniform highp uimage2D dstImage;
shared highp uint sharedVariable;
shared highp int indexVariable;
shared highp uint inputData[4];
shared highp uint outputData[4];
void main()
{
uint inputValue = imageLoad(srcImage, ivec2(gl_LocalInvocationID.xy)).x;
inputData[gl_LocalInvocationID.x] = inputValue;
memoryBarrierShared();
barrier();
if (gl_LocalInvocationID.x == 0u)
{
sharedVariable = 0u;
indexVariable = 2;
outputData[0] = 1u + atomicAdd(sharedVariable, inputData[atomicAdd(indexVariable, -1)]);
outputData[1] = 1u + atomicAdd(sharedVariable, inputData[atomicAdd(indexVariable, -1)]);
outputData[2] = 1u + atomicAdd(sharedVariable, inputData[atomicAdd(indexVariable, -1)]);
outputData[3] = atomicAdd(sharedVariable, 0u);
}
memoryBarrierShared();
barrier();
imageStore(dstImage, ivec2(gl_LocalInvocationID.xy),
uvec4(outputData[gl_LocalInvocationID.x]));
})";
// Trace: index returns 2,1,0 so the adds consume inputData[2]=3, [1]=2, [0]=1; the outer
// adds return 0, 3, 5 (previous shared values), giving 1, 4, 6; the final no-op add
// reads the accumulated total 6.
constexpr std::array<GLuint, 4> kInputData = {{1, 2, 3, 0}};
constexpr std::array<GLuint, 4> kExpectedValues = {{1, 4, 6, 6}};
runSharedMemoryTest<GLuint, 4, 1>(kCShader, GL_R32UI, GL_UNSIGNED_INT, kInputData,
kExpectedValues);
}
// Basic uniform buffer functionality.
TEST_P(ComputeShaderTest, UniformBuffer)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment