Vulkan: Enable CPU only buffers for PBOs

Add support for a CPU-only buffer for PBOs that serves as the destination for all host operations like MapBuffer*. This removes the latency caused by waiting for the in-flight GPU commands to complete before handing over the buffer to the app. This change removes a ~6ms wait/sleep on the first call to MapBuffer* in each frame of Manhattan. Bug: angleproject:4339 Tests: angle_end2end_tests --gtest_filter=BufferDataTest*Vulkan Change-Id: I52016b160af8a670cc30f01c05e48f699521310f Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2116874 Commit-Queue: Mohan Maiya <m.maiya@samsung.com> Reviewed-by: Tobin Ehlis <tobine@google.com>

Vulkan: Enable CPU only buffers for PBOs
745e0712 · Mohan Maiya · Commit Bot · dcd98298 · 745e0712 · 745e0712
Commit 745e0712 authored Mar 21, 2020 by Mohan Maiya Committed by Commit Bot Apr 02, 2020
10 changed files
--- a/src/libANGLE/renderer/vulkan/BufferVk.cpp
+++ b/src/libANGLE/renderer/vulkan/BufferVk.cpp
--- a/src/libANGLE/renderer/vulkan/BufferVk.h
+++ b/src/libANGLE/renderer/vulkan/BufferVk.h
@@ -100,13 +100,13 @@ class BufferVk : public BufferImpl
                               VkDeviceSize length,
                               GLbitfield access,
                               void **mapPtr);
-    void unmapImpl(ContextVk *contextVk);
+    angle::Result unmapImpl(ContextVk *contextVk);
    // Calls copyBuffer internally.
-    angle::Result copyToBuffer(ContextVk *contextVk,
+    angle::Result copyToBufferImpl(ContextVk *contextVk,
-                               vk::BufferHelper *destBuffer,
+                                   vk::BufferHelper *destBuffer,
-                               uint32_t copyCount,
+                                   uint32_t copyCount,
-                               const VkBufferCopy *copies);
+                                   const VkBufferCopy *copies);
    ConversionBuffer *getVertexConversionBuffer(RendererVk *renderer,
                                                angle::FormatID formatID,
@@ -116,6 +116,29 @@ class BufferVk : public BufferImpl
  private:
    void initializeStagingBuffer(ContextVk *contextVk, gl::BufferBinding target, size_t size);
+    angle::Result initializeShadowBuffer(ContextVk *contextVk,
+                                         gl::BufferBinding target,
+                                         size_t size);
+    // Returns a writable pointer |offset| bytes into the shadow buffer's current storage.
+    ANGLE_INLINE uint8_t *getShadowBuffer(size_t offset)
+    {
+        uint8_t *shadowBase = mShadowBuffer.getCurrentBuffer();
+        return shadowBase + offset;
+    }
+    // Returns a read-only pointer |offset| bytes into the shadow buffer's current storage.
+    ANGLE_INLINE const uint8_t *getShadowBuffer(size_t offset) const
+    {
+        const uint8_t *shadowBase = mShadowBuffer.getCurrentBuffer();
+        return shadowBase + offset;
+    }
+    void updateShadowBuffer(const uint8_t *data, size_t size, size_t offset);
+    angle::Result directUpdate(ContextVk *contextVk,
+                               const uint8_t *data,
+                               size_t size,
+                               size_t offset);
+    angle::Result stagedUpdate(ContextVk *contextVk,
+                               const uint8_t *data,
+                               size_t size,
+                               size_t offset);
    angle::Result setDataImpl(ContextVk *contextVk,
                              const uint8_t *data,
                              size_t size,
@@ -145,6 +168,13 @@ class BufferVk : public BufferImpl
    // All staging buffer support is provided by a DynamicBuffer.
    vk::DynamicBuffer mStagingBuffer;
+    // For GPU-read only buffers, glMap* latency is reduced by maintaining a copy
+    // of the buffer which is writeable only by the CPU. The contents are updated on all
+    // glData/glSubData/glCopy calls. With this, a glMap* call becomes a non-blocking
+    // operation by eliminating the need to wait on any recorded or in-flight GPU commands.
+    // We use DynamicShadowBuffer class to encapsulate all the bookkeeping logic.
+    vk::DynamicShadowBuffer mShadowBuffer;
    // A cache of converted vertex data.
    std::vector<VertexConversionBuffer> mVertexConversionBuffers;
 };

--- a/src/libANGLE/renderer/vulkan/ContextVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ContextVk.cpp
@@ -1000,7 +1000,7 @@ angle::Result ContextVk::setupIndexedDraw(const gl::Context *context,
                const size_t byteCount = static_cast<size_t>(elementArrayBuffer->getSize()) -
                                         reinterpret_cast<uintptr_t>(indices);
                ANGLE_TRY(mVertexArray->convertIndexBufferCPU(this, indexType, byteCount, src));
-                bufferVk->unmapImpl(this);
+                ANGLE_TRY(bufferVk->unmapImpl(this));
            }
            else
            {

--- a/src/libANGLE/renderer/vulkan/ResourceVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ResourceVk.cpp
@@ -31,6 +31,25 @@ angle::Result Resource::finishRunningCommands(ContextVk *contextVk)
    return contextVk->finishToSerial(mUse.getSerial());
 }
+// Completes all recorded and in-flight commands that involve this resource.
+// Work that is recorded but not yet submitted is flushed first; then, if the resource is
+// still used by running commands, finishRunningCommands() is invoked for it.
+// Returns angle::Result::Continue once both steps are done. ANGLE_TRY is expected to
+// propagate any failure from the flush or the finish to the caller.
+angle::Result Resource::waitForIdle(ContextVk *contextVk)
+{
+    // If there are pending commands for the resource, flush them.
+    if (usedInRecordedCommands())
+    {
+        ANGLE_TRY(contextVk->flushImpl(nullptr));
+    }
+    // Make sure the driver is done with the resource.
+    // The completed serial is queried here, after the flush above, rather than cached up front.
+    if (usedInRunningCommands(contextVk->getLastCompletedQueueSerial()))
+    {
+        ANGLE_TRY(finishRunningCommands(contextVk));
+    }
+    // Post-condition: nothing recorded or running should reference the resource any more.
+    ASSERT(!isCurrentlyInUse(contextVk->getLastCompletedQueueSerial()));
+    return angle::Result::Continue;
+}
 // SharedGarbage implementation.
 SharedGarbage::SharedGarbage() = default;

--- a/src/libANGLE/renderer/vulkan/ResourceVk.h
+++ b/src/libANGLE/renderer/vulkan/ResourceVk.h
@@ -179,6 +179,9 @@ class Resource : angle::NonCopyable
    // Ensures the driver is caught up to this resource and it is only in use by ANGLE.
    angle::Result finishRunningCommands(ContextVk *contextVk);
+    // Complete all recorded and in-flight commands involving this resource
+    angle::Result waitForIdle(ContextVk *contextVk);
    // Adds the resource to a resource use list.
    void retain(ResourceUseList *resourceUseList);

--- a/src/libANGLE/renderer/vulkan/TextureVk.cpp
+++ b/src/libANGLE/renderer/vulkan/TextureVk.cpp
@@ -283,7 +283,7 @@ angle::Result TextureVk::setSubImageImpl(const gl::Context *context,
                gl::Offset(area.x, area.y, area.z), formatInfo, unpack, type, source, vkFormat,
                inputRowPitch, inputDepthPitch, inputSkipBytes));
-            unpackBufferVk->unmapImpl(contextVk);
+            ANGLE_TRY(unpackBufferVk->unmapImpl(contextVk));
        }
    }
    else if (pixels)

--- a/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp
+++ b/src/libANGLE/renderer/vulkan/VertexArrayVk.cpp
@@ -425,7 +425,7 @@ angle::Result VertexArrayVk::convertVertexBufferCPU(ContextVk *contextVk,
                               0, numVertices, binding.getStride(), srcFormatSize,
                               vertexFormat.vertexLoadFunction, &mCurrentArrayBuffers[attribIndex],
                               &conversion->lastAllocationOffset, 1));
-    srcBuffer->unmapImpl(contextVk);
+    ANGLE_TRY(srcBuffer->unmapImpl(contextVk));
    ASSERT(conversion->dirty);
    conversion->dirty = false;
@@ -757,7 +757,7 @@ angle::Result VertexArrayVk::updateStreamedAttribs(const gl::Context *context,
                                           &mCurrentArrayBufferOffsets[attribIndex], divisor));
                if (bufferVk)
                {
-                    bufferVk->unmapImpl(contextVk);
+                    ANGLE_TRY(bufferVk->unmapImpl(contextVk));
                }
            }
            else

--- a/src/libANGLE/renderer/vulkan/vk_helpers.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.cpp
@@ -744,6 +744,69 @@ void DynamicBuffer::reset()
    mLastFlushOrInvalidateOffset = 0;
 }
+// DynamicShadowBuffer implementation.
+// A default-constructed shadow buffer starts with both cached sizes at zero.
+DynamicShadowBuffer::DynamicShadowBuffer()
+    : mInitialSize(0),
+      mSize(0)
+{}
+// Move constructor: takes over |other|'s storage together with its cached sizes.
+DynamicShadowBuffer::DynamicShadowBuffer(DynamicShadowBuffer &&other)
+    : mInitialSize(other.mInitialSize), mSize(other.mSize), mBuffer(std::move(other.mBuffer))
+{
+    // The storage now belongs to this object. Clear the cached size on the moved-from object
+    // so it no longer claims (through a non-zero size) to hold a buffer it has given away.
+    // NOTE(review): assumes moving an angle::MemoryBuffer leaves the source empty, which the
+    // ASSERT in ~DynamicShadowBuffer() already relies on -- confirm.
+    other.mSize = 0;
+}
+// Remembers |initialSize|; allocate() later uses it as the lower bound for the buffer size.
+// No memory is allocated here.
+void DynamicShadowBuffer::init(size_t initialSize)
+{
+    this->mInitialSize = initialSize;
+}
+// The destructor frees nothing itself; it only checks that the storage has already been
+// given up (see release() / destroy()) by the time the object goes away.
+DynamicShadowBuffer::~DynamicShadowBuffer()
+{
+    ASSERT(mBuffer.empty());
+}
+// Replaces any existing storage with a fresh buffer of max(mInitialSize, sizeInBytes) bytes.
+// Returns angle::Result::Continue on success. If either resize reports failure, the buffer
+// is released and angle::Result::Stop is returned.
+angle::Result DynamicShadowBuffer::allocate(size_t sizeInBytes)
+{
+    // Drop the current buffer, if any, before allocating the new one.
+    const bool discardedOld = mBuffer.empty() || mBuffer.resize(0);
+
+    // Cache the new size, then allocate that many bytes.
+    mSize                   = std::max(mInitialSize, sizeInBytes);
+    const bool allocatedNew = mBuffer.resize(mSize);
+
+    if (discardedOld && allocatedNew)
+    {
+        return angle::Result::Continue;
+    }
+
+    // Allocation failed: release the buffer and report the error.
+    release();
+    return angle::Result::Stop;
+}
+// Clears the cached size and gives up whatever storage is still held.
+void DynamicShadowBuffer::release()
+{
+    reset();
+    if (mBuffer.empty())
+    {
+        // Nothing is allocated, so there is nothing to free.
+        return;
+    }
+    // The result of shrinking to zero is intentionally ignored.
+    (void)mBuffer.resize(0);
+}
+// Frees the storage. |device| is not used: all of the work is delegated to release().
+void DynamicShadowBuffer::destroy(VkDevice device)
+{
+    this->release();
+}
+// Zeroes the cached size only; mInitialSize and the storage itself are left untouched.
+void DynamicShadowBuffer::reset()
+{
+    this->mSize = 0;
+}
 // DescriptorPoolHelper implementation.
 DescriptorPoolHelper::DescriptorPoolHelper() : mFreeDescriptorSets(0) {}
@@ -1371,7 +1434,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con
        ANGLE_TRY(streamIndices(contextVk, glIndexType, indexCount,
                                static_cast<const uint8_t *>(srcDataMapping) + elementArrayOffset,
                                bufferOut, bufferOffsetOut, indexCountOut));
-        elementArrayBufferVk->unmapImpl(contextVk);
+        ANGLE_TRY(elementArrayBufferVk->unmapImpl(contextVk));
        return angle::Result::Continue;
    }
@@ -1396,7 +1459,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con
    if (contextVk->getRenderer()->getFeatures().extraCopyBufferRegion.enabled)
        copies.push_back({sourceOffset, *bufferOffsetOut + (unitCount + 1) * unitSize, 1});
-    ANGLE_TRY(elementArrayBufferVk->copyToBuffer(
+    ANGLE_TRY(elementArrayBufferVk->copyToBufferImpl(
        contextVk, *bufferOut, static_cast<uint32_t>(copies.size()), copies.data()));
    ANGLE_TRY(mDynamicIndexBuffer.flush(contextVk));
    return angle::Result::Continue;
@@ -3594,7 +3657,7 @@ angle::Result ImageHelper::readPixels(ContextVk *contextVk,
        uint8_t *dest = static_cast<uint8_t *>(mapPtr) + reinterpret_cast<ptrdiff_t>(pixels);
        PackPixels(packPixelsParams, *readFormat, area.width * readFormat->pixelBytes,
                   readPixelBuffer, static_cast<uint8_t *>(dest));
-        packBufferVk->unmapImpl(contextVk);
+        ANGLE_TRY(packBufferVk->unmapImpl(contextVk));
    }
    else
    {

--- a/src/libANGLE/renderer/vulkan/vk_helpers.h
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.h
@@ -9,6 +9,7 @@
 #ifndef LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_
 #define LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_
+#include "common/MemoryBuffer.h"
 #include "libANGLE/renderer/vulkan/ResourceVk.h"
 #include "libANGLE/renderer/vulkan/vk_cache_utils.h"
@@ -121,6 +122,77 @@ class DynamicBuffer : angle::NonCopyable
    std::vector<BufferHelper *> mBufferFreeList;
 };
+// Based off of the DynamicBuffer class, DynamicShadowBuffer provides
+// a similar conceptually infinitely long buffer that will only be written
+// to and read by the CPU. This can be used to provide CPU cached copies of
+// GPU-read only buffers. The value add here is that when an app requests
+// CPU access to a buffer we can fulfill such a request in O(1) time since
+// we don't need to wait for GPU to be done with in-flight commands.
+//
+// The hidden cost here is that any operation that updates a buffer, either
+// through a buffer sub data update or a buffer-to-buffer copy, will have an
+// additional overhead of having to update its CPU only buffer.
+class DynamicShadowBuffer : public angle::NonCopyable
+{
+  public:
+    DynamicShadowBuffer();
+    DynamicShadowBuffer(DynamicShadowBuffer &&other);
+    ~DynamicShadowBuffer();
+    // Initialize the DynamicShadowBuffer. Only records |initialSize|, which allocate() uses
+    // as the minimum buffer size; no memory is allocated here.
+    void init(size_t initialSize);
+    // Returns whether this DynamicShadowBuffer is active, i.e. its cached size is non-zero.
+    ANGLE_INLINE bool valid() const { return (mSize != 0); }
+    // This call will actually allocate a new CPU only memory from the heap.
+    // The size can be different than the one specified during `init`: the buffer is sized
+    // to the larger of the two values. Any previous contents are discarded.
+    angle::Result allocate(size_t sizeInBytes);
+    // Copies |size| bytes from |data| into the buffer, starting |offset| bytes in.
+    // The range [offset, offset + size) must lie inside the allocated buffer.
+    ANGLE_INLINE void updateData(const uint8_t *data, size_t size, size_t offset)
+    {
+        ASSERT(!mBuffer.empty());
+        // Written as two comparisons so that |offset + size| cannot wrap around.
+        ASSERT(offset <= mSize && size <= mSize - offset);
+        // Memcopy data into the buffer
+        memcpy((mBuffer.data() + offset), data, size);
+    }
+    // Map the CPU only buffer and return the pointer. We map the entire buffer for now;
+    // |*mapPtr| is set to the address |offset| bytes into it.
+    ANGLE_INLINE void map(size_t offset, void **mapPtr)
+    {
+        ASSERT(mapPtr);
+        ASSERT(!mBuffer.empty());
+        ASSERT(offset <= mSize);
+        *mapPtr = mBuffer.data() + offset;
+    }
+    // Unmap the CPU only buffer, NOOP for now
+    ANGLE_INLINE void unmap() {}
+    // Resets the cached size and frees the CPU memory, if any is allocated.
+    void release();
+    // This frees resources immediately. Equivalent to release(); |device| is unused.
+    void destroy(VkDevice device);
+    // Returns the start of the allocated buffer. The buffer must have been allocated.
+    ANGLE_INLINE uint8_t *getCurrentBuffer()
+    {
+        ASSERT(!mBuffer.empty());
+        return mBuffer.data();
+    }
+    ANGLE_INLINE const uint8_t *getCurrentBuffer() const
+    {
+        ASSERT(!mBuffer.empty());
+        return mBuffer.data();
+    }
+  private:
+    // Zeroes the cached size without touching the storage.
+    void reset();
+    // Minimum allocation size, set by init().
+    size_t mInitialSize;
+    // Size in bytes of the current allocation; zero when nothing is allocated.
+    size_t mSize;
+    // Heap storage backing the shadow copy.
+    angle::MemoryBuffer mBuffer;
+};
 // Uses DescriptorPool to allocate descriptor sets as needed. If a descriptor pool becomes full, we
 // allocate new pools internally as needed. RendererVk takes care of the lifetime of the discarded
 // pools. Note that we used a fixed layout for descriptor pools in ANGLE. Uniform buffers must

--- a/src/tests/gl_tests/BufferDataTest.cpp
+++ b/src/tests/gl_tests/BufferDataTest.cpp
@@ -411,6 +411,166 @@ TEST_P(BufferDataTestES3, BufferResizing)
    EXPECT_GL_NO_ERROR();
 }
+// Test to verify mapping a buffer after copying to it contains flushed/updated data
+TEST_P(BufferDataTestES3, CopyBufferSubDataMapReadTest)
+{
+    const char simpleVertex[]   = R"(attribute vec2 position;
+attribute vec4 color;
+varying vec4 vColor;
+void main()
+{
+    gl_Position = vec4(position, 0, 1);
+    vColor = color;
+}
+)";
+    const char simpleFragment[] = R"(precision mediump float;
+varying vec4 vColor;
+void main()
+{
+    gl_FragColor = vColor;
+}
+)";
+    const uint32_t numComponents = 3;
+    const uint32_t width         = 4;
+    const uint32_t height        = 4;
+    const size_t numElements     = width * height * numComponents;
+    // Fill through the vector constructor rather than a loop with a uint8_t counter: such a
+    // counter wraps at 256 and would never terminate if the buffers were made larger.
+    std::vector<uint8_t> srcData(numElements, 128);
+    std::vector<uint8_t> dstData(numElements, 0);
+    GLBuffer srcBuffer;
+    GLBuffer dstBuffer;
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glBufferData(GL_ARRAY_BUFFER, srcData.size(), srcData.data(), GL_STATIC_DRAW);
+    ASSERT_GL_NO_ERROR();
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, dstBuffer);
+    glBufferData(GL_PIXEL_UNPACK_BUFFER, dstData.size(), dstData.data(), GL_STATIC_READ);
+    ASSERT_GL_NO_ERROR();
+    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
+    glUseProgram(program);
+    GLint colorLoc = glGetAttribLocation(program, "color");
+    ASSERT_NE(-1, colorLoc);
+    // Draw with srcBuffer as a vertex attribute so the copy below has GPU work queued
+    // ahead of it.
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    glEnableVertexAttribArray(colorLoc);
+    drawQuad(program, "position", 0.5f, 1.0f, true);
+    ASSERT_GL_NO_ERROR();
+    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_PIXEL_UNPACK_BUFFER, 0, 0, numElements);
+    // With GL_MAP_READ_BIT, we expect the data to be flushed and updated to match srcData
+    uint8_t *data = reinterpret_cast<uint8_t *>(
+        glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_READ_BIT));
+    EXPECT_GL_NO_ERROR();
+    // A failed map returns null; stop here instead of dereferencing it below.
+    ASSERT_NE(nullptr, data);
+    for (size_t i = 0; i < numElements; ++i)
+    {
+        EXPECT_EQ(srcData[i], data[i]);
+    }
+    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+    EXPECT_GL_NO_ERROR();
+}
+// Test to verify mapping a buffer after copying to it contains expected data
+// with GL_MAP_UNSYNCHRONIZED_BIT
+TEST_P(BufferDataTestES3, MapBufferUnsynchronizedReadTest)
+{
+    const char simpleVertex[]   = R"(attribute vec2 position;
+attribute vec4 color;
+varying vec4 vColor;
+void main()
+{
+    gl_Position = vec4(position, 0, 1);
+    vColor = color;
+}
+)";
+    const char simpleFragment[] = R"(precision mediump float;
+varying vec4 vColor;
+void main()
+{
+    gl_FragColor = vColor;
+}
+)";
+    const uint32_t numComponents = 3;
+    const uint32_t width         = 4;
+    const uint32_t height        = 4;
+    const size_t numElements     = width * height * numComponents;
+    // Fill through the vector constructor rather than a loop with a uint8_t counter: such a
+    // counter wraps at 256 and would never terminate if the buffers were made larger.
+    std::vector<uint8_t> srcData(numElements, 128);
+    std::vector<uint8_t> dstData(numElements, 0);
+    GLBuffer srcBuffer;
+    GLBuffer dstBuffer;
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glBufferData(GL_ARRAY_BUFFER, srcData.size(), srcData.data(), GL_STATIC_DRAW);
+    ASSERT_GL_NO_ERROR();
+    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, dstBuffer);
+    glBufferData(GL_PIXEL_UNPACK_BUFFER, dstData.size(), dstData.data(), GL_STATIC_READ);
+    ASSERT_GL_NO_ERROR();
+    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
+    glUseProgram(program);
+    GLint colorLoc = glGetAttribLocation(program, "color");
+    ASSERT_NE(-1, colorLoc);
+    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
+    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
+    glEnableVertexAttribArray(colorLoc);
+    drawQuad(program, "position", 0.5f, 1.0f, true);
+    ASSERT_GL_NO_ERROR();
+    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_PIXEL_UNPACK_BUFFER, 0, 0, numElements);
+    // Synchronize.
+    glFinish();
+    // Map with GL_MAP_UNSYNCHRONIZED_BIT and overwrite buffers data with srcData
+    uint8_t *data = reinterpret_cast<uint8_t *>(glMapBufferRange(
+        GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT));
+    EXPECT_GL_NO_ERROR();
+    // A failed map returns null; stop here instead of writing through it.
+    ASSERT_NE(nullptr, data);
+    memcpy(data, srcData.data(), srcData.size());
+    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+    EXPECT_GL_NO_ERROR();
+    // Map without GL_MAP_UNSYNCHRONIZED_BIT and read data. We expect it to be srcData
+    data = reinterpret_cast<uint8_t *>(
+        glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_READ_BIT));
+    EXPECT_GL_NO_ERROR();
+    // Same guard for the second mapping before it is read.
+    ASSERT_NE(nullptr, data);
+    for (size_t i = 0; i < numElements; ++i)
+    {
+        EXPECT_EQ(srcData[i], data[i]);
+    }
+    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
+    EXPECT_GL_NO_ERROR();
+}
 // Verify the functionality of glMapBufferRange()'s GL_MAP_UNSYNCHRONIZED_BIT
 // NOTE: On Vulkan, if we ever use memory that's not `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT`, then
 // this could incorrectly pass.