Commit 69da0b92 by Le Hoang Quyen Committed by Commit Bot

Metal: Use shared memory for small dynamic buffers.

- If BufferMtl is static or large size, don't use shadow copy. Use one MTLBuffer and map directly on it. - If BufferMtl is dynamic and small size, use shadow copy and buffer pool of 10 MTLBuffer (s). The MTLBuffer is allocated in shared memory in this case (PCI-E memory for example). MTLBuffer in shared memory region doesn't need to sync content between CPU and GPU. - When copyBuffer, if BufferMtl is being used by GPU use blit command to do the copy on GPU side. - Also implemented GL_MAP_UNSYNCHRONIZED_BIT. Bug: angleproject:2634 Change-Id: I7a5aab309d24c76106a7087358ee5883ee05d250 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2408592 Commit-Queue: Le Hoang Quyen <le.hoang.q@gmail.com> Reviewed-by: Jonah Ryan-Davis <jonahr@google.com> Reviewed-by: Jamie Madill <jmadill@chromium.org>
parent d59bccb5
......@@ -47,7 +47,7 @@ ConversionBufferMtl::ConversionBufferMtl(ContextMtl *contextMtl,
size_t alignment)
: dirty(true), convertedBuffer(nullptr), convertedOffset(0)
{
data.initialize(contextMtl, initialSize, alignment);
data.initialize(contextMtl, initialSize, alignment, 0);
}
ConversionBufferMtl::~ConversionBufferMtl() = default;
......@@ -123,6 +123,17 @@ angle::Result BufferMtl::copySubData(const gl::Context *context,
ContextMtl *contextMtl = mtl::GetImpl(context);
auto srcMtl = GetAs<BufferMtl>(source);
if (srcMtl->clientShadowCopyDataNeedSync(contextMtl) || mBuffer->isBeingUsedByGPU(contextMtl))
{
// If shadow copy requires a synchronization then use blit command instead.
// It might break a pending render pass, but still faster than synchronization with
// GPU.
mtl::BlitCommandEncoder *blitEncoder = contextMtl->getBlitCommandEncoder();
blitEncoder->copyBuffer(srcMtl->getCurrentBuffer(), sourceOffset, mBuffer, destOffset,
size);
return angle::Result::Continue;
}
return setSubDataImpl(context, srcMtl->getClientShadowCopyData(contextMtl) + sourceOffset, size,
destOffset);
}
......@@ -151,7 +162,16 @@ angle::Result BufferMtl::mapRange(const gl::Context *context,
if (mapPtr)
{
ContextMtl *contextMtl = mtl::GetImpl(context);
*mapPtr = syncAndObtainShadowCopy(contextMtl) + offset;
if (mBufferPool.getMaxBuffers() == 1)
{
*mapPtr = mBuffer->mapWithOpt(contextMtl, (access & GL_MAP_WRITE_BIT) == 0,
access & GL_MAP_UNSYNCHRONIZED_BIT) +
offset;
}
else
{
*mapPtr = syncAndObtainShadowCopy(contextMtl) + offset;
}
}
return angle::Result::Continue;
......@@ -159,11 +179,44 @@ angle::Result BufferMtl::mapRange(const gl::Context *context,
angle::Result BufferMtl::unmap(const gl::Context *context, GLboolean *result)
{
ASSERT(mShadowCopy.size());
ContextMtl *contextMtl = mtl::GetImpl(context);
size_t offset = static_cast<size_t>(mState.getMapOffset());
size_t len = static_cast<size_t>(mState.getMapLength());
markConversionBuffersDirty();
ANGLE_TRY(commitShadowCopy(context));
if (mBufferPool.getMaxBuffers() == 1)
{
ASSERT(mBuffer);
if (mState.getAccessFlags() & GL_MAP_WRITE_BIT)
{
mBuffer->unmapAndFlushSubset(contextMtl, offset, len);
}
else
{
// Buffer is already mapped with readonly flag, so just unmap it, no flushing will
// occur.
mBuffer->unmap(contextMtl);
}
}
else
{
ASSERT(mShadowCopy.size());
if (mState.getAccessFlags() & GL_MAP_UNSYNCHRONIZED_BIT)
{
// Copy the mapped region without synchronization with GPU
uint8_t *ptr =
mBuffer->mapWithOpt(contextMtl, /* readonly */ false, /* noSync */ true) + offset;
std::copy(mShadowCopy.data() + offset, mShadowCopy.data() + offset + len, ptr);
mBuffer->unmapAndFlushSubset(contextMtl, offset, len);
}
else
{
// commit shadow copy data to GPU synchronously
ANGLE_TRY(commitShadowCopy(context));
}
}
return angle::Result::Continue;
}
......@@ -175,8 +228,6 @@ angle::Result BufferMtl::getIndexRange(const gl::Context *context,
bool primitiveRestartEnabled,
gl::IndexRange *outRange)
{
ASSERT(mShadowCopy.size());
const uint8_t *indices = getClientShadowCopyData(mtl::GetImpl(context)) + offset;
*outRange = gl::ComputeIndexRange(type, indices, count, primitiveRestartEnabled);
......@@ -190,8 +241,6 @@ angle::Result BufferMtl::getFirstLastIndices(ContextMtl *contextMtl,
size_t count,
std::pair<uint32_t, uint32_t> *outIndices)
{
ASSERT(mShadowCopy.size());
const uint8_t *indices = getClientShadowCopyData(contextMtl) + offset;
switch (type)
......@@ -220,6 +269,11 @@ void BufferMtl::onDataChanged()
/* public */
// Returns a CPU-readable pointer to the buffer's current contents.
// When the pool holds a single buffer (static/large buffer case) there is no
// shadow copy, so the MTLBuffer is mapped read-only directly.
// NOTE(review): mapReadOnly leaves the buffer mapped; presumably the caller is
// responsible for the matching unmap — confirm against call sites.
const uint8_t *BufferMtl::getClientShadowCopyData(ContextMtl *contextMtl)
{
if (mBufferPool.getMaxBuffers() == 1)
{
// Don't need shadow copy in this case, use the buffer directly
return mBuffer->mapReadOnly(contextMtl);
}
// Multi-buffer (dynamic) case: sync GPU-side changes into the CPU shadow
// copy and return a pointer into it.
return syncAndObtainShadowCopy(contextMtl);
}
......@@ -230,9 +284,8 @@ bool BufferMtl::clientShadowCopyDataNeedSync(ContextMtl *contextMtl)
void BufferMtl::ensureShadowCopySyncedFromGPU(ContextMtl *contextMtl)
{
if (clientShadowCopyDataNeedSync(contextMtl))
if (mBuffer->isCPUReadMemDirty())
{
// Copy data from GPU to shadow copy.
const uint8_t *ptr = mBuffer->mapReadOnly(contextMtl);
memcpy(mShadowCopy.data(), ptr, size());
mBuffer->unmap(contextMtl);
......@@ -334,39 +387,64 @@ angle::Result BufferMtl::setDataImpl(const gl::Context *context,
case gl::BufferUsage::StaticCopy:
case gl::BufferUsage::StaticDraw:
case gl::BufferUsage::StaticRead:
maxBuffers = 1; // static buffer doesn't need high speed data update
// NOTE(hqle): We don't really need buffer pool in this case. Consider disabling shadow
// copy in this case.
case gl::BufferUsage::DynamicRead:
case gl::BufferUsage::StreamRead:
maxBuffers = 1; // static/read buffer doesn't need high speed data update
mBufferPool.setAlwaysUseGPUMem();
break;
default:
// dynamic buffer, allow up to 2 update per frame/encoding without
// dynamic buffer, allow up to 10 update per frame/encoding without
// waiting for GPU.
// NOTE(hqle): If buffer size is too large, we should not use buffer pool, instead a
// single buffer should be used.
maxBuffers = 2;
if (adjustedSize <= mtl::kSharedMemBufferMaxBufSizeHint)
{
maxBuffers = 10;
mBufferPool.setAlwaysUseSharedMem();
}
else
{
maxBuffers = 1;
mBufferPool.setAlwaysUseGPUMem();
}
break;
}
// Re-create the buffer
mBuffer = nullptr;
mBufferPool.initialize(contextMtl, adjustedSize, 1, maxBuffers);
ANGLE_TRY(mBufferPool.reset(contextMtl, adjustedSize, 1, maxBuffers));
// We use shadow copy to maintain consistent data between buffers in pool
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(adjustedSize), GL_OUT_OF_MEMORY);
if (data)
if (maxBuffers > 1)
{
// Transfer data to shadow copy buffer
auto ptr = static_cast<const uint8_t *>(data);
std::copy(ptr, ptr + intendedSize, mShadowCopy.data());
// We use shadow copy to maintain consistent data between buffers in pool
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(adjustedSize), GL_OUT_OF_MEMORY);
if (data)
{
// Transfer data to shadow copy buffer
auto ptr = static_cast<const uint8_t *>(data);
std::copy(ptr, ptr + intendedSize, mShadowCopy.data());
// Transfer data from shadow copy buffer to GPU buffer.
ANGLE_TRY(commitShadowCopy(context, adjustedSize));
// Transfer data from shadow copy buffer to GPU buffer.
ANGLE_TRY(commitShadowCopy(context, adjustedSize));
}
else
{
// This is needed so that first buffer pointer could be available
ANGLE_TRY(commitShadowCopy(context, 0));
}
}
else
{
// This is needed so that first buffer pointer could be available
ANGLE_TRY(commitShadowCopy(context, 0));
// We don't need shadow copy if there will be only one buffer in the pool.
ANGLE_MTL_CHECK(contextMtl, mShadowCopy.resize(0), GL_OUT_OF_MEMORY);
// Allocate one buffer to use
ANGLE_TRY(
mBufferPool.allocate(contextMtl, adjustedSize, nullptr, &mBuffer, nullptr, nullptr));
if (data)
{
ANGLE_TRY(setSubDataImpl(context, data, intendedSize, 0));
}
}
#ifndef NDEBUG
......@@ -400,17 +478,27 @@ angle::Result BufferMtl::setSubDataImpl(const gl::Context *context,
markConversionBuffersDirty();
ASSERT(mShadowCopy.size());
if (mBufferPool.getMaxBuffers() == 1)
{
ASSERT(mBuffer);
uint8_t *ptr = mBuffer->map(contextMtl);
std::copy(srcPtr, srcPtr + sizeToCopy, ptr + offset);
mBuffer->unmapAndFlushSubset(contextMtl, offset, sizeToCopy);
}
else
{
ASSERT(mShadowCopy.size());
// 1. Before copying data from client, we need to synchronize modified data from GPU to shadow
// copy first.
ensureShadowCopySyncedFromGPU(contextMtl);
// 1. Before copying data from client, we need to synchronize modified data from GPU to
// shadow copy first.
ensureShadowCopySyncedFromGPU(contextMtl);
// 2. Copy data from client to shadow copy.
std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);
// 2. Copy data from client to shadow copy.
std::copy(srcPtr, srcPtr + sizeToCopy, mShadowCopy.data() + offset);
// 3. Copy data from shadow copy to GPU.
ANGLE_TRY(commitShadowCopy(context));
// 3. Copy data from shadow copy to GPU.
ANGLE_TRY(commitShadowCopy(context));
}
return angle::Result::Continue;
}
......
......@@ -158,7 +158,8 @@ VertexArrayMtl::VertexArrayMtl(const gl::VertexArrayState &state, ContextMtl *co
mDynamicVertexData.initialize(context, 0, mtl::kVertexAttribBufferStrideAlignment,
mtl::kMaxVertexAttribs);
mDynamicIndexData.initialize(context, kDynamicIndexDataSize, mtl::kIndexBufferOffsetAlignment);
mDynamicIndexData.initialize(context, kDynamicIndexDataSize, mtl::kIndexBufferOffsetAlignment,
0);
}
VertexArrayMtl::~VertexArrayMtl() {}
......
......@@ -12,6 +12,8 @@
#include "libANGLE/renderer/metal/mtl_resources.h"
#include <deque>
namespace rx
{
......@@ -20,6 +22,20 @@ class ContextMtl;
namespace mtl
{
// Memory placement policy for buffers allocated by BufferPool.
enum class BufferPoolMemPolicy
{
// Always allocate buffer in shared memory, useful for dynamic small buffer.
// This translates to MTLResourceStorageModeShared.
AlwaysSharedMem,
// Always allocate buffer in GPU dedicated memory. Note: a CPU side copy is also allocated so
// that buffer can still be mapped on CPU side.
// This translates to MTLResourceStorageModeManaged on macOS or MTLResourceStorageModeShared on
// iOS.
AlwaysGPUMem,
// Auto allocate buffer in shared memory if it is small. GPU otherwise.
Auto,
};
// A buffer pool is conceptually an infinitely long buffer. Each time you write to the buffer,
// you will always write to a previously unused portion. After a series of writes, you must flush
// the buffer data to the device. Buffer lifetime currently assumes that each new allocation will
......@@ -34,16 +50,24 @@ namespace mtl
class BufferPool
{
public:
// alwaysAllocNewBuffer=true will always allocate new buffer or reuse free buffer on allocate(),
// regardless of whether current buffer still has unused portion or not.
BufferPool(bool alwaysAllocNewBuffer = false);
BufferPool();
// - alwaysAllocNewBuffer=true will always allocate new buffer or reuse free buffer on
// allocate(), regardless of whether current buffer still has unused portion or not.
// - memPolicy: indicate the allocated buffers should be in shared memory or not.
// See BufferPoolMemPolicy.
BufferPool(bool alwaysAllocNewBuffer);
BufferPool(bool alwaysAllocNewBuffer, BufferPoolMemPolicy memPolicy);
~BufferPool();
// Init is called after the buffer creation so that the alignment can be specified later.
void initialize(ContextMtl *contextMtl,
size_t initialSize,
size_t alignment,
size_t maxBuffers = 0);
void initialize(Context *context, size_t initialSize, size_t alignment, size_t maxBuffers);
// Calling this without initialize() will have same effect as calling initialize().
// If called after initialize(), the old pending buffers will be flushed and might be re-used if
// their size are big enough for the requested initialSize parameter.
angle::Result reset(ContextMtl *contextMtl,
size_t initialSize,
size_t alignment,
size_t maxBuffers);
// This call will allocate a new region at the end of the buffer. It internally may trigger
// a new buffer to be created (which is returned in the optional parameter
......@@ -56,7 +80,7 @@ class BufferPool
size_t *offsetOut = nullptr,
bool *newBufferAllocatedOut = nullptr);
// After a sequence of writes, call commit to ensure the data is visible to the device.
// After a sequence of CPU writes, call commit to ensure the data is visible to the device.
angle::Result commit(ContextMtl *contextMtl);
// This releases all the buffers that have been allocated since this was last called.
......@@ -68,28 +92,42 @@ class BufferPool
const BufferRef &getCurrentBuffer() { return mBuffer; }
size_t getAlignment() { return mAlignment; }
void updateAlignment(ContextMtl *contextMtl, size_t alignment);
void updateAlignment(Context *context, size_t alignment);
size_t getMaxBuffers() const { return mMaxBuffers; }
// Set whether allocate() will always allocate new buffer or attempting to append to previous
// buffer or not. Default is false.
void setAlwaysAllocateNewBuffer(bool e) { mAlwaysAllocateNewBuffer = e; }
void setMemoryPolicy(BufferPoolMemPolicy policy) { mMemPolicy = policy; }
// Set all subsequent allocated buffers should always use shared memory
void setAlwaysUseSharedMem() { setMemoryPolicy(BufferPoolMemPolicy::AlwaysSharedMem); }
// Set all subsequent allocated buffers should always use GPU memory
void setAlwaysUseGPUMem() { setMemoryPolicy(BufferPoolMemPolicy::AlwaysGPUMem); }
private:
bool shouldAllocateInSharedMem(ContextMtl *contextMtl) const;
void reset();
angle::Result allocateNewBuffer(ContextMtl *contextMtl);
void destroyBufferList(ContextMtl *contextMtl, std::vector<BufferRef> *buffers);
void destroyBufferList(ContextMtl *contextMtl, std::deque<BufferRef> *buffers);
angle::Result finalizePendingBuffer(ContextMtl *contextMtl);
size_t mInitialSize;
BufferRef mBuffer;
uint32_t mNextAllocationOffset;
uint32_t mLastFlushOffset;
size_t mSize;
size_t mAlignment;
std::vector<BufferRef> mInFlightBuffers;
std::vector<BufferRef> mBufferFreeList;
std::deque<BufferRef> mInFlightBuffers;
std::deque<BufferRef> mBufferFreeList;
size_t mBuffersAllocated;
size_t mMaxBuffers;
BufferPoolMemPolicy mMemPolicy;
bool mAlwaysAllocateNewBuffer;
};
......
......@@ -10,6 +10,7 @@
#include "libANGLE/renderer/metal/mtl_buffer_pool.h"
#include "libANGLE/renderer/metal/ContextMtl.h"
#include "libANGLE/renderer/metal/DisplayMtl.h"
namespace rx
{
......@@ -18,35 +19,110 @@ namespace mtl
{
// BufferPool implementation.
// Default: do not force a new buffer per allocate(), use Auto memory policy.
BufferPool::BufferPool() : BufferPool(false) {}
// Delegates to the two-argument constructor with the Auto memory policy.
BufferPool::BufferPool(bool alwaysAllocNewBuffer)
: BufferPool(alwaysAllocNewBuffer, BufferPoolMemPolicy::Auto)
{}
// Full constructor: zero-initializes all bookkeeping; alignment defaults to 1
// until initialize()/reset() supplies the real value.
BufferPool::BufferPool(bool alwaysAllocNewBuffer, BufferPoolMemPolicy policy)
: mInitialSize(0),
mBuffer(nullptr),
mNextAllocationOffset(0),
mLastFlushOffset(0),
mSize(0),
mAlignment(1),
mBuffersAllocated(0),
mMaxBuffers(0),
mMemPolicy(policy),
mAlwaysAllocateNewBuffer(alwaysAllocNewBuffer)
{}
void BufferPool::initialize(ContextMtl *contextMtl,
// Re-initializes the pool for a new initialSize/alignment/maxBuffers
// configuration. Unlike initialize(), this may be called on an already
// initialized pool: pending and in-flight buffers are retired first, and the
// existing free list is kept and recycled when the old buffers are still big
// enough for the new size, avoiding re-allocations.
angle::Result BufferPool::reset(ContextMtl *contextMtl,
size_t initialSize,
size_t alignment,
size_t maxBuffers)
{
// Retire the currently mapped buffer and reclaim buffers the GPU is done with.
ANGLE_TRY(finalizePendingBuffer(contextMtl));
releaseInFlightBuffers(contextMtl);
mSize = 0;
// NOTE(review): this compares the *old* mInitialSize against the free-list
// buffer size; mInitialSize is only updated below — confirm intended.
if (mBufferFreeList.size() && mInitialSize <= mBufferFreeList.front()->size())
{
// Instead of deleting old buffers, we should reset them to avoid excessive
// memory re-allocations
if (maxBuffers && mBufferFreeList.size() > maxBuffers)
{
// Shrink the free list to the new cap.
mBufferFreeList.resize(maxBuffers);
mBuffersAllocated = maxBuffers;
}
mSize = mBufferFreeList.front()->size();
for (size_t i = 0; i < mBufferFreeList.size(); ++i)
{
BufferRef &buffer = mBufferFreeList[i];
if (!buffer->isBeingUsedByGPU(contextMtl))
{
// If buffer is not used by GPU, re-use it immediately.
continue;
}
// Buffer is still referenced by in-flight GPU work: replace its
// underlying MTLBuffer so we don't stall or stomp on GPU reads.
bool useSharedMem = shouldAllocateInSharedMem(contextMtl);
if (IsError(buffer->resetWithSharedMemOpt(contextMtl, useSharedMem, mSize, nullptr)))
{
// On allocation failure, drop the whole free list and start fresh.
mBufferFreeList.clear();
mBuffersAllocated = 0;
mSize = 0;
break;
}
}
}
else
{
// Old buffers are too small for the requested size; discard them.
mBufferFreeList.clear();
mBuffersAllocated = 0;
}
mInitialSize = initialSize;
mMaxBuffers = maxBuffers;
updateAlignment(contextMtl, alignment);
return angle::Result::Continue;
}
void BufferPool::initialize(Context *context,
size_t initialSize,
size_t alignment,
size_t maxBuffers)
{
destroy(contextMtl);
if (mBuffersAllocated)
{
// Invalid call, must call destroy() first.
UNREACHABLE();
}
mInitialSize = initialSize;
mSize = 0;
mMaxBuffers = maxBuffers;
updateAlignment(contextMtl, alignment);
updateAlignment(context, alignment);
}
BufferPool::~BufferPool() {}
// Decides whether the next buffer allocation should live in shared
// (CPU-visible) memory, based on the pool's memory policy. Under the Auto
// policy, small buffers (<= kSharedMemBufferMaxBufSizeHint) go to shared
// memory; larger ones use GPU-dedicated memory.
bool BufferPool::shouldAllocateInSharedMem(ContextMtl *contextMtl) const
{
    if (mMemPolicy == BufferPoolMemPolicy::AlwaysSharedMem)
    {
        return true;
    }
    if (mMemPolicy == BufferPoolMemPolicy::AlwaysGPUMem)
    {
        return false;
    }
    // Auto policy: choose by current pool buffer size.
    return mSize <= kSharedMemBufferMaxBufSizeHint;
}
angle::Result BufferPool::allocateNewBuffer(ContextMtl *contextMtl)
{
if (mMaxBuffers > 0 && mBuffersAllocated >= mMaxBuffers)
......@@ -77,7 +153,9 @@ angle::Result BufferPool::allocateNewBuffer(ContextMtl *contextMtl)
return angle::Result::Continue;
}
ANGLE_TRY(Buffer::MakeBuffer(contextMtl, mSize, nullptr, &mBuffer));
bool useSharedMem = shouldAllocateInSharedMem(contextMtl);
ANGLE_TRY(
Buffer::MakeBufferWithSharedMemOpt(contextMtl, useSharedMem, mSize, nullptr, &mBuffer));
ASSERT(mBuffer);
......@@ -99,11 +177,13 @@ angle::Result BufferPool::allocate(ContextMtl *contextMtl,
checkedNextWriteOffset += sizeToAllocate;
if (!mBuffer || !checkedNextWriteOffset.IsValid() ||
checkedNextWriteOffset.ValueOrDie() >= mSize || mAlwaysAllocateNewBuffer)
checkedNextWriteOffset.ValueOrDie() >= mSize ||
// If the current buffer has been modified by GPU, do not reuse it:
mBuffer->isCPUReadMemNeedSync() || mAlwaysAllocateNewBuffer)
{
if (mBuffer)
{
ANGLE_TRY(commit(contextMtl));
ANGLE_TRY(finalizePendingBuffer(contextMtl));
}
if (sizeToAllocate > mSize)
......@@ -129,6 +209,7 @@ angle::Result BufferPool::allocate(ContextMtl *contextMtl,
ASSERT(mBuffer->size() == mSize);
mNextAllocationOffset = 0;
mLastFlushOffset = 0;
if (newBufferAllocatedOut != nullptr)
{
......@@ -150,7 +231,10 @@ angle::Result BufferPool::allocate(ContextMtl *contextMtl,
// Optionally map() the buffer if possible
if (ptrOut)
{
*ptrOut = mBuffer->map(contextMtl) + mNextAllocationOffset;
// We don't need to synchronize with GPU access, since allocation should return a
// non-overlapped region each time.
*ptrOut = mBuffer->mapWithOpt(contextMtl, /** readOnly */ false, /** noSync */ true) +
mNextAllocationOffset;
}
if (offsetOut)
......@@ -163,15 +247,28 @@ angle::Result BufferPool::allocate(ContextMtl *contextMtl,
// Flushes the CPU-written range [mLastFlushOffset, mNextAllocationOffset) of
// the current buffer so the data is visible to the GPU. No-op when nothing
// new has been written since the last flush.
angle::Result BufferPool::commit(ContextMtl *contextMtl)
{
if (mBuffer && mNextAllocationOffset > mLastFlushOffset)
{
mBuffer->flush(contextMtl, mLastFlushOffset, mNextAllocationOffset - mLastFlushOffset);
mLastFlushOffset = mNextAllocationOffset;
}
return angle::Result::Continue;
}
angle::Result BufferPool::finalizePendingBuffer(ContextMtl *contextMtl)
{
if (mBuffer)
{
mBuffer->unmap(contextMtl);
ANGLE_TRY(commit(contextMtl));
// commit() already flushes so no need to flush here.
mBuffer->unmapNoFlush(contextMtl);
mInFlightBuffers.push_back(mBuffer);
mBuffer = nullptr;
}
mNextAllocationOffset = 0;
mLastFlushOffset = 0;
return angle::Result::Continue;
}
......@@ -181,7 +278,12 @@ void BufferPool::releaseInFlightBuffers(ContextMtl *contextMtl)
for (auto &toRelease : mInFlightBuffers)
{
// If the dynamic buffer was resized we cannot reuse the retained buffer.
if (toRelease->size() < mSize)
if (toRelease->size() < mSize
#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
// Also release buffer if it was allocated in different policy
|| toRelease->useSharedMem() != shouldAllocateInSharedMem(contextMtl)
#endif
)
{
toRelease = nullptr;
mBuffersAllocated--;
......@@ -195,7 +297,7 @@ void BufferPool::releaseInFlightBuffers(ContextMtl *contextMtl)
mInFlightBuffers.clear();
}
void BufferPool::destroyBufferList(ContextMtl *contextMtl, std::vector<BufferRef> *buffers)
void BufferPool::destroyBufferList(ContextMtl *contextMtl, std::deque<BufferRef> *buffers)
{
ASSERT(mBuffersAllocated >= buffers->size());
mBuffersAllocated -= buffers->size();
......@@ -217,19 +319,25 @@ void BufferPool::destroy(ContextMtl *contextMtl)
}
}
void BufferPool::updateAlignment(ContextMtl *contextMtl, size_t alignment)
void BufferPool::updateAlignment(Context *context, size_t alignment)
{
ASSERT(alignment > 0);
// NOTE(hqle): May check additional platform limits.
mAlignment = alignment;
// If alignment has changed, make sure the next allocation is done at an aligned offset.
if (alignment != mAlignment)
{
mNextAllocationOffset = roundUp(mNextAllocationOffset, static_cast<uint32_t>(alignment));
mAlignment = alignment;
}
}
void BufferPool::reset()
{
mSize = 0;
mNextAllocationOffset = 0;
mLastFlushOffset = 0;
mMaxBuffers = 0;
mAlwaysAllocateNewBuffer = false;
mBuffersAllocated = 0;
......
......@@ -99,6 +99,11 @@ constexpr uint32_t kMaxVertexAttribs = gl::MAX_VERTEX_ATTRIBS;
// NOTE(hqle): support variable max number of render targets
constexpr uint32_t kMaxRenderTargets = 4;
// The max size of a buffer that will be allocated in shared memory.
// NOTE(hqle): This is just a hint. There is no official document on what is the max allowed size
// for shared memory.
constexpr size_t kSharedMemBufferMaxBufSizeHint = 128 * 1024;
constexpr size_t kDefaultAttributeSize = 4 * sizeof(float);
// Metal limits
......
......@@ -257,6 +257,12 @@ class Buffer final : public Resource, public WrappedObject<id<MTLBuffer>>
const uint8_t *data,
BufferRef *bufferOut);
static angle::Result MakeBufferWithSharedMemOpt(ContextMtl *context,
bool forceUseSharedMem,
size_t size,
const uint8_t *data,
BufferRef *bufferOut);
static angle::Result MakeBufferWithResOpt(ContextMtl *context,
MTLResourceOptions resourceOptions,
size_t size,
......@@ -264,23 +270,33 @@ class Buffer final : public Resource, public WrappedObject<id<MTLBuffer>>
BufferRef *bufferOut);
angle::Result reset(ContextMtl *context, size_t size, const uint8_t *data);
angle::Result resetWithSharedMemOpt(ContextMtl *context,
bool forceUseSharedMem,
size_t size,
const uint8_t *data);
angle::Result resetWithResOpt(ContextMtl *context,
MTLResourceOptions resourceOptions,
size_t size,
const uint8_t *data);
const uint8_t *mapReadOnly(ContextMtl *context);
uint8_t *mapWithOpt(ContextMtl *context, bool readonly);
uint8_t *map(ContextMtl *context);
uint8_t *mapWithOpt(ContextMtl *context, bool readonly, bool noSync);
void unmap(ContextMtl *context);
// Same as unmap but do not do implicit flush()
void unmapNoFlush(ContextMtl *context);
void unmapAndFlushSubset(ContextMtl *context, size_t offsetWritten, size_t sizeWritten);
void flush(ContextMtl *context, size_t offsetWritten, size_t sizeWritten);
size_t size() const;
bool useSharedMem() const;
// Explicitly sync content between CPU and GPU
void syncContent(ContextMtl *context, mtl::BlitCommandEncoder *encoder);
private:
Buffer(ContextMtl *context, size_t size, const uint8_t *data);
Buffer(ContextMtl *context, bool forceUseSharedMem, size_t size, const uint8_t *data);
Buffer(ContextMtl *context,
MTLResourceOptions resourceOptions,
size_t size,
......
......@@ -654,7 +654,17 @@ angle::Result Buffer::MakeBuffer(ContextMtl *context,
const uint8_t *data,
BufferRef *bufferOut)
{
bufferOut->reset(new Buffer(context, size, data));
return MakeBufferWithSharedMemOpt(context, false, size, data, bufferOut);
}
angle::Result Buffer::MakeBufferWithSharedMemOpt(ContextMtl *context,
bool forceUseSharedMem,
size_t size,
const uint8_t *data,
BufferRef *bufferOut)
{
bufferOut->reset(new Buffer(context, forceUseSharedMem, size, data));
if (!bufferOut || !bufferOut->get())
{
......@@ -680,9 +690,9 @@ angle::Result Buffer::MakeBufferWithResOpt(ContextMtl *context,
return angle::Result::Continue;
}
Buffer::Buffer(ContextMtl *context, size_t size, const uint8_t *data)
Buffer::Buffer(ContextMtl *context, bool forceUseSharedMem, size_t size, const uint8_t *data)
{
(void)reset(context, size, data);
(void)resetWithSharedMemOpt(context, forceUseSharedMem, size, data);
}
Buffer::Buffer(ContextMtl *context, MTLResourceOptions options, size_t size, const uint8_t *data)
......@@ -692,12 +702,27 @@ Buffer::Buffer(ContextMtl *context, MTLResourceOptions options, size_t size, con
// Recreates the underlying MTLBuffer with the default (non-forced-shared)
// storage mode; thin wrapper around resetWithSharedMemOpt().
angle::Result Buffer::reset(ContextMtl *context, size_t size, const uint8_t *data)
{
return resetWithSharedMemOpt(context, false, size, data);
}
angle::Result Buffer::resetWithSharedMemOpt(ContextMtl *context,
bool forceUseSharedMem,
size_t size,
const uint8_t *data)
{
MTLResourceOptions options;
options = 0;
#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
options |= MTLResourceStorageModeManaged;
if (!forceUseSharedMem)
{
options |= MTLResourceStorageModeManaged;
}
else
#endif
{
options |= MTLResourceStorageModeShared;
}
return resetWithResOpt(context, options, size, data);
}
......@@ -737,49 +762,77 @@ void Buffer::syncContent(ContextMtl *context, mtl::BlitCommandEncoder *blitEncod
const uint8_t *Buffer::mapReadOnly(ContextMtl *context)
{
return mapWithOpt(context, true);
return mapWithOpt(context, true, false);
}
uint8_t *Buffer::map(ContextMtl *context)
{
return mapWithOpt(context, false);
return mapWithOpt(context, false, false);
}
uint8_t *Buffer::mapWithOpt(ContextMtl *context, bool readonly)
uint8_t *Buffer::mapWithOpt(ContextMtl *context, bool readonly, bool noSync)
{
mMapReadOnly = readonly;
CommandQueue &cmdQueue = context->cmdQueue();
if (!noSync)
{
CommandQueue &cmdQueue = context->cmdQueue();
EnsureCPUMemWillBeSynced(context, this);
EnsureCPUMemWillBeSynced(context, this);
if (this->isBeingUsedByGPU(context))
{
context->flushCommandBufer();
}
if (this->isBeingUsedByGPU(context))
{
context->flushCommandBufer();
}
cmdQueue.ensureResourceReadyForCPU(this);
cmdQueue.ensureResourceReadyForCPU(this);
}
return reinterpret_cast<uint8_t *>([get() contents]);
}
// Unmaps the buffer, flushing the entire range to the GPU (flush() is a no-op
// for read-only maps and for non-managed storage modes).
void Buffer::unmap(ContextMtl *context)
{
flush(context, 0, size());
// Reset read only flag
mMapReadOnly = true;
}
// Same as unmap() but without the implicit flush; used when the caller knows
// no CPU writes need to be made visible (e.g. read-only maps or explicit
// per-range flushes).
void Buffer::unmapNoFlush(ContextMtl *context)
{
mMapReadOnly = true;
}
// Unmaps the buffer, flushing only the written subrange. The flush is only
// needed on macOS/Catalyst where managed storage requires didModifyRange;
// shared-memory buffers (and iOS) need no CPU->GPU synchronization.
void Buffer::unmapAndFlushSubset(ContextMtl *context, size_t offsetWritten, size_t sizeWritten)
{
#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
flush(context, offsetWritten, sizeWritten);
#endif
mMapReadOnly = true;
}
void Buffer::flush(ContextMtl *context, size_t offsetWritten, size_t sizeWritten)
{
#if TARGET_OS_OSX || TARGET_OS_MACCATALYST
if (!mMapReadOnly)
{
if (get().storageMode == MTLStorageModeManaged)
{
[get() didModifyRange:NSMakeRange(0, size())];
[get() didModifyRange:NSMakeRange(offsetWritten, sizeWritten)];
}
}
#endif
mMapReadOnly = true;
}
// Returns the length in bytes of the wrapped MTLBuffer.
size_t Buffer::size() const
{
return get().length;
}
// True when the wrapped MTLBuffer was allocated with shared storage mode
// (CPU and GPU see the same memory; no explicit sync needed).
bool Buffer::useSharedMem() const
{
return get().storageMode == MTLStorageModeShared;
}
}
}
......@@ -681,6 +681,87 @@ TEST_P(BufferDataTest, MapBufferOES)
EXPECT_EQ(data, actualData);
}
// Test to verify mapping a dynamic buffer with GL_MAP_UNSYNCHRONIZED_BIT to modify a portion
// won't affect draw calls using other portions.
TEST_P(BufferDataTest, MapDynamicBufferUnsynchronizedEXTTest)
{
ANGLE_SKIP_TEST_IF(!IsGLExtensionEnabled("GL_EXT_map_buffer_range"));
const char simpleVertex[] = R"(attribute vec2 position;
attribute vec4 color;
varying vec4 vColor;
void main()
{
gl_Position = vec4(position, 0, 1);
vColor = color;
}
)";
const char simpleFragment[] = R"(precision mediump float;
varying vec4 vColor;
void main()
{
gl_FragColor = vColor;
}
)";
// The buffer holds two sets of kNumVertices RGBA colors (8 bytes/vertex
// total): the first half is initialized to opaque red, the second half to
// zeros and later overwritten through the unsynchronized map.
constexpr int kNumVertices = 6;
std::vector<GLubyte> color(8 * kNumVertices);
for (int i = 0; i < kNumVertices; ++i)
{
color[4 * i] = 255;
color[4 * i + 3] = 255;
}
GLBuffer buffer;
glBindBuffer(GL_ARRAY_BUFFER, buffer.get());
glBufferData(GL_ARRAY_BUFFER, color.size(), color.data(), GL_DYNAMIC_DRAW);
ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
glUseProgram(program);
GLint colorLoc = glGetAttribLocation(program, "color");
ASSERT_NE(-1, colorLoc);
glVertexAttribPointer(colorLoc, 4, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
glEnableVertexAttribArray(colorLoc);
// First draw reads the red colors at offset 0 while the buffer may still be
// in use by the GPU.
glViewport(0, 0, 2, 2);
drawQuad(program, "position", 0.5f, 1.0f, true);
ASSERT_GL_NO_ERROR();
// Map with GL_MAP_UNSYNCHRONIZED_BIT and overwrite buffers data at offset 24
uint8_t *data = reinterpret_cast<uint8_t *>(
glMapBufferRangeEXT(GL_ARRAY_BUFFER, 4 * kNumVertices, 4 * kNumVertices,
GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT));
EXPECT_GL_NO_ERROR();
// Write opaque green into the mapped second half.
for (int i = 0; i < kNumVertices; ++i)
{
data[4 * i] = 0;
data[4 * i + 1] = 255;
data[4 * i + 2] = 0;
data[4 * i + 3] = 255;
}
glUnmapBufferOES(GL_ARRAY_BUFFER);
EXPECT_GL_NO_ERROR();
// Re-draw using offset = 0 but to different viewport
glViewport(0, 2, 2, 2);
drawQuad(program, "position", 0.5f, 1.0f, true);
ASSERT_GL_NO_ERROR();
// Change vertex attribute to use buffer starting from offset 24
glVertexAttribPointer(colorLoc, 4, GL_UNSIGNED_BYTE, GL_TRUE, 0,
reinterpret_cast<void *>(4 * kNumVertices));
glViewport(2, 2, 2, 2);
drawQuad(program, "position", 0.5f, 1.0f, true);
ASSERT_GL_NO_ERROR();
// The unsynchronized write must not have corrupted the untouched first half
// (red draws), while the modified half must show the new green data.
EXPECT_PIXEL_COLOR_EQ(1, 1, GLColor::red);
EXPECT_PIXEL_COLOR_EQ(1, 3, GLColor::red);
EXPECT_PIXEL_COLOR_EQ(3, 3, GLColor::green);
}
// Tests a bug where copying buffer data immediately after creation hit a nullptr in D3D11.
TEST_P(BufferDataTestES3, NoBufferInitDataCopyBug)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment