Commit 95935176 by Amy Liu Committed by Commit Bot

Handle the compression of big pipeline cache.

Big pipeline cache will cost much time to compress. Regarding the performance, handle the compression of big pipeline cache in this way: 1) Return when the pipeline cache data is larger than 10M. 2) Use a worker thread to complete compression. Bug: angleproject:4722 Change-Id: I62eb69d8c46729261f0502af01450ec301c258f3 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2788169 Commit-Queue: Jamie Madill <jmadill@chromium.org> Reviewed-by: 's avatarJamie Madill <jmadill@chromium.org> Reviewed-by: 's avatarShahbaz Youssefi <syoussefi@chromium.org>
parent 2f808349
......@@ -70,6 +70,12 @@ struct FrontendFeatures : angle::FeatureSetBase
angle::FeatureCategory::FrontendFeatures,
"Set the context limits like frame capturing was enabled",
&members, "http://anglebug.com/5750"};
// Whether we should compress pipeline cache in thread pool before it's stored in blob cache.
// http://anglebug.com/4722
angle::Feature enableCompressingPipelineCacheInThreadPool = {
"enableCompressingPipelineCacheInThreadPool", angle::FeatureCategory::FrontendWorkarounds,
"Enable compressing pipeline cache in thread pool.", &members, "http://anglebug.com/4722"};
};
inline FrontendFeatures::FrontendFeatures() = default;
......
......@@ -34,9 +34,11 @@ enum CacheResult
// In order to store more cache in the blob cache, compress cacheData into compressedData
// before it is stored.
bool CompressBlobCacheData(angle::MemoryBuffer *cacheData, angle::MemoryBuffer *compressedData)
bool CompressBlobCacheData(const size_t cacheSize,
const uint8_t *cacheData,
angle::MemoryBuffer *compressedData)
{
uLong uncompressedSize = static_cast<uLong>(cacheData->size());
uLong uncompressedSize = static_cast<uLong>(cacheSize);
uLong expectedCompressedSize = zlib_internal::GzipExpectedCompressedSize(uncompressedSize);
// Allocate memory.
......@@ -46,9 +48,8 @@ bool CompressBlobCacheData(angle::MemoryBuffer *cacheData, angle::MemoryBuffer *
return false;
}
int zResult =
zlib_internal::GzipCompressHelper(compressedData->data(), &expectedCompressedSize,
cacheData->data(), uncompressedSize, nullptr, nullptr);
int zResult = zlib_internal::GzipCompressHelper(compressedData->data(), &expectedCompressedSize,
cacheData, uncompressedSize, nullptr, nullptr);
if (zResult != Z_OK)
{
......@@ -120,6 +121,7 @@ void BlobCache::put(const BlobCache::Key &key, angle::MemoryBuffer &&value)
void BlobCache::putApplication(const BlobCache::Key &key, const angle::MemoryBuffer &value)
{
std::lock_guard<std::mutex> lock(mBlobCacheMutex);
if (areBlobCacheFuncsSet())
{
mSetBlobFunc(key.data(), key.size(), value.data(), value.size());
......
......@@ -48,7 +48,9 @@ struct hash<egl::BlobCacheKey>
namespace egl
{
bool CompressBlobCacheData(angle::MemoryBuffer *cacheData, angle::MemoryBuffer *compressedData);
bool CompressBlobCacheData(const size_t cacheSize,
const uint8_t *cacheData,
angle::MemoryBuffer *compressedData);
bool DecompressBlobCacheData(const uint8_t *compressedData,
const size_t compressedSize,
angle::MemoryBuffer *uncompressedData);
......@@ -148,6 +150,8 @@ class BlobCache final : angle::NonCopyable
private:
// This internal cache is used only if the application is not providing caching callbacks
using CacheEntry = std::pair<angle::MemoryBuffer, CacheSource>;
std::mutex mBlobCacheMutex;
angle::SizedMRUCache<BlobCache::Key, CacheEntry> mBlobCache;
EGLSetBlobFuncANDROID mSetBlobFunc;
......
......@@ -3930,7 +3930,9 @@ void Context::updateCaps()
mValidBufferBindings.set(BufferBinding::Texture);
}
mThreadPool = angle::WorkerThreadPool::Create(mState.mExtensions.parallelShaderCompile);
mThreadPool = angle::WorkerThreadPool::Create(
mState.mExtensions.parallelShaderCompile ||
getFrontendFeatures().enableCompressingPipelineCacheInThreadPool.enabled);
// Reinitialize some dirty bits that depend on extensions.
if (mState.isRobustResourceInitEnabled())
......
......@@ -1856,6 +1856,10 @@ void Display::initializeFrontendFeatures()
mImplementation->initializeFrontendFeatures(&mFrontendFeatures);
rx::ApplyFeatureOverrides(&mFrontendFeatures, mState);
// Disabled by default. To reduce the risk, create a feature to enable
// compressing pipeline cache in multi-thread pool.
ANGLE_FEATURE_CONDITION(&mFrontendFeatures, enableCompressingPipelineCacheInThreadPool, false);
}
const DisplayExtensions &Display::getExtensions() const
......
......@@ -210,7 +210,8 @@ angle::Result MemoryProgramCache::putProgram(const egl::BlobCache::Key &programH
ANGLE_TRY(program->serialize(context, &serializedProgram));
angle::MemoryBuffer compressedData;
if (!egl::CompressBlobCacheData(&serializedProgram, &compressedData))
if (!egl::CompressBlobCacheData(serializedProgram.size(), serializedProgram.data(),
&compressedData))
{
ERR() << "Error compressing binary data.";
return angle::Result::Incomplete;
......
......@@ -505,28 +505,30 @@ void ComputePipelineCacheVkChunkKey(VkPhysicalDeviceProperties physicalDevicePro
hashString.length(), hashOut->data());
}
angle::Result CompressAndStorePipelineCacheVk(VkPhysicalDeviceProperties physicalDeviceProperties,
DisplayVk *displayVk,
ContextVk *contextVk,
angle::MemoryBuffer *pipelineCacheData,
bool *success)
bool CompressAndStorePipelineCacheVk(VkPhysicalDeviceProperties physicalDeviceProperties,
DisplayVk *displayVk,
ContextVk *contextVk,
const std::vector<uint8_t> &cacheData,
const size_t maxTotalSize)
{
// There is a limitation in android, we can only store cache data less than 64kb in blob cache.
// So there is no use to handle big pipeline cache when android will reject it finally.
constexpr size_t kMaxTotalSize = 64 * 1024;
if (pipelineCacheData->size() >= kMaxTotalSize)
// Though the pipeline cache will be compressed and divided into several chunks to store in the
// blob cache, the largest total size of the blob cache is only 2M on Android now, so there is
// no use handling a big pipeline cache that Android will reject in the end.
if (cacheData.size() >= maxTotalSize)
{
// TODO: handle the big pipeline cache. http://anglebug.com/4722
ANGLE_PERF_WARNING(contextVk->getDebug(), GL_DEBUG_SEVERITY_LOW,
"Skip syncing pipeline cache data when it's larger than 64kb.");
return angle::Result::Continue;
"Skip syncing pipeline cache data when it's larger than maxTotalSize.");
return false;
}
// To make it possible to store more pipeline cache data, compress the whole pipelineCache.
angle::MemoryBuffer compressedData;
ANGLE_VK_CHECK(displayVk, egl::CompressBlobCacheData(pipelineCacheData, &compressedData),
VK_ERROR_INITIALIZATION_FAILED);
if (!egl::CompressBlobCacheData(cacheData.size(), cacheData.data(), &compressedData))
{
return false;
}
// If the size of compressedData is larger than (kMaxBlobCacheSize - sizeof(numChunks)),
// the pipelineCache still can't be stored in blob cache. Divide the large compressed
......@@ -553,8 +555,10 @@ angle::Result CompressAndStorePipelineCacheVk(VkPhysicalDeviceProperties physica
}
angle::MemoryBuffer keyData;
ANGLE_VK_CHECK(displayVk, keyData.resize(kBlobHeaderSize + chunkSize),
VK_ERROR_INITIALIZATION_FAILED);
if (!keyData.resize(kBlobHeaderSize + chunkSize))
{
return false;
}
ASSERT(numChunks <= UINT8_MAX);
keyData.data()[0] = static_cast<uint8_t>(numChunks);
......@@ -565,12 +569,59 @@ angle::Result CompressAndStorePipelineCacheVk(VkPhysicalDeviceProperties physica
// Create unique hash key.
egl::BlobCache::Key chunkCacheHash;
ComputePipelineCacheVkChunkKey(physicalDeviceProperties, chunkIndex, &chunkCacheHash);
displayVk->getBlobCache()->putApplication(chunkCacheHash, keyData);
}
*success = true;
return angle::Result::Continue;
return true;
}
class CompressAndStorePipelineCacheTask : public angle::Closure
{
public:
CompressAndStorePipelineCacheTask(DisplayVk *displayVk,
ContextVk *contextVk,
std::vector<uint8_t> &&cacheData,
size_t kMaxTotalSize)
: mDisplayVk(displayVk),
mContextVk(contextVk),
mCacheData(std::move(cacheData)),
mMaxTotalSize(kMaxTotalSize),
mResult(true)
{}
void operator()() override
{
ANGLE_TRACE_EVENT0("gpu.angle", "CompressAndStorePipelineCacheVk");
mResult = CompressAndStorePipelineCacheVk(
mContextVk->getRenderer()->getPhysicalDeviceProperties(), mDisplayVk, mContextVk,
mCacheData, mMaxTotalSize);
}
bool getResult() { return mResult; }
private:
DisplayVk *mDisplayVk;
ContextVk *mContextVk;
std::vector<uint8_t> mCacheData;
size_t mMaxTotalSize;
bool mResult;
};
// Binds a worker-pool WaitableEvent to the compression task it signals, so
// callers can both poll completion (isReady) and query whether the compression
// and blob-cache storage succeeded.
class WaitableCompressEventImpl : public WaitableCompressEvent
{
  public:
    WaitableCompressEventImpl(std::shared_ptr<angle::WaitableEvent> waitableEvent,
                              std::shared_ptr<CompressAndStorePipelineCacheTask> compressTask)
        // Move the shared_ptrs into place: the caller's copies are temporaries,
        // so copying them here would only add atomic refcount churn.
        : WaitableCompressEvent(std::move(waitableEvent)), mCompressTask(std::move(compressTask))
    {}

    // Valid only once isReady() is true; see the task's getResult().
    bool getResult() override { return mCompressTask->getResult(); }

  private:
    std::shared_ptr<CompressAndStorePipelineCacheTask> mCompressTask;
};
angle::Result GetAndDecompressPipelineCacheVk(VkPhysicalDeviceProperties physicalDeviceProperties,
DisplayVk *displayVk,
angle::MemoryBuffer *uncompressedData,
......@@ -765,6 +816,12 @@ void RendererVk::onDestroy(vk::Context *context)
mInstance = VK_NULL_HANDLE;
}
if (mCompressEvent)
{
mCompressEvent->wait();
mCompressEvent.reset();
}
mMemoryProperties.destroy();
mPhysicalDevice = VK_NULL_HANDLE;
}
......@@ -2452,7 +2509,7 @@ angle::Result RendererVk::getPipelineCacheSize(DisplayVk *displayVk, size_t *pip
return angle::Result::Continue;
}
angle::Result RendererVk::syncPipelineCacheVk(DisplayVk *displayVk, ContextVk *contextVk)
angle::Result RendererVk::syncPipelineCacheVk(DisplayVk *displayVk, const gl::Context *context)
{
// TODO: Synchronize access to the pipeline/blob caches?
ASSERT(mPipelineCache.valid());
......@@ -2480,13 +2537,23 @@ angle::Result RendererVk::syncPipelineCacheVk(DisplayVk *displayVk, ContextVk *c
return angle::Result::Continue;
}
angle::MemoryBuffer *pipelineCacheData = nullptr;
ANGLE_VK_CHECK_ALLOC(displayVk,
displayVk->getScratchBuffer(pipelineCacheSize, &pipelineCacheData));
ContextVk *contextVk = vk::GetImpl(context);
// Use worker thread pool to complete compression.
// If the last task hasn't been finished, skip the syncing.
if (mCompressEvent && (!mCompressEvent->isReady() || !mCompressEvent->getResult()))
{
ANGLE_PERF_WARNING(contextVk->getDebug(), GL_DEBUG_SEVERITY_LOW,
"Skip syncing pipeline cache data when the last task is not ready or "
"the compress task failed.");
return angle::Result::Continue;
}
std::vector<uint8_t> pipelineCacheData(pipelineCacheSize);
size_t oldPipelineCacheSize = pipelineCacheSize;
VkResult result =
mPipelineCache.getCacheData(mDevice, &pipelineCacheSize, pipelineCacheData->data());
mPipelineCache.getCacheData(mDevice, &pipelineCacheSize, pipelineCacheData.data());
// We don't need all of the cache data, so just make sure we at least got the header
// Vulkan Spec 9.6. Pipeline Cache
// https://www.khronos.org/registry/vulkan/specs/1.1-extensions/html/chap9.html#pipelines-cache
......@@ -2511,21 +2578,42 @@ angle::Result RendererVk::syncPipelineCacheVk(DisplayVk *displayVk, ContextVk *c
// If vkGetPipelineCacheData ends up writing fewer bytes than requested, zero out the rest of
// the buffer to avoid leaking garbage memory.
ASSERT(pipelineCacheSize <= pipelineCacheData->size());
if (pipelineCacheSize < pipelineCacheData->size())
ASSERT(pipelineCacheSize <= pipelineCacheData.size());
if (pipelineCacheSize < pipelineCacheData.size())
{
memset(pipelineCacheData->data() + pipelineCacheSize, 0,
pipelineCacheData->size() - pipelineCacheSize);
memset(pipelineCacheData.data() + pipelineCacheSize, 0,
pipelineCacheData.size() - pipelineCacheSize);
}
bool success = false;
ANGLE_TRY(CompressAndStorePipelineCacheVk(mPhysicalDeviceProperties, displayVk, contextVk,
pipelineCacheData, &success));
if (success)
if (context->getFrontendFeatures().enableCompressingPipelineCacheInThreadPool.enabled)
{
// The function zlib_internal::GzipCompressHelper() can compress 10M of pipeline cache data
// into about 2M; to save compression time, set kMaxTotalSize to 10M.
constexpr size_t kMaxTotalSize = 10 * 1024 * 1024;
// Create task to compress.
auto compressAndStorePipelineCacheTask =
std::make_shared<CompressAndStorePipelineCacheTask>(
displayVk, contextVk, std::move(pipelineCacheData), kMaxTotalSize);
mCompressEvent = std::make_shared<WaitableCompressEventImpl>(
angle::WorkerThreadPool::PostWorkerTask(context->getWorkerThreadPool(),
compressAndStorePipelineCacheTask),
compressAndStorePipelineCacheTask);
mPipelineCacheDirty = false;
}
else
{
// If enableCompressingPipelineCacheInThreadPool is disabled, to avoid the risk, set
// kMaxTotalSize to 64k.
constexpr size_t kMaxTotalSize = 64 * 1024;
bool compressResult = CompressAndStorePipelineCacheVk(
mPhysicalDeviceProperties, displayVk, contextVk, pipelineCacheData, kMaxTotalSize);
if (compressResult)
{
mPipelineCacheDirty = false;
}
}
return angle::Result::Continue;
}
......
......@@ -26,6 +26,7 @@
#include "common/vulkan/vulkan_icd.h"
#include "libANGLE/BlobCache.h"
#include "libANGLE/Caps.h"
#include "libANGLE/WorkerThread.h"
#include "libANGLE/renderer/vulkan/CommandProcessor.h"
#include "libANGLE/renderer/vulkan/DebugAnnotatorVk.h"
#include "libANGLE/renderer/vulkan/QueryVk.h"
......@@ -96,6 +97,25 @@ void CollectGarbage(std::vector<vk::GarbageObject> *garbageOut, ArgT object, Arg
CollectGarbage(garbageOut, objectsIn...);
}
// Base class wrapping the worker-thread WaitableEvent of a pipeline cache
// compression task.  Derived classes report whether the compression and
// blob-cache storage actually succeeded.
class WaitableCompressEvent
{
  public:
    // explicit: a WaitableEvent should never silently convert into this type.
    explicit WaitableCompressEvent(std::shared_ptr<angle::WaitableEvent> waitableEvent)
        : mWaitableEvent(std::move(waitableEvent))
    {}

    virtual ~WaitableCompressEvent() = default;

    // Blocks until the posted task has finished running.
    void wait() { mWaitableEvent->wait(); }

    // Non-blocking completion check.
    bool isReady() { return mWaitableEvent->isReady(); }

    // True if compression and storage succeeded; only meaningful once the
    // event is ready.
    virtual bool getResult() = 0;

  private:
    std::shared_ptr<angle::WaitableEvent> mWaitableEvent;
};
class RendererVk : angle::NonCopyable
{
public:
......@@ -165,7 +185,7 @@ class RendererVk : angle::NonCopyable
const vk::Format &getFormat(angle::FormatID formatID) const { return mFormatTable[formatID]; }
angle::Result getPipelineCacheSize(DisplayVk *displayVk, size_t *pipelineCacheSizeOut);
angle::Result syncPipelineCacheVk(DisplayVk *displayVk, ContextVk *contextVk);
angle::Result syncPipelineCacheVk(DisplayVk *displayVk, const gl::Context *context);
// Issues a new serial for linked shader modules. Used in the pipeline cache.
Serial issueShaderSerial();
......@@ -509,6 +529,9 @@ class RendererVk : angle::NonCopyable
// Note that this mask can have bits set that don't correspond to valid stages, so it's strictly
// only useful for masking out unsupported stages in an otherwise valid set of stages.
VkPipelineStageFlags mSupportedVulkanPipelineStageMask;
// Use thread pool to compress cache data.
std::shared_ptr<rx::WaitableCompressEvent> mCompressEvent;
};
} // namespace rx
......
......@@ -1537,7 +1537,7 @@ angle::Result WindowSurfaceVk::doDeferredAcquireNextImage(const gl::Context *con
}
RendererVk *renderer = contextVk->getRenderer();
ANGLE_TRY(renderer->syncPipelineCacheVk(displayVk, contextVk));
ANGLE_TRY(renderer->syncPipelineCacheVk(displayVk, context));
return angle::Result::Continue;
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment