Vulkan: Faster state transitions.

Implements a transition table from Pipeline Cache entry to state change neighbouring Pipeline Cache entries. We use a 64-bit mask to do a quick scan over the pipeline desc. This ends up being a lot faster than doing a full hash and memcmp over the pipeline description. Note that there could be future optimizations to this design. We might keep a hash map of the pipeline transitions instead of a list. Or use a sorted list. This could speed up the search when there are many transitions for cache entries. Also we could skip the transition table and opt to do a full hash when there are more than a configurable number of dirty states. This might be a bit faster in some cases. Likely this will be something we can add performance tests for in the future. Documentation is also added in a README file for the Vulkan back end. This will be extended over time. Improves performance about 30-35% on the VBO state change test. Bug: angleproject:3013 Change-Id: I793f9e3efd8887acf00ad60e4ac2502a54c95dee Reviewed-on: https://chromium-review.googlesource.com/c/1369287 Commit-Queue: Jamie Madill <jmadill@chromium.org> Reviewed-by: Yuly Novikov <ynovikov@chromium.org>

Vulkan: Faster state transitions.
3f0c4a56 · Jamie Madill · Commit Bot · 80766cfa · 3f0c4a56 · 3f0c4a56
Commit 3f0c4a56 authored Jan 10, 2019 by Jamie Madill Committed by Commit Bot Jan 10, 2019
11 changed files
--- a/src/common/mathutil.h
+++ b/src/common/mathutil.h
@@ -1271,6 +1271,11 @@ inline uint16_t RotR16(uint16_t x, int8_t r)
 #    define ANGLE_ROTR16(x, y) ::rx::RotR16(x, y)
 #endif  // namespace rx
+constexpr unsigned int Log2(unsigned int bytes)
+{
+    return bytes == 1 ? 0 : (1 + Log2(bytes / 2));
+}
 }  // namespace rx
 #endif  // COMMON_MATHUTIL_H_
--- a/src/libANGLE/formatutils.cpp
+++ b/src/libANGLE/formatutils.cpp
@@ -40,16 +40,11 @@ bool CheckedMathResult(const CheckedNumeric<GLuint> &value, GLuint *resultOut)
    }
 }
-constexpr GLuint Log2(GLuint bytes)
-{
-    return bytes == 1 ? 0 : (1 + Log2(bytes / 2));
-}
 constexpr uint32_t PackTypeInfo(GLuint bytes, bool specialized)
 {
    // static_assert within constexpr requires c++17
    // static_assert(isPow2(bytes));
-    return bytes | (Log2(bytes) << 8) | (specialized << 16);
+    return bytes | (rx::Log2(bytes) << 8) | (specialized << 16);
 }
 }  // anonymous namespace

--- a/src/libANGLE/renderer/vulkan/ContextVk.cpp
+++ b/src/libANGLE/renderer/vulkan/ContextVk.cpp
--- a/src/libANGLE/renderer/vulkan/ContextVk.h
+++ b/src/libANGLE/renderer/vulkan/ContextVk.h
@@ -179,7 +179,8 @@ class ContextVk : public ContextImpl, public vk::Context
                                              GLuint relativeOffset)
    {
        invalidateVertexAndIndexBuffers();
-        mGraphicsPipelineDesc->updateVertexInput(static_cast<uint32_t>(attribIndex), stride,
+        mGraphicsPipelineDesc->updateVertexInput(&mGraphicsPipelineTransition,
+                                                 static_cast<uint32_t>(attribIndex), stride,
                                                 divisor, format, relativeOffset);
    }
@@ -270,7 +271,6 @@ class ContextVk : public ContextImpl, public vk::Context
        mDirtyBits.set(DIRTY_BIT_PIPELINE);
        mDirtyBits.set(DIRTY_BIT_VIEWPORT);
        mDirtyBits.set(DIRTY_BIT_SCISSOR);
-        mCurrentPipeline = nullptr;
    }
    void invalidateCurrentTextures();
@@ -291,12 +291,13 @@ class ContextVk : public ContextImpl, public vk::Context
    angle::Result handleDirtyViewport(const gl::Context *context, vk::CommandBuffer *commandBuffer);
    angle::Result handleDirtyScissor(const gl::Context *context, vk::CommandBuffer *commandBuffer);
-    vk::PipelineAndSerial *mCurrentPipeline;
+    vk::PipelineHelper *mCurrentPipeline;
    gl::PrimitiveMode mCurrentDrawMode;
    // Keep a cached pipeline description structure that can be used to query the pipeline cache.
    // Kept in a pointer so allocations can be aligned, and structs can be portably packed.
    std::unique_ptr<vk::GraphicsPipelineDesc> mGraphicsPipelineDesc;
+    vk::GraphicsPipelineTransitionBits mGraphicsPipelineTransition;
    // The descriptor pools are externally sychronized, so cannot be accessed from different
    // threads simultaneously. Hence, we keep them in the ContextVk instead of the RendererVk.

--- a/src/libANGLE/renderer/vulkan/ProgramVk.h
+++ b/src/libANGLE/renderer/vulkan/ProgramVk.h
@@ -123,7 +123,8 @@ class ProgramVk : public ProgramImpl
                                      gl::PrimitiveMode mode,
                                      const vk::GraphicsPipelineDesc &desc,
                                      const gl::AttributesMask &activeAttribLocations,
-                                      vk::PipelineAndSerial **pipelineOut)
+                                      const vk::GraphicsPipelineDesc **descPtrOut,
+                                      vk::PipelineHelper **pipelineOut)
    {
        vk::ShaderProgramHelper *shaderProgram;
        ANGLE_TRY(initShaders(contextVk, mode, &shaderProgram));
@@ -132,7 +133,7 @@ class ProgramVk : public ProgramImpl
        return shaderProgram->getGraphicsPipeline(
            contextVk, &renderer->getRenderPassCache(), renderer->getPipelineCache(),
            renderer->getCurrentQueueSerial(), mPipelineLayout.get(), desc, activeAttribLocations,
-            pipelineOut);
+            descPtrOut, pipelineOut);
    }
  private:

--- a/src/libANGLE/renderer/vulkan/README.md
+++ b/src/libANGLE/renderer/vulkan/README.md
+# ANGLE: Vulkan Back-end
+ANGLE's Vulkan back-end implementation lives in this folder.
+[Vulkan](https://www.khronos.org/vulkan/) is an explicit graphics API. It has a lot in common with
+other explicit APIs such as Microsoft's
+[D3D12](https://docs.microsoft.com/en-us/windows/desktop/direct3d12/directx-12-programming-guide)
+and Apple's [Metal](https://developer.apple.com/metal/). Compared to APIs like OpenGL or D3D11,
+explicit APIs can offer a number of significant benefits:
+ * Lower API call CPU overhead.
+ * A smaller API surface with more direct hardware control.
+ * Better support for multi-core programming.
+ * Vulkan in particular has open-source tooling and tests.
+## Back-end Design
+The [RendererVk](RendererVk.cpp) is a singleton. RendererVk owns shared global resources like the
+[VkDevice](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkDevice.html),
+[VkQueue](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkQueue.html), the
+[Vulkan format tables](vk_format_utils.h) and [internal Vulkan shaders](shaders). The back-end
+creates a new [ContextVk](ContextVk.cpp) instance to manage each allocated OpenGL Context. ContextVk
+processes state changes and handles action commands like `glDrawArrays` and `glDrawElements`.
+### Fast OpenGL State Transitions
+Typical OpenGL programs issue a few small state change commands between draw call commands. We want
+the typical app's use case to be as fast as possible so this leads to unique performance challenges.
+Vulkan is quite different from OpenGL because it requires a separate compiled
+[VkPipeline](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkPipeline.html)
+for each state vector. Compiling VkPipelines is multiple orders of magnitude slower than enabling or
+disabling an OpenGL render state. To speed this up we use three levels of caching when transitioning
+states in the Vulkan back-end.
+The first level is the driver's
+[VkPipelineCache](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkPipelineCache.html). The driver cache reduces pipeline recompilation time
+significantly. But even cached pipeline recompilations are orders of magnitude slower than OpenGL
+state changes.
+The second level cache is an ANGLE-owned hash map from OpenGL state vectors to compiled pipelines.
+See
+[GraphicsPipelineCache](https://chromium.googlesource.com/angle/angle/+/225f08bf85a368f905362cdd1366e4795680452c/src/libANGLE/renderer/vulkan/vk_cache_utils.h#498)
+in [vk_cache_utils.h](vk_cache_utils.h). ANGLE's
+[GraphicsPipelineDesc](https://chromium.googlesource.com/angle/angle/+/225f08bf85a368f905362cdd1366e4795680452c/src/libANGLE/renderer/vulkan/vk_cache_utils.h#244)
+class is a tightly packed 256-byte description of the current OpenGL rendering state. We
+also use [xxHash](https://github.com/Cyan4973/xxHash) for the fastest possible hash computation.
+The hash map speeds up state changes considerably. But it is still significantly slower than OpenGL
+implementations.
+To get the best performance we use a transition table from each OpenGL state vector to neighbouring
+state vectors. The transition table points from GraphicsPipelineCache entries directly to
+neighbouring VkPipeline objects. When the application changes state the state change bits are
+recorded into a compact bit mask that covers the GraphicsPipelineDesc state vector. Then on the next
+draw call we scan the transition bit mask and compare the GraphicsPipelineDesc of the current state
+vector and the state vector of the cached transition. With the hash map we compute a hash over the
+entire state vector and then do a 256-byte `memcmp` to guard against hash collisions. With the
+transition table we will only compare as many bytes as were changed in the transition bit mask. By
+skipping the expensive hashing and `memcmp` we can get performance as good as or better than native
+OpenGL drivers.
+Note that the current design of the transition table stores transitions in an unsorted list. If
+applications map from one state to many, this will increase the transition time. This could be
+improved in the future using a faster look up. For instance we could keep a sorted transition table
+or use a small hash map for transitions.
--- a/src/libANGLE/renderer/vulkan/UtilsVk.cpp
+++ b/src/libANGLE/renderer/vulkan/UtilsVk.cpp
@@ -325,25 +325,30 @@ angle::Result UtilsVk::setupProgram(vk::Context *context,
    Serial serial = renderer->getCurrentQueueSerial();
-    vk::PipelineAndSerial *pipelineAndSerial;
    if (isCompute)
    {
+        vk::PipelineAndSerial *pipelineAndSerial;
        program->setShader(gl::ShaderType::Compute, fsCsShader);
        ANGLE_TRY(program->getComputePipeline(context, pipelineLayout.get(), &pipelineAndSerial));
+        pipelineAndSerial->updateSerial(serial);
+        commandBuffer->bindPipeline(bindPoint, pipelineAndSerial->get());
    }
    else
    {
        program->setShader(gl::ShaderType::Vertex, vsShader);
        program->setShader(gl::ShaderType::Fragment, fsCsShader);
+        // This value is not used but is passed to getGraphicsPipeline to avoid a nullptr check.
+        const vk::GraphicsPipelineDesc *descPtr;
+        vk::PipelineHelper *helper;
        ANGLE_TRY(program->getGraphicsPipeline(
            context, &renderer->getRenderPassCache(), renderer->getPipelineCache(), serial,
-            pipelineLayout.get(), *pipelineDesc, gl::AttributesMask(), &pipelineAndSerial));
+            pipelineLayout.get(), *pipelineDesc, gl::AttributesMask(), &descPtr, &helper));
+        helper->updateSerial(serial);
+        commandBuffer->bindPipeline(bindPoint, helper->getPipeline());
    }
-    commandBuffer->bindPipeline(bindPoint, pipelineAndSerial->get());
-    pipelineAndSerial->updateSerial(serial);
    if (descriptorSet != VK_NULL_HANDLE)
    {
        commandBuffer->bindDescriptorSets(bindPoint, pipelineLayout.get(), 0, 1, &descriptorSet, 0,
@@ -619,8 +624,8 @@ angle::Result UtilsVk::clearImage(ContextVk *contextVk,
    vk::GraphicsPipelineDesc pipelineDesc;
    pipelineDesc.initDefaults();
-    pipelineDesc.updateColorWriteMask(params.colorMaskFlags, *params.alphaMask);
+    pipelineDesc.setColorWriteMask(params.colorMaskFlags, *params.alphaMask);
-    pipelineDesc.updateRenderPassDesc(*params.renderPassDesc);
+    pipelineDesc.setRenderPassDesc(*params.renderPassDesc);
    vk::ShaderLibrary &shaderLibrary                    = renderer->getShaderLibrary();
    vk::RefCounted<vk::ShaderAndSerial> *vertexShader   = nullptr;
@@ -707,7 +712,7 @@ angle::Result UtilsVk::copyImage(vk::Context *context,
    vk::GraphicsPipelineDesc pipelineDesc;
    pipelineDesc.initDefaults();
-    pipelineDesc.updateRenderPassDesc(renderPassDesc);
+    pipelineDesc.setRenderPassDesc(renderPassDesc);
    gl::Rectangle renderArea;
    renderArea.x      = params.destOffset[0];

--- a/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp
+++ b/src/libANGLE/renderer/vulkan/vk_cache_utils.cpp
--- a/src/libANGLE/renderer/vulkan/vk_cache_utils.h
+++ b/src/libANGLE/renderer/vulkan/vk_cache_utils.h
--- a/src/libANGLE/renderer/vulkan/vk_helpers.h
+++ b/src/libANGLE/renderer/vulkan/vk_helpers.h
@@ -641,7 +641,8 @@ class ShaderProgramHelper : angle::NonCopyable
        const PipelineLayout &pipelineLayout,
        const GraphicsPipelineDesc &pipelineDesc,
        const gl::AttributesMask &activeAttribLocationsMask,
-        PipelineAndSerial **pipelineOut)
+        const vk::GraphicsPipelineDesc **descPtrOut,
+        PipelineHelper **pipelineOut)
    {
        // Pull in a compatible RenderPass.
        vk::RenderPass *compatibleRenderPass = nullptr;
@@ -651,7 +652,7 @@ class ShaderProgramHelper : angle::NonCopyable
        return mGraphicsPipelines.getPipeline(
            context, pipelineCache, *compatibleRenderPass, pipelineLayout,
            activeAttribLocationsMask, mShaders[gl::ShaderType::Vertex].get().get(),
-            mShaders[gl::ShaderType::Fragment].get().get(), pipelineDesc, pipelineOut);
+            mShaders[gl::ShaderType::Fragment].get().get(), pipelineDesc, descPtrOut, pipelineOut);
    }
    angle::Result getComputePipeline(Context *context,
@@ -661,6 +662,8 @@ class ShaderProgramHelper : angle::NonCopyable
  private:
    gl::ShaderMap<BindingPointer<ShaderAndSerial>> mShaders;
    GraphicsPipelineCache mGraphicsPipelines;
+    // We should probably use PipelineHelper here so we can remove PipelineAndSerial.
    PipelineAndSerial mComputePipeline;
 };
 }  // namespace vk

--- a/src/tests/perf_tests/VulkanPipelineCachePerf.cpp
+++ b/src/tests/perf_tests/VulkanPipelineCachePerf.cpp
@@ -83,14 +83,15 @@ void VulkanPipelineCachePerfTest::step()
    vk::PipelineLayout pl;
    vk::PipelineCache pc;
    vk::ShaderModule sm;
-    vk::PipelineAndSerial *result = nullptr;
+    const vk::GraphicsPipelineDesc *desc = nullptr;
+    vk::PipelineHelper *result           = nullptr;
    gl::AttributesMask am;
    for (unsigned int iteration = 0; iteration < kIterationsPerStep; ++iteration)
    {
        for (const auto &hit : mCacheHits)
        {
-            (void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, hit, &result);
+            (void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, hit, &desc, &result);
        }
    }
@@ -98,7 +99,7 @@ void VulkanPipelineCachePerfTest::step()
         ++missCount, ++mMissIndex)
    {
        const auto &miss = mCacheMisses[mMissIndex];
-        (void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, miss, &result);
+        (void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, miss, &desc, &result);
    }
 }