Vulkan: revamp present semaphore management

See doc/PresentSemaphores.md for details. Bug: angleproject:3450 Bug: angleproject:3670 Change-Id: I52d5bd13a4af25f224d386c9584525c182af6f17 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1776880 Reviewed-by: Tim Van Patten <timvp@google.com> Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>

Vulkan: revamp present semaphore management
c3f57231 · Shahbaz Youssefi · Commit Bot · f0640bc3 · c3f57231 · c3f57231
Commit c3f57231 authored Aug 28, 2019 by Shahbaz Youssefi Committed by Commit Bot Aug 31, 2019
5 changed files
--- a/src/libANGLE/renderer/vulkan/RendererVk.cpp
+++ b/src/libANGLE/renderer/vulkan/RendererVk.cpp
@@ -1569,7 +1569,7 @@ angle::Result RendererVk::newSharedFence(vk::Context *context,
    }
    else
    {
-        mFenceRecycler.fetch(mDevice, &fence);
+        mFenceRecycler.fetch(&fence);
        ANGLE_VK_TRY(context, fence.reset(mDevice));
    }
    sharedFenceOut->assign(mDevice, std::move(fence));

--- a/src/libANGLE/renderer/vulkan/SurfaceVk.cpp
+++ b/src/libANGLE/renderer/vulkan/SurfaceVk.cpp
--- a/src/libANGLE/renderer/vulkan/SurfaceVk.h
+++ b/src/libANGLE/renderer/vulkan/SurfaceVk.h
@@ -99,6 +99,78 @@ class OffscreenSurfaceVk : public SurfaceVk
    AttachmentImage mDepthStencilAttachment;
 };

+// Data structures used in WindowSurfaceVk
+namespace impl
+{
+// The submission fence of the context used to throttle the CPU.
+struct SwapHistory : angle::NonCopyable
+{
+    SwapHistory();
+    SwapHistory(SwapHistory &&other) = delete;
+    SwapHistory &operator=(SwapHistory &&other) = delete;
+    ~SwapHistory();
+
+    void destroy(RendererVk *renderer);
+
+    angle::Result waitFence(ContextVk *contextVk);
+
+    // Fence associated with the last submitted work to render to this swapchain image.
+    vk::Shared<vk::Fence> sharedFence;
+};
+static constexpr size_t kSwapHistorySize = 2;
+
+// Old swapchain and associated present semaphores that need to be scheduled for destruction when
+// appropriate.
+struct SwapchainCleanupData : angle::NonCopyable
+{
+    SwapchainCleanupData();
+    SwapchainCleanupData(SwapchainCleanupData &&other);
+    ~SwapchainCleanupData();
+
+    void destroy(VkDevice device, vk::Recycler<vk::Semaphore> *semaphoreRecycler);
+
+    // The swapchain to be destroyed.
+    VkSwapchainKHR swapchain = VK_NULL_HANDLE;
+    // Any present semaphores that were pending destruction at the time the swapchain was
+    // recreated will be scheduled for destruction at the same time as the swapchain.
+    std::vector<vk::Semaphore> semaphores;
+};
+
+// A circular buffer per image stores the semaphores used for presenting that image.  Taking the
+// swap history into account, only the oldest semaphore is guaranteed to be no longer in use by the
+// presentation engine.  See doc/PresentSemaphores.md for details.
+//
+// Old swapchains are scheduled to be destroyed at the same time as the first semaphore used to
+// present an image of the new swapchain.  This is to ensure that the presentation engine is no
+// longer presenting an image from the old swapchain.
+struct ImagePresentHistory : angle::NonCopyable
+{
+    ImagePresentHistory();
+    ImagePresentHistory(ImagePresentHistory &&other);
+    ~ImagePresentHistory();
+
+    vk::Semaphore semaphore;
+    std::vector<SwapchainCleanupData> oldSwapchains;
+};
+
+// Swapchain images and their associated objects.
+struct SwapchainImage : angle::NonCopyable
+{
+    SwapchainImage();
+    SwapchainImage(SwapchainImage &&other);
+    ~SwapchainImage();
+
+    vk::ImageHelper image;
+    vk::ImageView imageView;
+    vk::Framebuffer framebuffer;
+
+    // A circular array of semaphores used for presenting this image.
+    static constexpr size_t kPresentHistorySize = kSwapHistorySize + 1;
+    std::array<ImagePresentHistory, kPresentHistorySize> presentHistory;
+    size_t currentPresentHistoryIndex = 0;
+};
+}  // namespace impl
+
 class WindowSurfaceVk : public SurfaceVk
 {
  public:
@@ -163,6 +235,7 @@ class WindowSurfaceVk : public SurfaceVk
    angle::Result checkForOutOfDateSwapchain(ContextVk *contextVk,
                                             uint32_t swapHistoryIndex,
                                             bool presentOutOfDate);
+    angle::Result resizeSwapchainImages(vk::Context *context, uint32_t imageCount);
    void releaseSwapchainImages(ContextVk *contextVk);
    void destroySwapChainImages(DisplayVk *displayVk);
    VkResult nextSwapchainImage(vk::Context *context);
@@ -173,6 +246,8 @@ class WindowSurfaceVk : public SurfaceVk

    angle::Result swapImpl(const gl::Context *context, EGLint *rects, EGLint n_rects);

+    angle::Result newPresentSemaphore(vk::Context *context, vk::Semaphore *semaphoreOut);
+
    bool isMultiSampled() const;

    VkSurfaceCapabilitiesKHR mSurfaceCaps;
@@ -186,46 +261,25 @@ class WindowSurfaceVk : public SurfaceVk
    VkSurfaceTransformFlagBitsKHR mPreTransform;
    VkCompositeAlphaFlagBitsKHR mCompositeAlpha;

-    uint32_t mCurrentSwapchainImageIndex;
-
-    struct SwapchainImage : angle::NonCopyable
-    {
-        SwapchainImage();
-        SwapchainImage(SwapchainImage &&other);
-        ~SwapchainImage();
+    // A circular buffer that stores the submission fence of the context on every swap.  The CPU is
+    // throttled by waiting for the 2nd previous serial to finish.
+    std::array<impl::SwapHistory, impl::kSwapHistorySize> mSwapHistory;
+    size_t mCurrentSwapHistoryIndex;

-        vk::ImageHelper image;
-        vk::ImageView imageView;
-        vk::Framebuffer framebuffer;
-    };
+    // The previous swapchain which needs to be scheduled for destruction when appropriate.  This
+    // will be done when the first image of the current swapchain is presented.  If there were
+    // older swapchains pending destruction when the swapchain is recreated, they will accumulate
+    // and be destroyed with the previous swapchain.
+    //
+    // Note that if the user resizes the window such that the swapchain is recreated every frame,
+    // this array can grow indefinitely.
+    std::vector<impl::SwapchainCleanupData> mOldSwapchains;

-    std::vector<SwapchainImage> mSwapchainImages;
+    std::vector<impl::SwapchainImage> mSwapchainImages;
    vk::Semaphore mAcquireImageSemaphore;
+    uint32_t mCurrentSwapchainImageIndex;

-    // A circular buffer that stores the serial of the renderer on every swap.  The CPU is
-    // throttled by waiting for the 2nd previous serial to finish.  Old swapchains are scheduled to
-    // be destroyed at the same time.
-    struct SwapHistory : angle::NonCopyable
-    {
-        SwapHistory();
-        SwapHistory(SwapHistory &&other) = delete;
-        SwapHistory &operator=(SwapHistory &&other) = delete;
-        ~SwapHistory();
-
-        void destroy(RendererVk *renderer);
-
-        angle::Result waitFence(ContextVk *contextVk);
-
-        // Fence associated with the last submitted work to render to this swapchain image.
-        vk::Shared<vk::Fence> sharedFence;
-
-        vk::Semaphore presentImageSemaphore;
-
-        VkSwapchainKHR swapchain = VK_NULL_HANDLE;
-    };
-    static constexpr size_t kSwapHistorySize = 2;
-    std::array<SwapHistory, kSwapHistorySize> mSwapHistory;
-    size_t mCurrentSwapHistoryIndex;
+    vk::Recycler<vk::Semaphore> mPresentSemaphoreRecycler;

    // Depth/stencil image.  Possibly multisampled.
    vk::ImageHelper mDepthStencilImage;

--- a/src/libANGLE/renderer/vulkan/doc/PresentSemaphores.md
+++ b/src/libANGLE/renderer/vulkan/doc/PresentSemaphores.md
+# Queue Present Wait Semaphore Management
+
+The following shorthand notations are used throughout this document:
+
+- PE: Presentation Engine
+- ANI: vkAcquireNextImageKHR
+- QS: vkQueueSubmit
+- QP: vkQueuePresentKHR
+- W: Wait
+- S: Signal
+- R: Render
+- P: Present
+- SN: Semaphore N
+- IN: Swapchain image N
+- FN: Fence N
+
+---
+
+## Introduction
+
+Vulkan requires the application (ANGLE in this case) to acquire swapchain images and queue them for
+presentation, synchronizing GPU submissions with semaphores.  A single frame looks like the
+following:
+
+    CPU: ANI  ... QS   ... QP
+         S:S1     W:S1     W:S2
+                  S:S2
+    GPU:          <------------ R ----------->
+     PE:                                      <-------- P ------>
+
+That is, the GPU starts rendering after submission, and the presentation is done when rendering is
+finished.  With multiple frames, the pipeline looks different based on present mode.  Let's focus on
+FIFO (the arguments in this document translate to all modes) with 3 images:
+
+    CPU: QS QP QS QP QS QP QS QP
+         I1 I1 I2 I2 I3 I3 I1 I1
+    GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
+     PE:                 <----- P I1 -----><----- P I2 -----><----- P I3 -----><----- P I1 ----->
+
+First, an issue is evident here.  The CPU is submitting jobs and queuing images for presentation
+faster than the GPU can render them or the PE can view them.  This causes the length of the PE queue
+to grow indefinitely, resulting in larger and larger input lag.
+
+To address this issue, ANGLE paces the CPU such that the length of the PE queue is kept at a maximum
+of 1 image (i.e. one image is being presented, and another one is in queue):
+
+    CPU: QS   QS          W:F1 QS         W:F2 QS
+         I1   I2               I3              I1
+         S:F1 S:F2             S:F3            S:F4
+    GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
+
+> Note: While this works in heavy applications (as the rendering time is almost as long as the frame
+> (i.e. present time), in which case pacing the submissions similarly paces the presentation), it's
+> not technically keeping the PE queue length 1, but rather below n+2 where n is the number of
+> swapchain images.
+>
+> To understand why, imagine a FIFO swapchain with 1000 images and submissions that are
+> infinitesimally short.  In this case, the CPU pacing is effectively a no-op (as the GPU instantly
+> finishes jobs) for the first 1002 submissions.  The 1003rd submission waits for F1001 (which uses
+> I1).  However, the 1001st submission will not start until the PE is finished presenting I1 (at the
+> next V-Sync).  The CPU then waits for V-Sync before the 1003rd submission.  The CPU waits for one
+> V-Sync for every subsequent submission, keeping the length of the queue 1002.
+> [`VK_GOOGLE_display_timing`][DisplayTimingGOOGLE] is likely a solution to this problem.
+
+Associated with each QP operation is a semaphore signaled by the preceding QS and waited on by the
+PE before the image can be presented.  Currently, there's no feedback from Vulkan (See [internal
+Khronos issue][VulkanIssue1060]) regarding _when_ the PE has actually finished waiting on the
+semaphore!  This means that the application cannot generally know when to destroy the corresponding
+semaphore.  However, taking ANGLE's CPU pacing into account, we are able to destroy (or rather
+reuse) semaphores when they are provably unused.
+
+The interested reader may follow the discussion in this abandoned [gerrit CL][CL1757018] for more
+background and ideas.
+
+[DisplayTimingGOOGLE]: https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VK_GOOGLE_display_timing.html
+[VulkanIssue1060]: https://gitlab.khronos.org/vulkan/vulkan/issues/1060
+[CL1757018]: https://chromium-review.googlesource.com/c/angle/angle/+/1757018
+
+## Determining When a QP Semaphore is Waited On
+
+Let's combine the above diagrams with all the details:
+
+    CPU: ANI   | QS    | QP    | ANI   | QS    | QP    | ANI   | W:F1 | QS    | QP    | ANI   | W:F2 | QS    | QP
+         I1    | I1    | I1    | I2    | I2    | I2    | I3    |      | I3    | I3    | I1    |      | I1    | I1
+         S:SA1 | W:SA1 |       | S:SA2 | W:SA2 |       | S:SA3 |      | W:SA3 |       | S:SA4 |      | W:SA4 |
+               | S:SP1 | W:SP1 |       | S:SP2 | W:SP2 |       |      | S:SP3 | W:SP3 |       |      | S:SP4 | W:SP4
+               | S:F1  |       |       | S:F2  |       |       |      | S:F3  |       |       |      | S:F4  |
+
+Let's focus only on sequences that return the same image:
+
+    CPU: ANI   | W:F(X-2) | QS    | QP    | ... | ANI   | W:F(Y-2) | QS    | QP
+         I1    |          | I1    | I1    |     | I1    |          | I1    | I1
+         S:SAX |          | W:SAX |       |     | S:SAY |          | W:SAY |
+               |          | S:SPX | W:SPX |     |       |          | S:SPY | W:SPY
+               |          | S:FX  |       |     |       |          | S:FY  |
+
+Note that X and Y are arbitrarily distanced (including possibly being sequential).
+
+Say we are at frame Y+2.  There's therefore a wait on FY.  The following holds:
+
+    FY is signaled
+    => SAY is signaled
+    => Previous presentation of I1 (corresponding to SPX) is finished
+    => SPX is waited
+
+At this point, we can destroy SPX.  In other words, in frame Y+2, we can destroy SPX (note that 2 is
+the number of frames the CPU pacing code uses).  If frame Y+1 is not using I1, this means the
+history of present semaphores for I1 would be `{SPX, SPY}` and we can destroy the oldest semaphore
+in this list.  If frame Y+1 is also using I1, we should still destroy SPX in frame Y+2, but the
+history of the present semaphores for I1 would be `{SPX, SPY, SP(Y+1)}`.
+
+In the Vulkan backend, we simplify destruction of semaphores by always keeping a history of 3
+present semaphores for each image (again, 3 is H+1 where H is the swap history size used in CPU
+pacing) and always reuse (instead of destroy) the oldest semaphore of the image that is about to be
+presented.
+
+To summarize, we use the completion of a submission using an image to prove that the *previous*
+presentation of that image has finished.
+
+## Swapchain recreation
+
+When recreating the swapchain, all images are freed and new ones are created, possibly with a
+different count and present mode.  For the old swapchain, we can no longer rely on the completion of
+a future submission to know when a previous presentation is done, as there won't be any more
+submissions using images from the old swapchain.
+
+> For example, imagine the old swapchain was created in FIFO mode, and one image is being presented
+> until the next V-Sync.  Furthermore, imagine the new swapchain is created in MAILBOX mode.  Since
+> the old swapchain's image will remain presented until V-Sync, the new MAILBOX swapchain can
+> perform an arbitrarily large number of (throw-away) presentations.  The old swapchain (and its
+> associated present semaphores) cannot be destroyed until V-Sync; a signal that's not captured by
+> Vulkan.
+
+ANGLE resolves this issue by deferring the destruction of the old swapchain and its remaining
+present semaphores to the time when the semaphore corresponding to the first present of the new
+swapchain can be destroyed.  In the example in the previous section, if SPX is the present semaphore
+of the first QP done on the new swapchain, at frame Y+2, when we know SPX can be destroyed, we know
+that the first image of the new swapchain has already been presented.  This proves that all previous
+presentations of the old swapchain have finished.
+
+> Note: the swapchain can potentially be destroyed much earlier, but with no feedback from the
+> presentation engine, we cannot know that.  This delay means that the swapchain could be recreated
+> while there are pending old swapchains to be destroyed.  The destruction of both old swapchains
+> must now be deferred to when the first present of the new swapchain has finished.  If an
+> application resizes the window constantly and at a high rate, ANGLE would keep accumulating old
+> swapchains and not free them until it stops.  While a user will likely not be able to do this (as
+> the rate of window system events is lower than the framerate), this can be programmatically done
+> (as indeed done in EGL dEQP tests).  Nvidia for example fails creation of a new swapchain if there
+> are already 20 allocated (on desktop, or less than ten on Quadro).  If the backlog of old
+> swapchains gets larger than a threshold, ANGLE calls `vkQueueWaitIdle()` and destroys the
+> swapchains.
--- a/src/libANGLE/renderer/vulkan/vk_utils.h
+++ b/src/libANGLE/renderer/vulkan/vk_utils.h
@@ -493,7 +493,7 @@ class Shared final : angle::NonCopyable
            if (!mRefCounted->isReferenced())
            {
                ASSERT(mRefCounted->get().valid());
-                recycler->recyle(std::move(mRefCounted->get()));
+                recycler->recycle(std::move(mRefCounted->get()));
                SafeDelete(mRefCounted);
            }

@@ -530,9 +530,9 @@ class Recycler final : angle::NonCopyable
  public:
    Recycler() = default;

-    void recyle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }
+    void recycle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }

-    void fetch(VkDevice device, T *outObject)
+    void fetch(T *outObject)
    {
        ASSERT(!empty());
        *outObject = std::move(mObjectFreeList.back());