Commit c3f57231 by Shahbaz Youssefi Committed by Commit Bot

Vulkan: revamp present semaphore management

See doc/PresentSemaphores.md for details. Bug: angleproject:3450 Bug: angleproject:3670 Change-Id: I52d5bd13a4af25f224d386c9584525c182af6f17 Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/1776880Reviewed-by: 's avatarTim Van Patten <timvp@google.com> Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>
parent f0640bc3
......@@ -1569,7 +1569,7 @@ angle::Result RendererVk::newSharedFence(vk::Context *context,
}
else
{
mFenceRecycler.fetch(mDevice, &fence);
mFenceRecycler.fetch(&fence);
ANGLE_VK_TRY(context, fence.reset(mDevice));
}
sharedFenceOut->assign(mDevice, std::move(fence));
......
......@@ -99,6 +99,78 @@ class OffscreenSurfaceVk : public SurfaceVk
AttachmentImage mDepthStencilAttachment;
};
// Data structures used in WindowSurfaceVk
namespace impl
{
// The submission fence of the context used to throttle the CPU.
//
// One entry is recorded per swap in a circular buffer of kSwapHistorySize entries (see
// WindowSurfaceVk::mSwapHistory).  The CPU is paced by waiting on the fence of an older entry
// before recording a new one, which bounds how far the CPU can run ahead of the GPU.
//
// The type is neither copyable (angle::NonCopyable base) nor movable (move operations are
// explicitly deleted), so entries are always used in place.
struct SwapHistory : angle::NonCopyable
{
SwapHistory();
// Moving is disallowed; entries live for the lifetime of the owning array.
SwapHistory(SwapHistory &&other) = delete;
SwapHistory &operator=(SwapHistory &&other) = delete;
~SwapHistory();
// Releases the resources held by this entry.  NOTE(review): the definition is not visible
// here; presumably this must be called before the destructor runs -- confirm in SurfaceVk.cpp.
void destroy(RendererVk *renderer);
// Waits on |sharedFence|.  NOTE(review): behavior when no fence has been recorded yet is
// defined in the .cpp file -- confirm there.
angle::Result waitFence(ContextVk *contextVk);
// Fence associated with the last submitted work to render to this swapchain image.
vk::Shared<vk::Fence> sharedFence;
};
// Number of swaps tracked for CPU pacing (H in doc/PresentSemaphores.md).  The per-image present
// semaphore history is sized relative to this value.
static constexpr size_t kSwapHistorySize = 2;
// Old swapchain and associated present semaphores that need to be scheduled for destruction when
// appropriate.
//
// An old swapchain cannot be destroyed right when it is replaced, because Vulkan gives no
// feedback on when the presentation engine is done with it.  This struct bundles everything that
// must outlive the recreation so it can be released together later.  See
// doc/PresentSemaphores.md ("Swapchain recreation") for the reasoning.
struct SwapchainCleanupData : angle::NonCopyable
{
SwapchainCleanupData();
// Move-constructible so that instances can be stored in std::vector.
SwapchainCleanupData(SwapchainCleanupData &&other);
~SwapchainCleanupData();
// Releases |swapchain| and |semaphores|.  NOTE(review): the definition is not visible here;
// judging by the signature, the semaphores are presumably handed to |semaphoreRecycler| for
// reuse rather than destroyed -- confirm in SurfaceVk.cpp.
void destroy(VkDevice device, vk::Recycler<vk::Semaphore> *semaphoreRecycler);
// The swapchain to be destroyed.
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
// Any present semaphores that were pending destruction at the time the swapchain was
// recreated will be scheduled for destruction at the same time as the swapchain.
std::vector<vk::Semaphore> semaphores;
};
// A circular buffer per image stores the semaphores used for presenting that image. Taking the
// swap history into account, only the oldest semaphore is guaranteed to be no longer in use by the
// presentation engine. See doc/PresentSemaphores.md for details.
//
// Old swapchains are scheduled to be destroyed at the same time as the first semaphore used to
// present an image of the new swapchain. This is to ensure that the presentation engine is no
// longer presenting an image from the old swapchain.
//
// This struct is a single entry of that circular buffer.
struct ImagePresentHistory : angle::NonCopyable
{
ImagePresentHistory();
// Move-constructible so that the enclosing SwapchainImage can itself be moved.
ImagePresentHistory(ImagePresentHistory &&other);
~ImagePresentHistory();
// The semaphore waited on by the present operation recorded in this entry.
vk::Semaphore semaphore;
// Old swapchains whose destruction is tied to this entry's semaphore becoming reusable.
std::vector<SwapchainCleanupData> oldSwapchains;
};
// Swapchain images and their associated objects.
//
// One instance exists per image of the current swapchain (see
// WindowSurfaceVk::mSwapchainImages).
struct SwapchainImage : angle::NonCopyable
{
SwapchainImage();
// Move-constructible so that instances can be stored in std::vector.
SwapchainImage(SwapchainImage &&other);
~SwapchainImage();
// The swapchain image together with the view and framebuffer objects associated with it.
vk::ImageHelper image;
vk::ImageView imageView;
vk::Framebuffer framebuffer;
// A circular array of semaphores used for presenting this image.
// The size is H+1, where H is the swap history size used for CPU pacing; with that many
// entries, the oldest one is provably no longer in use (see doc/PresentSemaphores.md).
static constexpr size_t kPresentHistorySize = kSwapHistorySize + 1;
std::array<ImagePresentHistory, kPresentHistorySize> presentHistory;
// Current position in |presentHistory|.  NOTE(review): whether this is the most recently
// used slot or the next slot to be reused is decided in the .cpp file -- confirm there.
size_t currentPresentHistoryIndex = 0;
};
} // namespace impl
class WindowSurfaceVk : public SurfaceVk
{
public:
......@@ -163,6 +235,7 @@ class WindowSurfaceVk : public SurfaceVk
angle::Result checkForOutOfDateSwapchain(ContextVk *contextVk,
uint32_t swapHistoryIndex,
bool presentOutOfDate);
angle::Result resizeSwapchainImages(vk::Context *context, uint32_t imageCount);
void releaseSwapchainImages(ContextVk *contextVk);
void destroySwapChainImages(DisplayVk *displayVk);
VkResult nextSwapchainImage(vk::Context *context);
......@@ -173,6 +246,8 @@ class WindowSurfaceVk : public SurfaceVk
angle::Result swapImpl(const gl::Context *context, EGLint *rects, EGLint n_rects);
angle::Result newPresentSemaphore(vk::Context *context, vk::Semaphore *semaphoreOut);
bool isMultiSampled() const;
VkSurfaceCapabilitiesKHR mSurfaceCaps;
......@@ -186,46 +261,25 @@ class WindowSurfaceVk : public SurfaceVk
VkSurfaceTransformFlagBitsKHR mPreTransform;
VkCompositeAlphaFlagBitsKHR mCompositeAlpha;
uint32_t mCurrentSwapchainImageIndex;
struct SwapchainImage : angle::NonCopyable
{
SwapchainImage();
SwapchainImage(SwapchainImage &&other);
~SwapchainImage();
// A circular buffer that stores the submission fence of the context on every swap. The CPU is
// throttled by waiting for the 2nd previous serial to finish.
std::array<impl::SwapHistory, impl::kSwapHistorySize> mSwapHistory;
size_t mCurrentSwapHistoryIndex;
vk::ImageHelper image;
vk::ImageView imageView;
vk::Framebuffer framebuffer;
};
// The previous swapchain which needs to be scheduled for destruction when appropriate. This
// will be done when the first image of the current swapchain is presented. If there were
// older swapchains pending destruction when the swapchain is recreated, they will accumulate
// and be destroyed with the previous swapchain.
//
// Note that if the user resizes the window such that the swapchain is recreated every frame,
// this array can grow indefinitely.
std::vector<impl::SwapchainCleanupData> mOldSwapchains;
std::vector<SwapchainImage> mSwapchainImages;
std::vector<impl::SwapchainImage> mSwapchainImages;
vk::Semaphore mAcquireImageSemaphore;
uint32_t mCurrentSwapchainImageIndex;
// A circular buffer that stores the serial of the renderer on every swap. The CPU is
// throttled by waiting for the 2nd previous serial to finish. Old swapchains are scheduled to
// be destroyed at the same time.
struct SwapHistory : angle::NonCopyable
{
SwapHistory();
SwapHistory(SwapHistory &&other) = delete;
SwapHistory &operator=(SwapHistory &&other) = delete;
~SwapHistory();
void destroy(RendererVk *renderer);
angle::Result waitFence(ContextVk *contextVk);
// Fence associated with the last submitted work to render to this swapchain image.
vk::Shared<vk::Fence> sharedFence;
vk::Semaphore presentImageSemaphore;
VkSwapchainKHR swapchain = VK_NULL_HANDLE;
};
static constexpr size_t kSwapHistorySize = 2;
std::array<SwapHistory, kSwapHistorySize> mSwapHistory;
size_t mCurrentSwapHistoryIndex;
vk::Recycler<vk::Semaphore> mPresentSemaphoreRecycler;
// Depth/stencil image. Possibly multisampled.
vk::ImageHelper mDepthStencilImage;
......
# Queue Present Wait Semaphore Management
The following shorthand notations are used throughout this document:
- PE: Presentation Engine
- ANI: vkAcquireNextImageKHR
- QS: vkQueueSubmit
- QP: vkQueuePresentKHR
- W: Wait
- S: Signal
- R: Render
- P: Present
- SN: Semaphore N
- IN: Swapchain image N
- FN: Fence N
---
## Introduction
Vulkan requires the application (ANGLE in this case) to acquire swapchain images and queue them for
presentation, synchronizing GPU submissions with semaphores. A single frame looks like the
following:
    CPU: ANI  ... QS   ... QP
         S:S1     W:S1     W:S2
                  S:S2
    GPU:          <------------ R ----------->
     PE:                                      <-------- P ------>
That is, the GPU starts rendering after submission, and the presentation is done when rendering is
finished. With multiple frames, the pipeline looks different based on present mode. Let's focus on
FIFO (the arguments in this document translate to all modes) with 3 images:
CPU: QS QP QS QP QS QP QS QP
I1 I1 I2 I2 I3 I3 I1 I1
GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
PE: <----- P I1 -----><----- P I2 -----><----- P I3 -----><----- P I1 ----->
First, an issue is evident here. The CPU is submitting jobs and queuing images for presentation
faster than the GPU can render them or the PE can view them. This causes the length of the PE queue
to grow indefinitely, resulting in larger and larger input lag.
To address this issue, ANGLE paces the CPU such that the length of the PE queue is kept at a maximum
of 1 image (i.e. one image is being presented, and another one is in queue):
CPU: QS QS W:F1 QS W:F2 QS
I1 I2 I3 I1
S:F1 S:F2 S:F3 S:F4
GPU: <---- R I1 ----><---- R I2 ----><---- R I3 ----><---- R I1 ---->
> Note: While this works in heavy applications (as the rendering time is almost as long as the frame
> (i.e. present time), in which case pacing the submissions similarly paces the presentation), it's
> not technically keeping the PE queue length 1, but rather below n+2 where n is the number of
> swapchain images.
>
> To understand why, imagine a FIFO swapchain with 1000 images and submissions that are
> infinitesimally short. In this case, the CPU pacing is effectively a no-op (as the GPU instantly
> finishes jobs) for the first 1002 submissions. The 1003rd submission waits for F1001 (which uses
> I1). However, the 1001st submission will not start until the PE is finished presenting I1 (at the
> next V-Sync). The CPU then waits for V-Sync before the 1003rd submission. The CPU waits for one
> V-Sync for every subsequent submission, keeping the length of the queue 1002.
> [`VK_GOOGLE_display_timing`][DisplayTimingGOOGLE] is likely a solution to this problem.
Associated with each QP operation is a semaphore signaled by the preceding QS and waited on by the
PE before the image can be presented. Currently, there's no feedback from Vulkan (See [internal
Khronos issue][VulkanIssue1060]) regarding _when_ the PE has actually finished waiting on the
semaphore! This means that the application cannot generally know when to destroy the corresponding
semaphore. However, taking ANGLE's CPU pacing into account, we are able to destroy (or rather
reuse) semaphores when they are provably unused.
The interested reader may follow the discussion in this abandoned [gerrit CL][CL1757018] for more
background and ideas.
[DisplayTimingGOOGLE]: https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VK_GOOGLE_display_timing.html
[VulkanIssue1060]: https://gitlab.khronos.org/vulkan/vulkan/issues/1060
[CL1757018]: https://chromium-review.googlesource.com/c/angle/angle/+/1757018
## Determining When a QP Semaphore is Waited On
Let's combine the above diagrams with all the details:
CPU: ANI | QS | QP | ANI | QS | QP | ANI | W:F1 | QS | QP | ANI | W:F2 | QS | QP
I1 | I1 | I1 | I2 | I2 | I2 | I3 | | I3 | I3 | I1 | | I1 | I1
S:SA1 | W:SA1 | | S:SA2 | W:SA2 | | S:SA3 | | W:SA3 | | S:SA4 | | W:SA4 |
| S:SP1 | W:SP1 | | S:SP2 | W:SP2 | | | S:SP3 | W:SP3 | | | S:SP4 | W:SP4
| S:F1 | | | S:F2 | | | | S:F3 | | | | S:F4 |
Let's focus only on sequences that return the same image:
CPU: ANI | W:F(X-2) | QS | QP | ... | ANI | W:F(Y-2) | QS | QP
I1 | | I1 | I1 | | I1 | | I1 | I1
S:SAX | | W:SAX | | | S:SAY | | W:SAY |
| | S:SPX | W:SPX | | | | S:SPY | W:SPY
| | S:FX | | | | | S:FY |
Note that X and Y are arbitrarily distanced (including possibly being sequential).
Say we are at frame Y+2. There's therefore a wait on FY. The following holds:
FY is signaled
=> SAY is signaled
=> Previous presentation of I1 (corresponding to SPX) is finished
=> SPX is waited
At this point, we can destroy SPX. In other words, in frame Y+2, we can destroy SPX (note that 2 is
the number of frames the CPU pacing code uses). If frame Y+1 is not using I1, this means the
history of present semaphores for I1 would be `{SPX, SPY}` and we can destroy the oldest semaphore
in this list. If frame Y+1 is also using I1, we should still destroy SPX in frame Y+2, but the
history of the present semaphores for I1 would be `{SPX, SPY, SP(Y+1)}`.
In the Vulkan backend, we simplify destruction of semaphores by always keeping a history of 3
present semaphores for each image (again, 3 is H+1 where H is the swap history size used in CPU
pacing) and always reusing (instead of destroying) the oldest semaphore of the image that is about
to be presented.
To summarize, we use the completion of a submission using an image to prove that the *previous*
presentation of that image has finished.
## Swapchain recreation
When recreating the swapchain, all images are freed and new ones are created, possibly with a
different count and present mode. For the old swapchain, we can no longer rely on the completion of
a future submission to know when a previous presentation is done, as there won't be any more
submissions using images from the old swapchain.
> For example, imagine the old swapchain was created in FIFO mode, and one image is being presented
> until the next V-Sync. Furthermore, imagine the new swapchain is created in MAILBOX mode. Since
> the old swapchain's image will remain presented until V-Sync, the new MAILBOX swapchain can
> perform an arbitrarily large number of (throw-away) presentations. The old swapchain (and its
> associated present semaphores) cannot be destroyed until V-Sync; a signal that's not captured by
> Vulkan.
ANGLE resolves this issue by deferring the destruction of the old swapchain and its remaining
present semaphores to the time when the semaphore corresponding to the first present of the new
swapchain can be destroyed. In the example in the previous section, if SPX is the present semaphore
of the first QP done on the new swapchain, at frame Y+2, when we know SPX can be destroyed, we know
that the first image of the new swapchain has already been presented. This proves that all previous
presentations of the old swapchain have finished.
> Note: the swapchain can potentially be destroyed much earlier, but with no feedback from the
> presentation engine, we cannot know that. This delay means that the swapchain could be recreated
> while there are pending old swapchains to be destroyed. The destruction of both old swapchains
> must now be deferred to when the first present of the new swapchain has finished. If an
> application resizes the window constantly and at a high rate, ANGLE would keep accumulating old
> swapchains and not free them until it stops. While a user will likely not be able to do this (as
> the rate of window system events is lower than the framerate), this can be programmatically done
> (as indeed done in EGL dEQP tests). Nvidia for example fails creation of a new swapchain if there
> are already 20 allocated (on desktop, or less than ten on Quadro). If the backlog of old
> swapchains gets larger than a threshold, ANGLE calls `vkQueueWaitIdle()` and destroys the
> swapchains.
......@@ -493,7 +493,7 @@ class Shared final : angle::NonCopyable
if (!mRefCounted->isReferenced())
{
ASSERT(mRefCounted->get().valid());
recycler->recyle(std::move(mRefCounted->get()));
recycler->recycle(std::move(mRefCounted->get()));
SafeDelete(mRefCounted);
}
......@@ -530,9 +530,9 @@ class Recycler final : angle::NonCopyable
public:
Recycler() = default;
void recyle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }
void recycle(T &&garbageObject) { mObjectFreeList.emplace_back(std::move(garbageObject)); }
void fetch(VkDevice device, T *outObject)
void fetch(T *outObject)
{
ASSERT(!empty());
*outObject = std::move(mObjectFreeList.back());
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment