Commit ed876984 by Courtney Goeltzenleuchter Committed by Commit Bot

Vulkan: functionally complete worker thread

Working on enhancing worker thread to completely own primary command buffers. This will include not only processing SCBs from main thread into a primary, but also submitting those command buffers to the queue. The CommandProcessor is a vk::Context so it can handle errors in the worker thread. When the main thread submits tasks to the worker thread it also syncs any outstanding errors from the worker. Include asynchronousCommandProcessing feature that will control whether the worker thread task does its work in parallel or not. If false, we wait for the thread to complete its work before letting the main thread continue. If true, the thread can execute in parallel with the main thread. Bug: b/154030730 Bug: b/161912801 Change-Id: I00f8f013d6cbb2af12a172c4f7927855db2f0ebf Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2328992 Commit-Queue: Courtney Goeltzenleuchter <courtneygo@google.com> Reviewed-by: Tim Van Patten <timvp@google.com> Reviewed-by: Jamie Madill <jmadill@chromium.org>
parent 2882e1af
......@@ -343,6 +343,13 @@ struct FeaturesVk : FeatureSetBase
"Enable parallel processing and submission of Vulkan commands in worker thread", &members,
"http://anglebug.com/4324"};
// Enable parallel thread execution when enableCommandProcessingThread is enabled.
// Currently off by default.
Feature asynchronousCommandProcessing = {"asynchronous_command_processing",
FeatureCategory::VulkanFeatures,
"Enable/Disable parallel processing of worker thread",
&members, "http://anglebug.com/4324"};
// Whether the VkDevice supports the VK_KHR_shader_float16_int8 extension and has the
// shaderFloat16 feature.
Feature supportsShaderFloat16 = {"supports_shader_float16", FeatureCategory::VulkanFeatures,
......
......@@ -8,12 +8,145 @@
//
#include "libANGLE/renderer/vulkan/CommandProcessor.h"
#include "libANGLE/renderer/vulkan/RendererVk.h"
#include "libANGLE/trace.h"
namespace rx
{
namespace
{
// Throttle limit: submitFrame() blocks (finishToSerial on the oldest batch) once more
// than this many command batches are in flight.
constexpr size_t kInFlightCommandsLimit = 100u;
// Debug aid: when true, queueSubmit() dumps VMA allocator statistics before submitting.
constexpr bool kOutputVmaStatsString = false;
// Populate a zero-initialized VkSubmitInfo with the command buffer, wait semaphores
// (growing the stage-mask array to match), and an optional signal semaphore.
void InitializeSubmitInfo(VkSubmitInfo *submitInfo,
                          const vk::PrimaryCommandBuffer &commandBuffer,
                          const std::vector<VkSemaphore> &waitSemaphores,
                          std::vector<VkPipelineStageFlags> *waitSemaphoreStageMasks,
                          const vk::Semaphore *signalSemaphore)
{
    // Verify that the submitInfo has been zero'd out.
    ASSERT(submitInfo->signalSemaphoreCount == 0);
    submitInfo->sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;

    submitInfo->commandBufferCount = commandBuffer.valid() ? 1 : 0;
    submitInfo->pCommandBuffers    = commandBuffer.ptr();

    // Every wait semaphore needs a matching stage mask; newly-added entries wait on all stages.
    const size_t waitCount = waitSemaphores.size();
    if (waitSemaphoreStageMasks->size() < waitCount)
    {
        waitSemaphoreStageMasks->resize(waitCount, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
    }
    submitInfo->waitSemaphoreCount = static_cast<uint32_t>(waitCount);
    submitInfo->pWaitSemaphores    = waitSemaphores.data();
    submitInfo->pWaitDstStageMask  = waitSemaphoreStageMasks->data();

    // Optionally signal exactly one semaphore when this submission completes.
    const bool hasSignal             = (signalSemaphore != nullptr);
    submitInfo->signalSemaphoreCount = hasSignal ? 1 : 0;
    submitInfo->pSignalSemaphores    = hasSignal ? signalSemaphore->ptr() : nullptr;
}
} // namespace
namespace vk
{
// Reset the task to a known-empty state; CustomTask::Invalid marks it as unused.
void CommandProcessorTask::initTask()
{
    mTask                  = CustomTask::Invalid;
    mOneOffCommandBufferVk = VK_NULL_HANDLE;
    mOneOffFence           = nullptr;
    mSemaphore             = nullptr;
    mCommandBuffer         = nullptr;
    mRenderPass            = nullptr;
    mContextVk             = nullptr;
}
// CommandProcessorTask implementation
// Set up a ProcessCommands task: flush `commandBuffer` (a secondary command buffer helper)
// into the worker-owned primary, optionally inside `renderPass`.
void CommandProcessorTask::initProcessCommands(ContextVk *contextVk,
                                               CommandBufferHelper *commandBuffer,
                                               vk::RenderPass *renderPass)
{
    mRenderPass    = renderPass;
    mCommandBuffer = commandBuffer;
    mContextVk     = contextVk;
    mTask          = vk::CustomTask::ProcessCommands;
}
// Set up a Present task: execute QueuePresent with `presentInfo` at the given priority.
void CommandProcessorTask::initPresent(egl::ContextPriority priority, VkPresentInfoKHR presentInfo)
{
    mPriority    = priority;
    mPresentInfo = presentInfo;
    mTask        = vk::CustomTask::Present;
}
// Set up a FinishToSerial task.
// Note: sometimes the serial is not valid and that's okay, the finish will early exit in the
// TaskProcessor::finishToSerial
void CommandProcessorTask::initFinishToSerial(Serial serial)
{
    mSerial = serial;
    mTask   = vk::CustomTask::FinishToSerial;
}
// Set up a FlushAndQueueSubmit task. Vectors and lists are moved into the task; the
// semaphore pointer is borrowed.
void CommandProcessorTask::initFlushAndQueueSubmit(
    std::vector<VkSemaphore> &&waitSemaphores,
    std::vector<VkPipelineStageFlags> &&waitSemaphoreStageMasks,
    const vk::Semaphore *semaphore,
    egl::ContextPriority priority,
    vk::GarbageList &&currentGarbage,
    vk::ResourceUseList &&currentResources)
{
    mTask                    = vk::CustomTask::FlushAndQueueSubmit;
    mPriority                = priority;
    mSemaphore               = semaphore;
    mWaitSemaphores          = std::move(waitSemaphores);
    mWaitSemaphoreStageMasks = std::move(waitSemaphoreStageMasks);
    mGarbage                 = std::move(currentGarbage);
    mResourceUseList         = std::move(currentResources);
}
// Set up a OneOffQueueSubmit task: submit a raw VkCommandBuffer with an optional fence.
void CommandProcessorTask::initOneOffQueueSubmit(VkCommandBuffer oneOffCommandBufferVk,
                                                 egl::ContextPriority priority,
                                                 const vk::Fence *fence)
{
    mPriority              = priority;
    mOneOffFence           = fence;
    mOneOffCommandBufferVk = oneOffCommandBufferVk;
    mTask                  = vk::CustomTask::OneOffQueueSubmit;
}
// Move-assignment: takes over rhs's payload, then resets rhs to the Invalid state.
// Note the mix: raw (borrowed) pointers and the VkCommandBuffer handle are copied,
// while owned containers/values are swapped so rhs carries away *this's old contents.
CommandProcessorTask &CommandProcessorTask::operator=(CommandProcessorTask &&rhs)
{
    // Self-move guard.
    if (this == &rhs)
    {
        return *this;
    }
    mContextVk = rhs.mContextVk;
    mRenderPass = rhs.mRenderPass;
    mCommandBuffer = rhs.mCommandBuffer;
    std::swap(mTask, rhs.mTask);
    std::swap(mWaitSemaphores, rhs.mWaitSemaphores);
    std::swap(mWaitSemaphoreStageMasks, rhs.mWaitSemaphoreStageMasks);
    mSemaphore = rhs.mSemaphore;
    mOneOffFence = rhs.mOneOffFence;
    std::swap(mGarbage, rhs.mGarbage);
    std::swap(mSerial, rhs.mSerial);
    std::swap(mPresentInfo, rhs.mPresentInfo);
    std::swap(mPriority, rhs.mPriority);
    std::swap(mResourceUseList, rhs.mResourceUseList);
    mOneOffCommandBufferVk = rhs.mOneOffCommandBufferVk;
    // clear rhs now that everything has moved.
    rhs.initTask();
    return *this;
}
// CommandBatch implementation. A batch pairs a submitted primary command buffer with its
// fence, command pool, and serial (see the CommandBatch struct in the header).
CommandBatch::CommandBatch() = default;
......@@ -39,67 +172,612 @@ void CommandBatch::destroy(VkDevice device)
commandPool.destroy(device);
fence.reset(device);
}
} // namespace vk
CommandProcessor::CommandProcessor() : mWorkerThreadIdle(true) {}
// TaskProcessor implementation. Construction/destruction are trivial; real setup and
// teardown happen in init() and destroy().
TaskProcessor::TaskProcessor() = default;
TaskProcessor::~TaskProcessor() = default;
// Free the persistent primary command pool. All in-flight batches and garbage must have
// been drained before this is called (asserted below).
void TaskProcessor::destroy(VkDevice device)
{
    mPrimaryCommandPool.destroy(device);
    ASSERT(mInFlightCommands.empty() && mGarbageQueue.empty());
}
// One-time setup on the worker: remember the worker thread's id (used to assert that
// submissions only happen on that thread) and create the primary command pool.
angle::Result TaskProcessor::init(vk::Context *context, std::thread::id threadId)
{
    mThreadId = threadId;
    // Initialize the command pool now that we know the queue family index.
    ANGLE_TRY(mPrimaryCommandPool.init(context, context->getRenderer()->getQueueFamilyIndex()));
    return angle::Result::Continue;
}
// Retire every in-flight batch whose fence has signaled (batches complete in submission
// order, so stop at the first still-pending fence), then destroy any queued garbage whose
// serial has completed.
angle::Result TaskProcessor::checkCompletedCommands(vk::Context *context)
{
    ANGLE_TRACE_EVENT0("gpu.angle", "TaskProcessor::checkCompletedCommands");
    VkDevice device = context->getDevice();
    RendererVk *rendererVk = context->getRenderer();
    int finishedCount = 0;
    for (vk::CommandBatch &batch : mInFlightCommands)
    {
        VkResult result = batch.fence.get().getStatus(device);
        if (result == VK_NOT_READY)
        {
            // First unfinished batch; everything after it is unfinished too.
            break;
        }
        ANGLE_VK_TRY(context, result);
        // Advance the renderer's completed serial and recycle the batch's fence.
        rendererVk->onCompletedSerial(batch.serial);
        rendererVk->resetSharedFence(&batch.fence);
        ANGLE_TRACE_EVENT0("gpu.angle", "command buffer recycling");
        batch.commandPool.destroy(device);
        // Return the primary command buffer to the persistent pool for reuse.
        ANGLE_TRY(releasePrimaryCommandBuffer(context, std::move(batch.primaryCommands)));
        ++finishedCount;
    }
    // Retired batches are contiguous at the front; erase them in one shot.
    if (finishedCount > 0)
    {
        auto beginIter = mInFlightCommands.begin();
        mInFlightCommands.erase(beginIter, beginIter + finishedCount);
    }
    Serial lastCompleted = rendererVk->getLastCompletedQueueSerial();
    // Garbage lists are ordered by serial; destroy every list whose serial has completed.
    size_t freeIndex = 0;
    for (; freeIndex < mGarbageQueue.size(); ++freeIndex)
    {
        vk::GarbageAndSerial &garbageList = mGarbageQueue[freeIndex];
        if (garbageList.getSerial() <= lastCompleted)
        {
            for (vk::GarbageObject &garbage : garbageList.get())
            {
                garbage.destroy(rendererVk);
            }
        }
        else
        {
            break;
        }
    }
    // Remove the entries from the garbage list - they should be ready to go.
    if (freeIndex > 0)
    {
        mGarbageQueue.erase(mGarbageQueue.begin(), mGarbageQueue.begin() + freeIndex);
    }
    return angle::Result::Continue;
}
// Move the finished primary command buffer (and, if valid, the secondary-command pool)
// into `batch`, recreating a fresh transient pool in commandPool's place.
angle::Result TaskProcessor::releaseToCommandBatch(vk::Context *context,
                                                   vk::PrimaryCommandBuffer &&commandBuffer,
                                                   vk::CommandPool *commandPool,
                                                   vk::CommandBatch *batch)
{
    batch->primaryCommands = std::move(commandBuffer);

    if (!commandPool->valid())
    {
        return angle::Result::Continue;
    }

    // Hand the pool to the batch and recreate a new one for subsequent recording.
    batch->commandPool = std::move(*commandPool);

    VkCommandPoolCreateInfo createInfo = {};
    createInfo.sType                   = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
    createInfo.flags                   = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT;
    createInfo.queueFamilyIndex        = context->getRenderer()->getQueueFamilyIndex();
    ANGLE_VK_TRY(context, commandPool->init(context->getDevice(), createInfo));

    return angle::Result::Continue;
}
// Hand out a primary command buffer from the persistent pool (recycles previously
// released buffers when available).
angle::Result TaskProcessor::allocatePrimaryCommandBuffer(
    vk::Context *context,
    vk::PrimaryCommandBuffer *commandBufferOut)
{
    return mPrimaryCommandPool.allocate(context, commandBufferOut);
}
// Return a retired primary command buffer to the persistent pool for later reuse.
angle::Result TaskProcessor::releasePrimaryCommandBuffer(vk::Context *context,
                                                         vk::PrimaryCommandBuffer &&commandBuffer)
{
    ASSERT(mPrimaryCommandPool.valid());
    return mPrimaryCommandPool.collect(context, std::move(commandBuffer));
}
// Device-loss cleanup: wait for each in-flight batch's fence (best effort) and destroy
// the batch's resources, then drop the whole in-flight list.
// NOTE: the original listing had a stray interleaved line from the removed
// CommandProcessor::queueCommands signature inside this loop; it is deleted here.
void TaskProcessor::handleDeviceLost(vk::Context *context)
{
    VkDevice device = context->getDevice();
    for (vk::CommandBatch &batch : mInFlightCommands)
    {
        // On device loss we need to wait for fence to be signaled before destroying it
        VkResult status =
            batch.fence.get().wait(device, context->getRenderer()->getMaxFenceWaitTimeNs());
        // If the wait times out, it is probably not possible to recover from lost device
        ASSERT(status == VK_SUCCESS || status == VK_ERROR_DEVICE_LOST);
        // On device lost, here simply destroy the CommandBuffer, it will be fully cleared later
        // by CommandPool::destroy
        batch.primaryCommands.destroy(device);
        batch.commandPool.destroy(device);
        batch.fence.reset(device);
    }
    mInFlightCommands.clear();
}
// If there are any inflight commands worker will look for fence that corresponds to the request
// serial or the last available fence and wait on that fence. Will then do necessary cleanup work.
// This can cause the worker thread to block.
// TODO: https://issuetracker.google.com/issues/170312581 - A more optimal solution might be to do
// the wait in CommandProcessor rather than the worker thread. That would require protecting access
// to mInFlightCommands
// NOTE: the original listing interleaved five lines of the removed
// CommandProcessor::queueCommands body here; they are deleted in this reconstruction.
angle::Result TaskProcessor::finishToSerial(vk::Context *context, Serial serial)
{
    RendererVk *rendererVk = context->getRenderer();
    uint64_t timeout = rendererVk->getMaxFenceWaitTimeNs();

    if (mInFlightCommands.empty())
    {
        // No outstanding work, nothing to wait for.
        return angle::Result::Continue;
    }

    // Find the first batch with serial equal to or bigger than given serial (note that
    // the batch serials are unique, otherwise upper-bound would have been necessary).
    size_t batchIndex = mInFlightCommands.size() - 1;
    for (size_t i = 0; i < mInFlightCommands.size(); ++i)
    {
        if (mInFlightCommands[i].serial >= serial)
        {
            batchIndex = i;
            break;
        }
    }
    const vk::CommandBatch &batch = mInFlightCommands[batchIndex];

    // Wait for it finish
    VkDevice device = context->getDevice();
    ANGLE_VK_TRY(context, batch.fence.get().wait(device, timeout));

    // Clean up finished batches.
    return checkCompletedCommands(context);
}
void CommandProcessor::processCommandProcessorTasks()
VkResult TaskProcessor::present(VkQueue queue, const VkPresentInfoKHR &presentInfo)
{
ANGLE_TRACE_EVENT0("gpu.angle", "vkQueuePresentKHR");
return vkQueuePresentKHR(queue, &presentInfo);
}
// Submit the frame's primary command buffer and track it as an in-flight batch that owns
// the fence, the secondary-command pool, and this submission's deferred garbage.
angle::Result TaskProcessor::submitFrame(vk::Context *context,
                                         VkQueue queue,
                                         const VkSubmitInfo &submitInfo,
                                         const vk::Shared<vk::Fence> &sharedFence,
                                         vk::GarbageList *currentGarbage,
                                         vk::CommandPool *commandPool,
                                         vk::PrimaryCommandBuffer &&commandBuffer,
                                         const Serial &queueSerial)
{
    // Submissions are only legal from the worker thread.
    ASSERT(std::this_thread::get_id() == mThreadId);
    ANGLE_TRACE_EVENT0("gpu.angle", "TaskProcessor::submitFrame");
    VkDevice device = context->getDevice();
    // NOTE(review): DeviceScoped appears to destroy the batch if we error out before
    // release() below — confirm against DeviceScoped's definition.
    vk::DeviceScoped<vk::CommandBatch> scopedBatch(device);
    vk::CommandBatch &batch = scopedBatch.get();
    batch.fence.copy(device, sharedFence);
    batch.serial = queueSerial;
    ANGLE_TRY(queueSubmit(context, queue, submitInfo, &batch.fence.get()));
    // Defer destruction of this submission's garbage until its serial completes.
    if (!currentGarbage->empty())
    {
        mGarbageQueue.emplace_back(std::move(*currentGarbage), queueSerial);
    }
    // Store the primary CommandBuffer and command pool used for secondary CommandBuffers
    // in the in-flight list.
    ANGLE_TRY(releaseToCommandBatch(context, std::move(commandBuffer), commandPool, &batch));
    mInFlightCommands.emplace_back(scopedBatch.release());
    // Opportunistically retire any batches that have already finished.
    ANGLE_TRY(checkCompletedCommands(context));
    // CPU should be throttled to avoid mInFlightCommands from growing too fast. Important for
    // off-screen scenarios.
    while (mInFlightCommands.size() > kInFlightCommandsLimit)
    {
        ANGLE_TRY(finishToSerial(context, mInFlightCommands[0].serial));
    }
    return angle::Result::Continue;
}
// Note: this must be called when the work queue is empty and while holding mWorkerMutex to
// ensure that worker isn't touching mInFlightCommands
vk::Shared<vk::Fence> TaskProcessor::getLastSubmittedFenceWithLock(VkDevice device) const
{
    vk::Shared<vk::Fence> lastFence;
    if (mInFlightCommands.empty())
    {
        // Nothing submitted yet: return an empty shared fence.
        return lastFence;
    }
    lastFence.copy(device, mInFlightCommands.back().fence);
    return lastFence;
}
// Raw vkQueueSubmit wrapper. Must run on the worker thread; all queue access is
// serialized through the worker so no queue mutex is needed.
angle::Result TaskProcessor::queueSubmit(vk::Context *context,
                                         VkQueue queue,
                                         const VkSubmitInfo &submitInfo,
                                         const vk::Fence *fence)
{
    ASSERT(std::this_thread::get_id() == mThreadId);

    if (kOutputVmaStatsString)
    {
        // Debug-only allocator statistics dump.
        context->getRenderer()->outputVmaStatString();
    }

    const VkFence fenceHandle = (fence != nullptr) ? fence->getHandle() : VK_NULL_HANDLE;
    ANGLE_VK_TRY(context, vkQueueSubmit(queue, 1, &submitInfo, fenceHandle));
    return angle::Result::Continue;
}
// vk::Context error hook for the worker thread: log device-lost immediately, and record
// every error in mErrors for the main thread to pick up on its next sync.
void CommandProcessor::handleError(VkResult errorCode,
                                   const char *file,
                                   const char *function,
                                   unsigned int line)
{
    ASSERT(errorCode != VK_SUCCESS);

    std::stringstream errorMessage;
    errorMessage << "Internal Vulkan error (" << errorCode << "): " << VulkanResultString(errorCode)
                 << ".";

    if (errorCode == VK_ERROR_DEVICE_LOST)
    {
        WARN() << errorMessage.str();
        handleDeviceLost();
    }

    std::lock_guard<std::mutex> queueLock(mErrorMutex);
    mErrors.emplace(vk::Error{errorCode, file, function, line});
}
// Construct the worker-thread context. Pre-generates the first submitted/current queue
// serials and starts with an empty error queue.
CommandProcessor::CommandProcessor(RendererVk *renderer)
    : vk::Context(renderer),
      mWorkerThreadIdle(false),
      mCommandProcessorLastSubmittedSerial(mQueueSerialFactory.generate()),
      mCommandProcessorCurrentQueueSerial(mQueueSerialFactory.generate())
{
    std::lock_guard<std::mutex> queueLock(mErrorMutex);
    // Drop any stale errors by swapping in a freshly-constructed queue.
    std::queue<vk::Error> emptyQueue;
    std::swap(mErrors, emptyQueue);
}
// Out-of-line defaulted destructor.
CommandProcessor::~CommandProcessor() = default;
// vk::Context override; the worker never enables robust resource init.
bool CommandProcessor::isRobustResourceInitEnabled() const
{
    // Unused for worker thread, just return false.
    return false;
}
// Pop and return the oldest worker-recorded error, or a VK_SUCCESS-coded sentinel when
// no error is pending.
vk::Error CommandProcessor::getAndClearPendingError()
{
    std::lock_guard<std::mutex> queueLock(mErrorMutex);
    if (mErrors.empty())
    {
        return vk::Error({VK_SUCCESS, nullptr, nullptr, 0});
    }
    vk::Error frontError = mErrors.front();
    mErrors.pop();
    return frontError;
}
// Queue a task for the worker thread. Submit-type tasks are stamped with a queue serial
// (under mCommandProcessorQueueSerialMutex) before being enqueued. When the
// asynchronousCommandProcessing feature is disabled, blocks until the worker drains the
// queue. `context` may be null (e.g. during shutdown); it is only used for error sync.
void CommandProcessor::queueCommand(vk::Context *context, vk::CommandProcessorTask *task)
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CommandProcessor::queueCommand");
    {
        // Grab the worker mutex so that we put things on the queue in the same order as we give out
        // serials.
        std::lock_guard<std::mutex> queueLock(mWorkerMutex);
        if (task->getTaskCommand() == vk::CustomTask::FlushAndQueueSubmit ||
            task->getTaskCommand() == vk::CustomTask::OneOffQueueSubmit)
        {
            std::lock_guard<std::mutex> lock(mCommandProcessorQueueSerialMutex);
            // Flush submits work, so give it the current serial and generate a new one.
            Serial queueSerial = mCommandProcessorCurrentQueueSerial;
            task->setQueueSerial(queueSerial);
            mCommandProcessorLastSubmittedSerial = mCommandProcessorCurrentQueueSerial;
            mCommandProcessorCurrentQueueSerial = mQueueSerialFactory.generate();
            // Tag every resource used by this submission with its serial.
            task->getResourceUseList().releaseResourceUsesAndUpdateSerials(queueSerial);
        }
        // The task is moved into the queue; the caller's task object is left moved-from.
        mTasks.emplace(std::move(*task));
        mWorkAvailableCondition.notify_one();
    }
    if (getRenderer()->getFeatures().asynchronousCommandProcessing.enabled)
    {
        return;
    }
    // parallel task processing disabled so wait for work to complete.
    waitForWorkComplete(context);
}
void CommandProcessor::processTasks()
{
while (true)
{
bool exitThread = false;
angle::Result result = processTasksImpl(&exitThread);
if (exitThread)
{
// We are doing a controlled exit of the thread, break out of the while loop.
break;
}
if (result != angle::Result::Continue)
{
// TODO: https://issuetracker.google.com/issues/170311829 - follow-up on error handling
// ContextVk::commandProcessorSyncErrorsAndQueueCommand and WindowSurfaceVk::destroy
// do error processing, is anything required here? Don't think so, mostly need to
// continue the worker thread until it's been told to exit.
UNREACHABLE();
}
}
}
// Worker thread main loop body: initialize the TaskProcessor and the first primary
// command buffer, then pop and execute tasks until an Exit task arrives.
// NOTE: the original listing interleaved lines from the removed processCommandProcessorTasks
// implementation (old mCommandsQueue wait/pop and the null-pointer exit check); those
// residue lines are deleted in this reconstruction.
angle::Result CommandProcessor::processTasksImpl(bool *exitThread)
{
    // Initialization prior to work thread loop
    ANGLE_TRY(mTaskProcessor.init(this, std::this_thread::get_id()));

    // Allocate and begin primary command buffer
    ANGLE_TRY(mTaskProcessor.allocatePrimaryCommandBuffer(this, &mPrimaryCommandBuffer));
    VkCommandBufferBeginInfo beginInfo = {};
    beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
    beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
    beginInfo.pInheritanceInfo = nullptr;
    ANGLE_VK_TRY(this, mPrimaryCommandBuffer.begin(beginInfo));

    while (true)
    {
        std::unique_lock<std::mutex> lock(mWorkerMutex);
        if (mTasks.empty())
        {
            // Advertise idleness so waitForWorkComplete() and friends can make progress.
            mWorkerThreadIdle = true;
            mWorkerIdleCondition.notify_all();
            // Only wake if notified and command queue is not empty
            mWorkAvailableCondition.wait(lock, [this] { return !mTasks.empty(); });
        }
        mWorkerThreadIdle = false;
        vk::CommandProcessorTask task(std::move(mTasks.front()));
        mTasks.pop();
        lock.unlock();

        switch (task.getTaskCommand())
        {
            case vk::CustomTask::Exit:
            {
                // Drain all outstanding GPU work before tearing down.
                ANGLE_TRY(mTaskProcessor.finishToSerial(this, Serial::Infinite()));
                *exitThread = true;
                // Shutting down so cleanup
                mTaskProcessor.destroy(mRenderer->getDevice());
                mCommandPool.destroy(mRenderer->getDevice());
                mPrimaryCommandBuffer.destroy(mRenderer->getDevice());
                mWorkerThreadIdle = true;
                mWorkerIdleCondition.notify_one();
                return angle::Result::Continue;
            }
            default:
                ANGLE_TRY(processTask(&task));
                break;
        }
    }

    UNREACHABLE();
    return angle::Result::Stop;
}
// Execute one non-Exit task on the worker thread (Exit is handled by processTasksImpl).
// NOTE: the original listing interleaved old-version lines (task.commandBuffer->...,
// task.primaryCB) inside the OneOffQueueSubmit case; those residue lines are deleted.
angle::Result CommandProcessor::processTask(vk::CommandProcessorTask *task)
{
    switch (task->getTaskCommand())
    {
        case vk::CustomTask::FlushAndQueueSubmit:
        {
            // End command buffer
            ANGLE_VK_TRY(this, mPrimaryCommandBuffer.end());

            // 1. Create submitInfo
            VkSubmitInfo submitInfo = {};
            InitializeSubmitInfo(&submitInfo, mPrimaryCommandBuffer, task->getWaitSemaphores(),
                                 &task->getWaitSemaphoreStageMasks(), task->getSemaphore());

            // 2. Get shared submit fence. It's possible there are other users of this fence that
            // must wait for the work to be submitted before waiting on the fence. Reset the fence
            // immediately so we are sure to get a fresh one next time.
            vk::Shared<vk::Fence> fence;
            ANGLE_TRY(mRenderer->getNextSubmitFence(&fence, true));

            // 3. Call submitFrame()
            ANGLE_TRY(mTaskProcessor.submitFrame(
                this, getRenderer()->getVkQueue(task->getPriority()), submitInfo, fence,
                &task->getGarbage(), &mCommandPool, std::move(mPrimaryCommandBuffer),
                task->getQueueSerial()));

            // 4. Allocate & begin new primary command buffer
            ANGLE_TRY(mTaskProcessor.allocatePrimaryCommandBuffer(this, &mPrimaryCommandBuffer));
            VkCommandBufferBeginInfo beginInfo = {};
            beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
            beginInfo.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT;
            beginInfo.pInheritanceInfo = nullptr;
            ANGLE_VK_TRY(this, mPrimaryCommandBuffer.begin(beginInfo));

            // Free this local reference
            getRenderer()->resetSharedFence(&fence);

            ASSERT(task->getGarbage().empty());
            break;
        }
        case vk::CustomTask::OneOffQueueSubmit:
        {
            VkSubmitInfo submitInfo = {};
            submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
            if (task->getOneOffCommandBufferVk() != VK_NULL_HANDLE)
            {
                submitInfo.commandBufferCount = 1;
                submitInfo.pCommandBuffers = &task->getOneOffCommandBufferVk();
            }
            // TODO: https://issuetracker.google.com/issues/170328907 - vkQueueSubmit should be
            // owned by TaskProcessor to ensure proper synchronization
            ANGLE_TRY(mTaskProcessor.queueSubmit(this,
                                                 getRenderer()->getVkQueue(task->getPriority()),
                                                 submitInfo, task->getOneOffFence()));
            ANGLE_TRY(mTaskProcessor.checkCompletedCommands(this));
            break;
        }
        case vk::CustomTask::FinishToSerial:
        {
            ANGLE_TRY(mTaskProcessor.finishToSerial(this, task->getQueueSerial()));
            break;
        }
        case vk::CustomTask::Present:
        {
            VkResult result = mTaskProcessor.present(getRenderer()->getVkQueue(task->getPriority()),
                                                     task->getPresentInfo());
            if (ANGLE_UNLIKELY(result != VK_SUCCESS))
            {
                // Save the error so that we can handle it (e.g. VK_OUT_OF_DATE)
                // Don't leave processing loop, don't consider errors from present to be fatal.
                // TODO: https://issuetracker.google.com/issues/170329600 - This needs to improve to
                // properly parallelize present
                handleError(result, __FILE__, __FUNCTION__, __LINE__);
            }
            break;
        }
        case vk::CustomTask::ProcessCommands:
        {
            // Flush the secondary command buffer into the worker-owned primary, then return
            // it to the context's recycle queue.
            ASSERT(!task->getCommandBuffer()->empty());
            ANGLE_TRY(task->getCommandBuffer()->flushToPrimary(
                getRenderer()->getFeatures(), &mPrimaryCommandBuffer, task->getRenderPass()));
            ASSERT(task->getCommandBuffer()->empty());
            task->getCommandBuffer()->releaseToContextQueue(task->getContextVk());
            break;
        }
        default:
            UNREACHABLE();
            break;
    }

    return angle::Result::Continue;
}
void CommandProcessor::waitForWorkComplete()
void CommandProcessor::waitForWorkComplete(vk::Context *context)
{
ANGLE_TRACE_EVENT0("gpu.angle", "CommandProcessor::waitForWorkerThreadIdle");
std::unique_lock<std::mutex> lock(mWorkerMutex);
mWorkerIdleCondition.wait(lock,
[this] { return (mCommandsQueue.empty() && mWorkerThreadIdle); });
mWorkerIdleCondition.wait(lock, [this] { return (mTasks.empty() && mWorkerThreadIdle); });
// Worker thread is idle and command queue is empty so good to continue
lock.unlock();
if (!context)
{
return;
}
// Sync any errors to the context
while (hasPendingError())
{
vk::Error workerError = getAndClearPendingError();
if (workerError.mErrorCode != VK_SUCCESS)
{
context->handleError(workerError.mErrorCode, workerError.mFile, workerError.mFunction,
workerError.mLine);
}
}
}
// Stop the worker: queue an Exit task, wait for the worker to process it, then join the
// thread. Null contexts are passed since no context is available during shutdown.
// NOTE: the original listing interleaved three lines of the removed shutdown body
// (waitForWorkComplete(); kEndCommandProcessorThread; queueCommands); deleted here.
void CommandProcessor::shutdown(std::thread *commandProcessorThread)
{
    vk::CommandProcessorTask endTask;
    endTask.initTask(vk::CustomTask::Exit);
    queueCommand(nullptr, &endTask);
    waitForWorkComplete(nullptr);
    if (commandProcessorThread->joinable())
    {
        commandProcessorThread->join();
    }
}
// Return the fence for the last submit. This may mean waiting on the worker to process tasks to
// actually get to the last submit
vk::Shared<vk::Fence> CommandProcessor::getLastSubmittedFence() const
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CommandProcessor::getLastSubmittedFence");
    std::unique_lock<std::mutex> lock(mWorkerMutex);
    // Block until the task queue is drained and the worker reports idle.
    mWorkerIdleCondition.wait(lock, [this] { return (mTasks.empty() && mWorkerThreadIdle); });
    // Worker thread is idle and command queue is empty so good to continue.
    // mWorkerMutex is still held here, as getLastSubmittedFenceWithLock requires.
    return mTaskProcessor.getLastSubmittedFenceWithLock(getDevice());
}
// Serial stamped on the most recent submit-type task (see queueCommand).
Serial CommandProcessor::getLastSubmittedSerial()
{
    std::lock_guard<std::mutex> lock(mCommandProcessorQueueSerialMutex);
    return mCommandProcessorLastSubmittedSerial;
}
// Serial that will be assigned to the next submit-type task (see queueCommand).
Serial CommandProcessor::getCurrentQueueSerial()
{
    std::lock_guard<std::mutex> lock(mCommandProcessorQueueSerialMutex);
    return mCommandProcessorCurrentQueueSerial;
}
// Wait until all commands up to and including serial have been processed
void CommandProcessor::finishToSerial(vk::Context *context, Serial serial)
{
    // Queue a FinishToSerial task; the worker performs the actual fence wait.
    vk::CommandProcessorTask finishToSerial;
    finishToSerial.initFinishToSerial(serial);
    queueCommand(context, &finishToSerial);
    // Wait until the worker is idle. At that point we know that the finishToSerial command has
    // completed executing, including any associated state cleanup.
    waitForWorkComplete(context);
}
// Device-loss entry point: drain the task queue, then delegate cleanup of in-flight
// batches to the TaskProcessor.
void CommandProcessor::handleDeviceLost()
{
    ANGLE_TRACE_EVENT0("gpu.angle", "CommandProcessor::handleDeviceLost");
    std::unique_lock<std::mutex> lock(mWorkerMutex);
    mWorkerIdleCondition.wait(lock, [this] { return (mTasks.empty() && mWorkerThreadIdle); });
    // Worker thread is idle and command queue is empty so good to continue
    mTaskProcessor.handleDeviceLost(this);
}
// Convenience wrapper: finish everything ever submitted (Serial::Infinite()).
void CommandProcessor::finishAllWork(vk::Context *context)
{
    // Wait for GPU work to finish
    finishToSerial(context, Serial::Infinite());
}
} // namespace vk
} // namespace rx
......@@ -17,27 +17,120 @@
#include <thread>
#include "common/vulkan/vk_headers.h"
#include "libANGLE/renderer/vulkan/PersistentCommandPool.h"
#include "libANGLE/renderer/vulkan/vk_helpers.h"
namespace rx
{
class RendererVk;
class CommandProcessor;
namespace vk
{
// CommandProcessorTask is used to queue a task to the worker thread when
// enableCommandProcessingThread feature is true.
// Issuing the CustomTask::Exit command will cause the worker thread to clean up its resources and
// shut down. This command is sent when the renderer instance shuts down. Custom tasks are:
// NOTE: the original listing interleaved lines from the removed CommandProcessorTask
// struct and the kEndCommandProcessorThread sentinel here; those are deleted in this
// reconstruction of the new enum.
enum CustomTask
{
    Invalid = 0,
    // Process SecondaryCommandBuffer commands into the primary CommandBuffer.
    ProcessCommands,
    // End the current command buffer and submit commands to the queue
    FlushAndQueueSubmit,
    // Submit custom command buffer, excludes some state management
    OneOffQueueSubmit,
    // Finish queue commands up to given serial value, process garbage
    FinishToSerial,
    // Execute QueuePresent
    Present,
    // Exit the command processor thread
    Exit,
};
// Plain-data task object handed from the main thread to the worker. Exactly one init*
// method is called per task; which fields are meaningful depends on mTask.
class CommandProcessorTask
{
  public:
    CommandProcessorTask() { initTask(); }

    // Reset all fields to the empty/Invalid state.
    void initTask();

    // Initialize a payload-free task (e.g. CustomTask::Exit).
    void initTask(CustomTask command) { mTask = command; }

    // ProcessCommands: flush `commandBuffer` into the worker-owned primary.
    void initProcessCommands(ContextVk *contextVk,
                             CommandBufferHelper *commandBuffer,
                             vk::RenderPass *renderPass);

    // Present: run QueuePresent with presentInfo at the given context priority.
    void initPresent(egl::ContextPriority priority, VkPresentInfoKHR presentInfo);

    // FinishToSerial: wait for commands up to `serial` to complete.
    void initFinishToSerial(Serial serial);

    // FlushAndQueueSubmit: end the current primary and submit it with this payload.
    void initFlushAndQueueSubmit(std::vector<VkSemaphore> &&waitSemaphores,
                                 std::vector<VkPipelineStageFlags> &&waitSemaphoreStageMasks,
                                 const vk::Semaphore *semaphore,
                                 egl::ContextPriority priority,
                                 vk::GarbageList &&currentGarbage,
                                 vk::ResourceUseList &&currentResources);

    // OneOffQueueSubmit: submit a caller-provided VkCommandBuffer with an optional fence.
    void initOneOffQueueSubmit(VkCommandBuffer oneOffCommandBufferVk,
                               egl::ContextPriority priority,
                               const vk::Fence *fence);

    // Move-only: moving resets the source via initTask() (see operator= in the .cpp).
    CommandProcessorTask &operator=(CommandProcessorTask &&rhs);

    CommandProcessorTask(CommandProcessorTask &&other) : CommandProcessorTask()
    {
        *this = std::move(other);
    }

    // Queue serial stamped by CommandProcessor::queueCommand for submit-type tasks.
    void setQueueSerial(Serial serial) { mSerial = serial; }
    Serial getQueueSerial() const { return mSerial; }
    vk::ResourceUseList &getResourceUseList() { return mResourceUseList; }
    vk::CustomTask getTaskCommand() { return mTask; }
    std::vector<VkSemaphore> &getWaitSemaphores() { return mWaitSemaphores; }
    std::vector<VkPipelineStageFlags> &getWaitSemaphoreStageMasks()
    {
        return mWaitSemaphoreStageMasks;
    }
    const vk::Semaphore *getSemaphore() { return mSemaphore; }
    vk::GarbageList &getGarbage() { return mGarbage; }
    egl::ContextPriority getPriority() const { return mPriority; }
    const VkCommandBuffer &getOneOffCommandBufferVk() const { return mOneOffCommandBufferVk; }
    const vk::Fence *getOneOffFence() { return mOneOffFence; }
    const VkPresentInfoKHR &getPresentInfo() const { return mPresentInfo; }
    vk::RenderPass *getRenderPass() const { return mRenderPass; }
    CommandBufferHelper *getCommandBuffer() const { return mCommandBuffer; }
    ContextVk *getContextVk() const { return mContextVk; }

  private:
    CustomTask mTask;

    // ProcessCommands
    ContextVk *mContextVk;
    vk::RenderPass *mRenderPass;
    CommandBufferHelper *mCommandBuffer;

    // Flush data
    std::vector<VkSemaphore> mWaitSemaphores;
    std::vector<VkPipelineStageFlags> mWaitSemaphoreStageMasks;
    const vk::Semaphore *mSemaphore;
    vk::GarbageList mGarbage;
    vk::ResourceUseList mResourceUseList;

    // FinishToSerial & Flush command data
    Serial mSerial;

    // Present command data
    VkPresentInfoKHR mPresentInfo;

    // Used by OneOffQueueSubmit
    VkCommandBuffer mOneOffCommandBufferVk;
    const vk::Fence *mOneOffFence;

    // Flush, Present & QueueWaitIdle data
    egl::ContextPriority mPriority;
};
struct CommandBatch final : angle::NonCopyable
{
......@@ -54,37 +147,139 @@ struct CommandBatch final : angle::NonCopyable
vk::Shared<vk::Fence> fence;
Serial serial;
};
} // namespace vk
class CommandProcessor : angle::NonCopyable
// Owns queue-submission state for the worker thread: the in-flight batch list, the
// serial-ordered garbage queue, and the persistent pool of primary command buffers.
class TaskProcessor : angle::NonCopyable
{
  public:
    TaskProcessor();
    ~TaskProcessor();
    // Records the worker thread id and initializes the primary command pool.
    angle::Result init(vk::Context *context, std::thread::id threadId);
    // Must only be called after in-flight work and garbage are fully drained (asserted).
    void destroy(VkDevice device);

    // Acquire/recycle primary command buffers from mPrimaryCommandPool.
    angle::Result allocatePrimaryCommandBuffer(vk::Context *context,
                                               vk::PrimaryCommandBuffer *commandBufferOut);
    angle::Result releasePrimaryCommandBuffer(vk::Context *context,
                                              vk::PrimaryCommandBuffer &&commandBuffer);

    void clearAllGarbage(vk::Context *context);

    // Block until commands up to `serial` complete, then retire finished batches.
    // Can block the worker thread on a fence wait.
    angle::Result finishToSerial(vk::Context *context, Serial serial);

    // Thin wrapper over vkQueuePresentKHR; returns the raw VkResult.
    VkResult present(VkQueue queue, const VkPresentInfoKHR &presentInfo);

    // Submit a frame's primary command buffer and track it as an in-flight batch.
    angle::Result submitFrame(vk::Context *context,
                              VkQueue queue,
                              const VkSubmitInfo &submitInfo,
                              const vk::Shared<vk::Fence> &sharedFence,
                              vk::GarbageList *currentGarbage,
                              vk::CommandPool *commandPool,
                              vk::PrimaryCommandBuffer &&commandBuffer,
                              const Serial &queueSerial);

    // Raw vkQueueSubmit; asserts it runs on the worker thread (mThreadId).
    angle::Result queueSubmit(vk::Context *context,
                              VkQueue queue,
                              const VkSubmitInfo &submitInfo,
                              const vk::Fence *fence);

    // Caller must hold the worker mutex with the work queue drained.
    vk::Shared<vk::Fence> getLastSubmittedFenceWithLock(VkDevice device) const;

    // Check to see which batches have finished completion (forward progress for
    // mLastCompletedQueueSerial, for example for when the application busy waits on a query
    // result). It would be nice if we didn't have to expose this for QueryVk::getResult.
    angle::Result checkCompletedCommands(vk::Context *context);

    // Waits for in-flight fences (best effort) and destroys batch resources on device loss.
    void handleDeviceLost(vk::Context *context);

  private:
    // Store commandBuffer/commandPool into `batch`, recreating a fresh transient pool.
    angle::Result releaseToCommandBatch(vk::Context *context,
                                        vk::PrimaryCommandBuffer &&commandBuffer,
                                        vk::CommandPool *commandPool,
                                        vk::CommandBatch *batch);

    // Garbage lists ordered by submission serial; destroyed as serials complete.
    vk::GarbageQueue mGarbageQueue;
    // Submitted batches, oldest first, awaiting fence completion.
    std::vector<vk::CommandBatch> mInFlightCommands;

    // Keeps a free list of reusable primary command buffers.
    vk::PersistentCommandPool mPrimaryCommandPool;

    // Worker thread id, asserted against in queueSubmit/submitFrame.
    std::thread::id mThreadId;
};
class CommandProcessor : public vk::Context
{
public:
CommandProcessor();
~CommandProcessor() = default;
// Main worker loop that should be launched in its own thread. The
// loop waits for work to be submitted from a separate thread.
void processCommandProcessorTasks();
// Called asynchronously from workLoop() thread to queue work that is
// then processed by the workLoop() thread
void queueCommands(const vk::CommandProcessorTask &commands);
// Used by separate thread to wait for worker thread to complete all
// outstanding work.
void waitForWorkComplete();
// Stop the command processor loop
CommandProcessor(RendererVk *renderer);
~CommandProcessor() override;
void handleError(VkResult result,
const char *file,
const char *function,
unsigned int line) override;
bool isRobustResourceInitEnabled() const override;
// Entry point for command processor thread, calls processTasksImpl to do the
// work. called by RendererVk::initialization on main thread
void processTasks();
// Called asynchronously from main thread to queue work that is then processed by the worker
// thread
void queueCommand(vk::Context *context, vk::CommandProcessorTask *task);
// Used by main thread to wait for worker thread to complete all outstanding work.
void waitForWorkComplete(vk::Context *context);
Serial getCurrentQueueSerial();
Serial getLastSubmittedSerial();
// Wait until desired serial has been processed.
void finishToSerial(vk::Context *context, Serial serial);
vk::Shared<vk::Fence> getLastSubmittedFence() const;
void handleDeviceLost();
bool hasPendingError() const
{
std::lock_guard<std::mutex> queueLock(mErrorMutex);
return !mErrors.empty();
}
vk::Error getAndClearPendingError();
// Stop the command processor thread
void shutdown(std::thread *commandProcessorThread);
void finishAllWork(vk::Context *context);
private:
std::queue<vk::CommandProcessorTask> mCommandsQueue;
std::mutex mWorkerMutex;
// Command processor thread, called by processTasks. The loop waits for work to
// be submitted from a separate thread.
angle::Result processTasksImpl(bool *exitThread);
// Command processor thread, process a task
angle::Result processTask(vk::CommandProcessorTask *task);
std::queue<vk::CommandProcessorTask> mTasks;
mutable std::mutex mWorkerMutex;
// Signal worker thread when work is available
std::condition_variable mWorkAvailableCondition;
// Signal main thread when all work completed
std::condition_variable mWorkerIdleCondition;
mutable std::condition_variable mWorkerIdleCondition;
// Track worker thread Idle state for assertion purposes
bool mWorkerThreadIdle;
// Command pool to allocate processor thread primary command buffers from
vk::CommandPool mCommandPool;
vk::PrimaryCommandBuffer mPrimaryCommandBuffer;
TaskProcessor mTaskProcessor;
AtomicSerialFactory mQueueSerialFactory;
std::mutex mCommandProcessorQueueSerialMutex;
Serial mCommandProcessorLastSubmittedSerial;
Serial mCommandProcessorCurrentQueueSerial;
mutable std::mutex mErrorMutex;
std::queue<vk::Error> mErrors;
};
} // namespace vk
} // namespace rx
#endif // LIBANGLE_RENDERER_VULKAN_COMMAND_PROCESSOR_H_
......@@ -480,7 +480,6 @@ void CommandQueue::clearAllGarbage(RendererVk *renderer)
}
angle::Result CommandQueue::allocatePrimaryCommandBuffer(vk::Context *context,
const vk::CommandPool &commandPool,
vk::PrimaryCommandBuffer *commandBufferOut)
{
return mPrimaryCommandPool.allocate(context, commandBufferOut);
......@@ -523,6 +522,8 @@ bool CommandQueue::hasInFlightCommands() const
angle::Result CommandQueue::finishToSerial(vk::Context *context, Serial serial, uint64_t timeout)
{
ASSERT(!context->getRenderer()->getFeatures().asynchronousCommandProcessing.enabled);
if (mInFlightCommands.empty())
{
return angle::Result::Continue;
......@@ -607,6 +608,8 @@ angle::Result CommandQueue::submitFrame(vk::Context *context,
vk::Shared<vk::Fence> CommandQueue::getLastSubmittedFence(const vk::Context *context) const
{
ASSERT(!context->getRenderer()->getFeatures().enableCommandProcessingThread.enabled);
vk::Shared<vk::Fence> fence;
if (!mInFlightCommands.empty())
{
......@@ -958,7 +961,7 @@ angle::Result ContextVk::initialize()
angle::Result ContextVk::startPrimaryCommandBuffer()
{
ANGLE_TRY(mCommandQueue.allocatePrimaryCommandBuffer(this, mCommandPool, &mPrimaryCommands));
ANGLE_TRY(mCommandQueue.allocatePrimaryCommandBuffer(this, &mPrimaryCommands));
VkCommandBufferBeginInfo beginInfo = {};
beginInfo.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO;
......@@ -1407,6 +1410,8 @@ angle::Result ContextVk::handleDirtyGraphicsPipeline(const gl::Context *context,
commandBuffer->bindGraphicsPipeline(mCurrentGraphicsPipeline->getPipeline());
// Update the queue serial for the pipeline object.
ASSERT(mCurrentGraphicsPipeline && mCurrentGraphicsPipeline->valid());
// TODO: https://issuetracker.google.com/issues/169788986: Need to change this so that we get
// the actual serial used when this work is submitted.
mCurrentGraphicsPipeline->updateSerial(getCurrentQueueSerial());
return angle::Result::Continue;
}
......@@ -1421,6 +1426,8 @@ angle::Result ContextVk::handleDirtyComputePipeline(const gl::Context *context,
}
commandBuffer->bindComputePipeline(mCurrentComputePipeline->get());
// TODO: https://issuetracker.google.com/issues/169788986: Need to change this so that we get
// the actual serial used when this work is submitted.
mCurrentComputePipeline->updateSerial(getCurrentQueueSerial());
return angle::Result::Continue;
......@@ -1750,10 +1757,30 @@ void ContextVk::addOverlayUsedBuffersCount(vk::CommandBufferHelper *commandBuffe
}
}
void ContextVk::commandProcessorSyncErrors()
{
while (mRenderer->hasPendingError())
{
vk::Error error = mRenderer->getAndClearPendingError();
if (error.mErrorCode != VK_SUCCESS)
{
handleError(error.mErrorCode, error.mFile, error.mFunction, error.mLine);
}
}
}
// Forward any pending worker-thread errors to this context, then hand |command| to the
// CommandProcessor for processing on the worker thread.
void ContextVk::commandProcessorSyncErrorsAndQueueCommand(vk::CommandProcessorTask *command)
{
    commandProcessorSyncErrors();
    mRenderer->queueCommand(this, command);
}
angle::Result ContextVk::submitFrame(const VkSubmitInfo &submitInfo,
vk::ResourceUseList *resourceList,
vk::PrimaryCommandBuffer &&commandBuffer)
{
ASSERT(!getRenderer()->getFeatures().enableCommandProcessingThread.enabled);
if (vk::CommandBufferHelper::kEnableCommandStreamDiagnostics)
{
dumpCommandStreamDiagnostics();
......@@ -1932,6 +1959,8 @@ angle::Result ContextVk::synchronizeCpuGpuTime()
double TeS = platform->monotonicallyIncreasingTime(platform);
// Get the query results
// Note: This LastSubmittedQueueSerial may include more work than was submitted above if
// another thread had submitted work.
ANGLE_TRY(finishToSerial(getLastSubmittedQueueSerial()));
uint64_t gpuTimestampCycles = 0;
......@@ -2086,13 +2115,22 @@ void ContextVk::flushGpuEvents(double nextSyncGpuTimestampS, double nextSyncCpuT
void ContextVk::clearAllGarbage()
{
ANGLE_TRACE_EVENT0("gpu.angle", "ContextVk::clearAllGarbage");
ANGLE_TRACE_EVENT0("gpu.angle", "ContextVk::finishAllWork");
for (vk::GarbageObject &garbage : mCurrentGarbage)
{
garbage.destroy(mRenderer);
}
mCurrentGarbage.clear();
mCommandQueue.clearAllGarbage(mRenderer);
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
// Issue command to CommandProcessor to ensure all work is complete, which will return any
// garbage items as well.
mRenderer->finishAllWork(this);
}
else
{
mCommandQueue.clearAllGarbage(mRenderer);
}
}
void ContextVk::handleDeviceLost()
......@@ -2100,7 +2138,14 @@ void ContextVk::handleDeviceLost()
mOutsideRenderPassCommands->reset();
mRenderPassCommands->reset();
mCommandQueue.handleDeviceLost(mRenderer);
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
mRenderer->handleDeviceLost();
}
else
{
mCommandQueue.handleDeviceLost(mRenderer);
}
clearAllGarbage();
mRenderer->notifyDeviceLost();
......@@ -3283,6 +3328,10 @@ angle::Result ContextVk::onMakeCurrent(const gl::Context *context)
// Flush outstanding work and detach this context from its window surface when it is
// unmade current.
angle::Result ContextVk::onUnMakeCurrent(const gl::Context *context)
{
    ANGLE_TRY(flushImpl(nullptr));
    const bool useWorkerThread = mRenderer->getFeatures().enableCommandProcessingThread.enabled;
    if (useWorkerThread)
    {
        // Drain the command processor so no queued tasks still reference this context.
        mRenderer->waitForCommandProcessorIdle(this);
    }
    mCurrentWindowSurface = nullptr;
    return angle::Result::Continue;
}
......@@ -4286,32 +4335,66 @@ angle::Result ContextVk::flushImpl(const vk::Semaphore *signalSemaphore)
mDefaultUniformStorage.releaseInFlightBuffersToResourceUseList(this);
mStagingBuffer.releaseInFlightBuffersToResourceUseList(this);
// TODO: https://issuetracker.google.com/issues/170329600 - Verify that
// waitForSwapchainImageIfNecessary makes sense both w/ & w/o threading. I believe they do, but
// want confirmation.
waitForSwapchainImageIfNecessary();
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
// Worker thread must complete adding any commands that were just flushed above to the
// primary command buffer before we can End the primary command buffer below.
mRenderer->waitForWorkerThreadIdle();
}
// Some tasks from ContextVk::submitFrame() that run before CommandQueue::submitFrame()
gl::RunningGraphWidget *renderPassCount =
mState.getOverlay()->getRunningGraphWidget(gl::WidgetId::VulkanRenderPassCount);
renderPassCount->add(mRenderPassCommands->getAndResetCounter());
renderPassCount->next();
ANGLE_VK_TRY(this, mPrimaryCommands.end());
if (vk::CommandBufferHelper::kEnableCommandStreamDiagnostics)
{
dumpCommandStreamDiagnostics();
}
waitForSwapchainImageIfNecessary();
// Send a flush command to worker thread that will:
// 1. Create submitInfo
// 2. Call submitFrame()
// 3. Allocate new primary command buffer
vk::CommandProcessorTask flushAndQueueSubmit;
flushAndQueueSubmit.initFlushAndQueueSubmit(
std::move(mWaitSemaphores), std::move(mWaitSemaphoreStageMasks), signalSemaphore,
mContextPriority, std::move(mCurrentGarbage), std::move(mResourceUseList));
VkSubmitInfo submitInfo = {};
InitializeSubmitInfo(&submitInfo, mPrimaryCommands, mWaitSemaphores, mWaitSemaphoreStageMasks,
signalSemaphore);
commandProcessorSyncErrorsAndQueueCommand(&flushAndQueueSubmit);
ANGLE_TRY(submitFrame(submitInfo, &mResourceUseList, std::move(mPrimaryCommands)));
// Some tasks from ContextVk::submitFrame() that run after CommandQueue::submitFrame()
onRenderPassFinished();
mComputeDirtyBits |= mNewComputeCommandBufferDirtyBits;
ANGLE_TRY(startPrimaryCommandBuffer());
if (mGpuEventsEnabled)
{
ANGLE_TRY(checkCompletedGpuEvents());
}
}
else
{
ANGLE_VK_TRY(this, mPrimaryCommands.end());
VkSubmitInfo submitInfo = {};
InitializeSubmitInfo(&submitInfo, mPrimaryCommands, mWaitSemaphores,
mWaitSemaphoreStageMasks, signalSemaphore);
ANGLE_TRY(submitFrame(submitInfo, &mResourceUseList, std::move(mPrimaryCommands)));
ANGLE_TRY(startPrimaryCommandBuffer());
mWaitSemaphores.clear();
mWaitSemaphoreStageMasks.clear();
}
mPerfCounters.renderPasses = 0;
mPerfCounters.writeDescriptorSets = 0;
mPerfCounters.flushedOutsideRenderPassCommandBuffers = 0;
mPerfCounters.resolveImageCommands = 0;
mWaitSemaphores.clear();
mWaitSemaphoreStageMasks.clear();
ASSERT(mWaitSemaphores.empty());
ASSERT(mWaitSemaphoreStageMasks.empty());
mPerfCounters.primaryBuffers++;
......@@ -4331,8 +4414,15 @@ angle::Result ContextVk::finishImpl()
ANGLE_TRY(flushImpl(nullptr));
ANGLE_TRY(finishToSerial(getLastSubmittedQueueSerial()));
ASSERT(!mCommandQueue.hasInFlightCommands());
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
ANGLE_TRY(finishToSerial(getLastSubmittedQueueSerial()));
}
else
{
ANGLE_TRY(finishToSerial(getLastSubmittedQueueSerial()));
ASSERT(!mCommandQueue.hasInFlightCommands());
}
clearAllGarbage();
......@@ -4373,17 +4463,25 @@ bool ContextVk::isSerialInUse(Serial serial) const
// Poll the in-flight command batches for completion so finished serials/resources can be
// reclaimed. Only valid on the non-threaded path: with the command processor thread enabled,
// completion tracking is owned by the worker.
angle::Result ContextVk::checkCompletedCommands()
{
    ASSERT(!mRenderer->getFeatures().enableCommandProcessingThread.enabled);
    return mCommandQueue.checkCompletedCommands(this);
}
// Block until all work up to and including |serial| has completed, routing through either
// the context-owned CommandQueue or the command processor thread.
angle::Result ContextVk::finishToSerial(Serial serial)
{
    if (!mRenderer->getFeatures().enableCommandProcessingThread.enabled)
    {
        return mCommandQueue.finishToSerial(this, serial, mRenderer->getMaxFenceWaitTimeNs());
    }

    // Threaded path: the CommandProcessor owns submission, so it performs the wait.
    mRenderer->finishToSerial(this, serial);
    return angle::Result::Continue;
}
// Look up (or create) a render pass compatible with |desc| in this context's cache.
angle::Result ContextVk::getCompatibleRenderPass(const vk::RenderPassDesc &desc,
                                                 vk::RenderPass **renderPassOut)
{
    // Note: Each context has its own RenderPassCache so no locking needed.
    return mRenderPassCache.getCompatibleRenderPass(this, desc, renderPassOut);
}
......@@ -4391,6 +4489,7 @@ angle::Result ContextVk::getRenderPassWithOps(const vk::RenderPassDesc &desc,
const vk::AttachmentOpsArray &ops,
vk::RenderPass **renderPassOut)
{
// Note: Each context has its own RenderPassCache so no locking needed.
return mRenderPassCache.getRenderPassWithOps(this, desc, ops, renderPassOut);
}
......@@ -4400,12 +4499,13 @@ angle::Result ContextVk::ensureSubmitFenceInitialized()
{
return angle::Result::Continue;
}
return mRenderer->newSharedFence(this, &mSubmitFence);
}
angle::Result ContextVk::getNextSubmitFence(vk::Shared<vk::Fence> *sharedFenceOut)
{
ASSERT(!getRenderer()->getFeatures().enableCommandProcessingThread.enabled);
ANGLE_TRY(ensureSubmitFenceInitialized());
ASSERT(!sharedFenceOut->isReferenced());
......@@ -4415,6 +4515,10 @@ angle::Result ContextVk::getNextSubmitFence(vk::Shared<vk::Fence> *sharedFenceOu
// Fence signaled by the most recent queue submission. Its owner depends on whether the
// command processor thread is handling submissions.
vk::Shared<vk::Fence> ContextVk::getLastSubmittedFence() const
{
    const bool useWorkerThread = mRenderer->getFeatures().enableCommandProcessingThread.enabled;
    return useWorkerThread ? mRenderer->getLastSubmittedFence()
                           : mCommandQueue.getLastSubmittedFence(this);
}
......@@ -4817,17 +4921,26 @@ angle::Result ContextVk::flushCommandsAndEndRenderPass()
mRenderPassCommands->endRenderPass(this);
if (vk::CommandBufferHelper::kEnableCommandStreamDiagnostics)
{
mRenderPassCommands->addCommandDiagnostics(this);
}
vk::RenderPass *renderPass = nullptr;
ANGLE_TRY(mRenderPassCommands->getRenderPassWithOps(this, &renderPass));
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
mRenderPassCommands->markClosed();
vk::CommandProcessorTask task = {this, &mPrimaryCommands, mRenderPassCommands};
vk::CommandProcessorTask flushToPrimary;
flushToPrimary.initProcessCommands(this, mRenderPassCommands, renderPass);
ANGLE_TRACE_EVENT0("gpu.angle", "ContextVk::flushInsideRenderPassCommands");
queueCommandsToWorker(task);
commandProcessorSyncErrorsAndQueueCommand(&flushToPrimary);
getNextAvailableCommandBuffer(&mRenderPassCommands, true);
}
else
{
ANGLE_TRY(mRenderPassCommands->flushToPrimary(this, &mPrimaryCommands));
ANGLE_TRY(mRenderPassCommands->flushToPrimary(this->getFeatures(), &mPrimaryCommands,
renderPass));
}
mHasPrimaryCommands = true;
......@@ -4958,17 +5071,26 @@ angle::Result ContextVk::flushOutsideRenderPassCommands()
addOverlayUsedBuffersCount(mOutsideRenderPassCommands);
if (vk::CommandBufferHelper::kEnableCommandStreamDiagnostics)
{
mOutsideRenderPassCommands->addCommandDiagnostics(this);
}
vk::RenderPass *renderPass = nullptr;
ANGLE_TRY(mOutsideRenderPassCommands->getRenderPassWithOps(this, &renderPass));
if (mRenderer->getFeatures().enableCommandProcessingThread.enabled)
{
mOutsideRenderPassCommands->markClosed();
vk::CommandProcessorTask task = {this, &mPrimaryCommands, mOutsideRenderPassCommands};
vk::CommandProcessorTask flushToPrimary;
flushToPrimary.initProcessCommands(this, mOutsideRenderPassCommands, renderPass);
ANGLE_TRACE_EVENT0("gpu.angle", "ContextVk::flushOutsideRenderPassCommands");
queueCommandsToWorker(task);
commandProcessorSyncErrorsAndQueueCommand(&flushToPrimary);
getNextAvailableCommandBuffer(&mOutsideRenderPassCommands, false);
}
else
{
ANGLE_TRY(mOutsideRenderPassCommands->flushToPrimary(this, &mPrimaryCommands));
ANGLE_TRY(mOutsideRenderPassCommands->flushToPrimary(getFeatures(), &mPrimaryCommands,
renderPass));
}
mHasPrimaryCommands = true;
mPerfCounters.flushedOutsideRenderPassCommandBuffers++;
......
......@@ -47,7 +47,6 @@ class CommandQueue final : angle::NonCopyable
bool hasInFlightCommands() const;
angle::Result allocatePrimaryCommandBuffer(vk::Context *context,
const vk::CommandPool &commandPool,
vk::PrimaryCommandBuffer *commandBufferOut);
angle::Result releasePrimaryCommandBuffer(vk::Context *context,
vk::PrimaryCommandBuffer &&commandBuffer);
......@@ -653,11 +652,10 @@ class ContextVk : public ContextImpl, public vk::Context
void updateOverlayOnPresent();
void addOverlayUsedBuffersCount(vk::CommandBufferHelper *commandBuffer);
// Submit commands to worker thread for processing
ANGLE_INLINE void queueCommandsToWorker(const vk::CommandProcessorTask &commands)
{
mRenderer->queueCommands(commands);
}
// Sync any errors from the command processor
void commandProcessorSyncErrors();
// Sync any error from worker thread and queue up next command for processing
void commandProcessorSyncErrorsAndQueueCommand(vk::CommandProcessorTask *command);
// When worker thread completes, it releases command buffers back to context queue
void recycleCommandBuffer(vk::CommandBufferHelper *commandBuffer);
......@@ -1085,6 +1083,7 @@ class ContextVk : public ContextImpl, public vk::Context
// We use a single pool for recording commands. We also keep a free list for pool recycling.
vk::CommandPool mCommandPool;
// TODO: This can be killed once threading is enabled https://issuetracker.google.com/153666475
CommandQueue mCommandQueue;
vk::GarbageList mCurrentGarbage;
......
......@@ -180,18 +180,30 @@ angle::Result QueryVk::getResult(const gl::Context *context, bool wait)
// finite time.
// Note regarding time-elapsed: end should have been called after begin, so flushing when end
// has pending work should flush begin too.
if (mQueryHelper.hasPendingWork(contextVk))
// TODO: https://issuetracker.google.com/169788986 - can't guarantee hasPendingWork() works when
// using threaded worker
if (mQueryHelper.hasPendingWork(contextVk) ||
contextVk->getRenderer()->getFeatures().enableCommandProcessingThread.enabled)
{
ANGLE_TRY(contextVk->flushImpl(nullptr));
if (contextVk->getRenderer()->getFeatures().enableCommandProcessingThread.enabled)
{
// TODO: https://issuetracker.google.com/170312581 - For now just stalling here
contextVk->getRenderer()->waitForCommandProcessorIdle(contextVk);
}
ASSERT(!mQueryHelperTimeElapsedBegin.hasPendingWork(contextVk));
ASSERT(!mQueryHelper.hasPendingWork(contextVk));
}
// If the command buffer this query is being written to is still in flight, its reset command
// may not have been performed by the GPU yet. To avoid a race condition in this case, wait
// for the batch to finish first before querying (or return not-ready if not waiting).
ANGLE_TRY(contextVk->checkCompletedCommands());
if (!contextVk->getRenderer()->getFeatures().enableCommandProcessingThread.enabled)
{
// If the command buffer this query is being written to is still in flight, its reset
// command may not have been performed by the GPU yet. To avoid a race condition in this
// case, wait for the batch to finish first before querying (or return not-ready if not
// waiting).
ANGLE_TRY(contextVk->checkCompletedCommands());
}
if (contextVk->isSerialInUse(mQueryHelper.getStoredQueueSerial()))
{
if (!wait)
......
......@@ -467,6 +467,7 @@ RendererVk::RendererVk()
mPipelineCacheVkUpdateTimeout(kPipelineCacheVkUpdatePeriod),
mPipelineCacheDirty(false),
mPipelineCacheInitialized(false),
mCommandProcessor(this),
mGlslangInitialized(false)
{
VkFormatProperties invalid = {0, 0, kInvalidFormatFeatureFlags};
......@@ -536,6 +537,10 @@ void RendererVk::onDestroy()
std::lock_guard<std::mutex> lock(mFenceRecyclerMutex);
mFenceRecycler.destroy(mDevice);
}
{
std::lock_guard<decltype(mNextSubmitFenceMutex)> lock(mNextSubmitFenceMutex);
mNextSubmitFence.reset(mDevice);
}
mPipelineCache.destroy(mDevice);
mSamplerCache.destroy(this);
......@@ -580,7 +585,7 @@ void RendererVk::notifyDeviceLost()
{
{
std::lock_guard<std::mutex> lock(mQueueSerialMutex);
mLastCompletedQueueSerial = mLastSubmittedQueueSerial;
mLastCompletedQueueSerial = getLastSubmittedQueueSerial();
}
mDeviceLost = true;
mDisplay->notifyDeviceLost();
......@@ -909,7 +914,8 @@ angle::Result RendererVk::initialize(DisplayVk *displayVk,
if (getFeatures().enableCommandProcessingThread.enabled)
{
mCommandProcessorThread =
std::thread(&CommandProcessor::processCommandProcessorTasks, &mCommandProcessor);
std::thread(&vk::CommandProcessor::processTasks, &mCommandProcessor);
mCommandProcessor.waitForWorkComplete(nullptr);
}
return angle::Result::Continue;
......@@ -1907,6 +1913,9 @@ void RendererVk::initFeatures(DisplayVk *displayVk, const ExtensionNameList &dev
// Currently disabled by default: http://anglebug.com/4324
ANGLE_FEATURE_CONDITION(&mFeatures, enableCommandProcessingThread, false);
// Currently disabled by default: http://anglebug.com/4324
ANGLE_FEATURE_CONDITION(&mFeatures, asynchronousCommandProcessing, false);
ANGLE_FEATURE_CONDITION(&mFeatures, supportsYUVSamplerConversion,
mSamplerYcbcrConversionFeatures.samplerYcbcrConversion != VK_FALSE);
......@@ -2183,6 +2192,17 @@ bool RendererVk::hasBufferFormatFeatureBits(VkFormat format,
return hasFormatFeatureBits<&VkFormatProperties::bufferFeatures>(format, featureBits);
}
// Log the Vulkan Memory Allocator statistics as a JSON string. The output can be fed to
// VmaDumpVis.py to generate a visualization of the allocations VMA has performed.
void RendererVk::outputVmaStatString()
{
    char *statsJson = nullptr;
    // true requests the detailed stats (per VMA's buildStatsString second parameter).
    mAllocator.buildStatsString(&statsJson, true);
    INFO() << std::endl << statsJson << std::endl;
    // The string is allocated by VMA and must be handed back to it.
    mAllocator.freeStatsString(statsJson);
}
angle::Result RendererVk::queueSubmit(vk::Context *context,
egl::ContextPriority priority,
const VkSubmitInfo &submitInfo,
......@@ -2192,23 +2212,11 @@ angle::Result RendererVk::queueSubmit(vk::Context *context,
{
if (kOutputVmaStatsString)
{
// Output the VMA stats string
// This JSON string can be passed to VmaDumpVis.py to generate a visualization of the
// allocations the VMA has performed.
char *statsString;
mAllocator.buildStatsString(&statsString, true);
INFO() << std::endl << statsString << std::endl;
mAllocator.freeStatsString(statsString);
outputVmaStatString();
}
if (getFeatures().enableCommandProcessingThread.enabled)
{
// For initial threading phase 1 code make sure any outstanding command processing
// is complete.
// TODO: b/153666475 For phase2 investigate if this is required as most submits will take
// place through worker thread except for one-off submits below.
mCommandProcessor.waitForWorkComplete();
}
ASSERT(!getFeatures().enableCommandProcessingThread.enabled);
{
std::lock_guard<decltype(mQueueMutex)> lock(mQueueMutex);
std::lock_guard<std::mutex> serialLock(mQueueSerialMutex);
......@@ -2235,12 +2243,26 @@ angle::Result RendererVk::queueSubmitOneOff(vk::Context *context,
const vk::Fence *fence,
Serial *serialOut)
{
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = primary.ptr();
if (getFeatures().enableCommandProcessingThread.enabled)
{
vk::CommandProcessorTask oneOffQueueSubmit;
oneOffQueueSubmit.initOneOffQueueSubmit(primary.getHandle(), priority, fence);
queueCommand(context, &oneOffQueueSubmit);
waitForCommandProcessorIdle(context);
*serialOut = getLastSubmittedQueueSerial();
ANGLE_TRY(cleanupGarbage(false));
}
else
{
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
submitInfo.commandBufferCount = 1;
submitInfo.pCommandBuffers = primary.ptr();
ANGLE_TRY(queueSubmit(context, priority, submitInfo, nullptr, fence, serialOut));
ANGLE_TRY(queueSubmit(context, priority, submitInfo, nullptr, fence, serialOut));
}
mPendingOneOffCommands.push_back({*serialOut, std::move(primary)});
......@@ -2251,8 +2273,8 @@ angle::Result RendererVk::queueWaitIdle(vk::Context *context, egl::ContextPriori
{
if (getFeatures().enableCommandProcessingThread.enabled)
{
// First make sure command processor is complete when waiting for queue idle.
mCommandProcessor.waitForWorkComplete();
// Wait for all pending commands to get sent before issuing vkQueueWaitIdle
waitForCommandProcessorIdle(context);
}
{
std::lock_guard<decltype(mQueueMutex)> lock(mQueueMutex);
......@@ -2268,8 +2290,8 @@ angle::Result RendererVk::deviceWaitIdle(vk::Context *context)
{
if (getFeatures().enableCommandProcessingThread.enabled)
{
// First make sure command processor is complete when waiting for device idle.
mCommandProcessor.waitForWorkComplete();
// Wait for all pending commands to get sent before issuing vkQueueWaitIdle
waitForCommandProcessorIdle(context);
}
{
std::lock_guard<decltype(mQueueMutex)> lock(mQueueMutex);
......@@ -2286,12 +2308,7 @@ VkResult RendererVk::queuePresent(egl::ContextPriority priority,
{
ANGLE_TRACE_EVENT0("gpu.angle", "RendererVk::queuePresent");
if (getFeatures().enableCommandProcessingThread.enabled)
{
// First make sure command processor is complete before queue present as
// present may have dependencies on that thread.
mCommandProcessor.waitForWorkComplete();
}
ASSERT(!getFeatures().enableCommandProcessingThread.enabled);
std::lock_guard<decltype(mQueueMutex)> lock(mQueueMutex);
......@@ -2329,6 +2346,30 @@ angle::Result RendererVk::newSharedFence(vk::Context *context,
return angle::Result::Continue;
}
// Return a shared fence to be used for the next submit.
// The fence may be shared with a Sync object (which can outlive the context, as EGLSync
// objects do).
// |reset| indicates that mNextSubmitFence should be reset before returning. This ensures that
// the next request for a submit fence gets a fresh fence.
// TODO: https://issuetracker.google.com/issues/170312581 - move to CommandProcessor as part of
// fence ownership follow-up task.
angle::Result RendererVk::getNextSubmitFence(vk::Shared<vk::Fence> *sharedFenceOut, bool reset)
{
    std::lock_guard<decltype(mNextSubmitFenceMutex)> lock(mNextSubmitFenceMutex);
    // Lazily create the shared fence on the first request since the last reset.
    if (!mNextSubmitFence.isReferenced())
    {
        // Errors are attributed to the CommandProcessor context.
        ANGLE_TRY(newSharedFence(&mCommandProcessor, &mNextSubmitFence));
    }
    ASSERT(!sharedFenceOut->isReferenced());
    // Hand the caller an additional reference to the same fence.
    sharedFenceOut->copy(getDevice(), mNextSubmitFence);
    if (reset)
    {
        resetSharedFence(&mNextSubmitFence);
    }
    return angle::Result::Continue;
}
template <VkFormatFeatureFlags VkFormatProperties::*features>
VkFormatFeatureFlags RendererVk::getFormatFeatureBits(VkFormat format,
const VkFormatFeatureFlags featureBits) const
......
......@@ -168,6 +168,7 @@ class RendererVk : angle::NonCopyable
return mPriorities[priority];
}
// Queue submit that originates from the main thread
angle::Result queueSubmit(vk::Context *context,
egl::ContextPriority priority,
const VkSubmitInfo &submitInfo,
......@@ -197,6 +198,8 @@ class RendererVk : angle::NonCopyable
sharedFenceIn->resetAndRecycle(&mFenceRecycler);
}
angle::Result getNextSubmitFence(vk::Shared<vk::Fence> *sharedFenceOut, bool reset);
template <typename... ArgsT>
void collectGarbageAndReinit(vk::SharedResourceUse *use, ArgsT... garbageIn)
{
......@@ -224,6 +227,12 @@ class RendererVk : angle::NonCopyable
}
}
    // Fence from the most recent submission performed by the CommandProcessor.
    vk::Shared<vk::Fence> getLastSubmittedFence() const
    {
        return mCommandProcessor.getLastSubmittedFence();
    }
    // Forward device-lost handling to the CommandProcessor.
    void handleDeviceLost() { mCommandProcessor.handleDeviceLost(); }
static constexpr size_t kMaxExtensionNames = 200;
using ExtensionNameList = angle::FixedVector<const char *, kMaxExtensionNames>;
......@@ -241,11 +250,19 @@ class RendererVk : angle::NonCopyable
ANGLE_INLINE Serial getCurrentQueueSerial()
{
if (getFeatures().enableCommandProcessingThread.enabled)
{
return mCommandProcessor.getCurrentQueueSerial();
}
std::lock_guard<std::mutex> lock(mQueueSerialMutex);
return mCurrentQueueSerial;
}
ANGLE_INLINE Serial getLastSubmittedQueueSerial()
{
if (getFeatures().enableCommandProcessingThread.enabled)
{
return mCommandProcessor.getLastSubmittedSerial();
}
std::lock_guard<std::mutex> lock(mQueueSerialMutex);
return mLastSubmittedQueueSerial;
}
......@@ -264,11 +281,24 @@ class RendererVk : angle::NonCopyable
vk::ActiveHandleCounter &getActiveHandleCounts() { return mActiveHandleCounts; }
// Queue commands to worker thread for processing
void queueCommands(const vk::CommandProcessorTask &commands)
void queueCommand(vk::Context *context, vk::CommandProcessorTask *command)
{
mCommandProcessor.queueCommands(commands);
mCommandProcessor.queueCommand(context, command);
}
bool hasPendingError() const { return mCommandProcessor.hasPendingError(); }
vk::Error getAndClearPendingError() { return mCommandProcessor.getAndClearPendingError(); }
    // Block the calling thread until the CommandProcessor has drained its task queue.
    // Worker errors surfaced during the wait are attributed to |context|.
    void waitForCommandProcessorIdle(vk::Context *context)
    {
        mCommandProcessor.waitForWorkComplete(context);
    }
void waitForWorkerThreadIdle() { mCommandProcessor.waitForWorkComplete(); }
    // Ask the CommandProcessor to wait until work up to |serial| has completed.
    void finishToSerial(vk::Context *context, Serial serial)
    {
        mCommandProcessor.finishToSerial(context, serial);
    }
    // Drain and complete every outstanding task owned by the CommandProcessor.
    void finishAllWork(vk::Context *context) { mCommandProcessor.finishAllWork(context); }
VkQueue getVkQueue(egl::ContextPriority priority) const { return mQueues[priority]; }
bool getEnableValidationLayers() const { return mEnableValidationLayers; }
......@@ -276,6 +306,8 @@ class RendererVk : angle::NonCopyable
void setGlobalDebugAnnotator();
void outputVmaStatString();
private:
angle::Result initializeDevice(DisplayVk *displayVk, uint32_t queueFamilyIndex);
void ensureCapsInitialized() const;
......@@ -390,9 +422,13 @@ class RendererVk : angle::NonCopyable
};
std::deque<PendingOneOffCommands> mPendingOneOffCommands;
// Worker Thread
CommandProcessor mCommandProcessor;
// Command Processor Thread
vk::CommandProcessor mCommandProcessor;
std::thread mCommandProcessorThread;
// mNextSubmitFence is the fence that's going to be signaled at the next submission. This is
// used to support SyncVk objects, which may outlive the context (as EGLSync objects).
vk::Shared<vk::Fence> mNextSubmitFence;
std::mutex mNextSubmitFenceMutex;
// track whether we initialized (or released) glslang
bool mGlslangInitialized;
......
......@@ -102,11 +102,22 @@ ResourceUseList::ResourceUseList()
mResourceUses.reserve(kDefaultResourceUseCount);
}
// Move constructor: delegates to the move-assignment operator, which swaps this
// freshly-constructed (empty) list with |other|'s, leaving |other| empty.
ResourceUseList::ResourceUseList(ResourceUseList &&other)
{
    *this = std::move(other);
}
// All uses must have been released (releaseResourceUses) before destruction.
ResourceUseList::~ResourceUseList()
{
    ASSERT(mResourceUses.empty());
}
// Move assignment implemented via swap; the previous contents of this list end up in |rhs|.
ResourceUseList &ResourceUseList::operator=(ResourceUseList &&rhs)
{
    std::swap(mResourceUses, rhs.mResourceUses);
    return *this;
}
void ResourceUseList::releaseResourceUses()
{
for (SharedResourceUse &use : mResourceUses)
......
......@@ -136,7 +136,9 @@ class ResourceUseList final : angle::NonCopyable
{
public:
ResourceUseList();
ResourceUseList(ResourceUseList &&other);
virtual ~ResourceUseList();
ResourceUseList &operator=(ResourceUseList &&rhs);
void add(const SharedResourceUse &resourceUse);
......
......@@ -1186,6 +1186,7 @@ angle::Result WindowSurfaceVk::present(ContextVk *contextVk,
bool *presentOutOfDate)
{
ANGLE_TRACE_EVENT0("gpu.angle", "WindowSurfaceVk::present");
RendererVk *renderer = contextVk->getRenderer();
// Throttle the submissions to avoid getting too far ahead of the GPU.
SwapHistory &swap = mSwapHistory[mCurrentSwapHistoryIndex];
......@@ -1194,7 +1195,7 @@ angle::Result WindowSurfaceVk::present(ContextVk *contextVk,
if (swap.sharedFence.isReferenced())
{
ANGLE_TRY(swap.waitFence(contextVk));
swap.destroy(contextVk->getRenderer());
swap.destroy(renderer);
}
}
......@@ -1316,6 +1317,8 @@ angle::Result WindowSurfaceVk::present(ContextVk *contextVk,
}
// Update the swap history for this presentation
// TODO: https://issuetracker.google.com/issues/170312581 - this will force us to flush worker
// queue to get the fence.
swap.sharedFence = contextVk->getLastSubmittedFence();
ASSERT(!mAcquireImageSemaphore.valid());
......@@ -1323,13 +1326,47 @@ angle::Result WindowSurfaceVk::present(ContextVk *contextVk,
mCurrentSwapHistoryIndex =
mCurrentSwapHistoryIndex == mSwapHistory.size() ? 0 : mCurrentSwapHistoryIndex;
VkResult result = contextVk->getRenderer()->queuePresent(contextVk->getPriority(), presentInfo);
VkResult result;
if (renderer->getFeatures().enableCommandProcessingThread.enabled)
{
vk::CommandProcessorTask present;
present.initPresent(contextVk->getPriority(), presentInfo);
// Make sure everything has been submitted (and errors handled)
renderer->waitForCommandProcessorIdle(contextVk);
// Submit queuePresent all by itself (ignoring interference from other threads for now)
renderer->queueCommand(contextVk, &present);
// TODO: https://issuetracker.google.com/issues/170329600 - Just stalling here for now, but
// really want to let main thread continue
// need to figure out how to handle work below off-thread and sync to main
// Also, need to fix lifetime of presentInfo data when main thread continues.
// There is a bunch of work happening after present to deal with swapchain recreation.
// Will that require moving a large chunk of swapImpl to the CommandProcessor?
// That will likely require serializing access to the WindowSurfaceVk object in order
// to have current content.
result = VK_SUCCESS;
// wait for the queuePresent to be submitted and intentionally set the context to nullptr so
// that we can catch any error. Note this doesn't prevent another context from grabbing the
// error. Will be fixed properly in a follow-up as part of present work.
renderer->waitForCommandProcessorIdle(nullptr);
if (renderer->hasPendingError())
{
vk::Error error = renderer->getAndClearPendingError();
result = error.mErrorCode;
}
}
else
{
result = renderer->queuePresent(contextVk->getPriority(), presentInfo);
}
// If OUT_OF_DATE is returned, it's ok, we just need to recreate the swapchain before
// continuing.
// If VK_SUBOPTIMAL_KHR is returned it's because the device orientation changed and we should
// recreate the swapchain with a new window orientation.
if (contextVk->getFeatures().enablePreRotateSurfaces.enabled)
if (renderer->getFeatures().enablePreRotateSurfaces.enabled)
{
// Also check for VK_SUBOPTIMAL_KHR.
*presentOutOfDate = ((result == VK_ERROR_OUT_OF_DATE_KHR) || (result == VK_SUBOPTIMAL_KHR));
......
......@@ -32,6 +32,12 @@ SyncHelper::~SyncHelper() {}
void SyncHelper::releaseToRenderer(RendererVk *renderer)
{
    // Hand the tracked resource use and the event over to the renderer's
    // garbage collector for deferred destruction.
    renderer->collectGarbageAndReinit(&mUse, &mEvent);

    // TODO: https://issuetracker.google.com/170312581 - Currently just stalling on worker thread
    // here to try and avoid race condition. If this works, need some alternate solution
    const bool commandProcessorEnabled =
        renderer->getFeatures().enableCommandProcessingThread.enabled;
    if (commandProcessorEnabled)
    {
        renderer->waitForCommandProcessorIdle(nullptr);
    }

    mFence.reset(renderer->getDevice());
}
......@@ -48,7 +54,17 @@ angle::Result SyncHelper::initialize(ContextVk *contextVk)
DeviceScoped<Event> event(device);
ANGLE_VK_TRY(contextVk, event.get().init(device, eventCreateInfo));
ANGLE_TRY(contextVk->getNextSubmitFence(&mFence));
// TODO: https://issuetracker.google.com/170312581 - For now wait for worker thread to finish
// then get next fence from renderer
if (contextVk->getRenderer()->getFeatures().enableCommandProcessingThread.enabled)
{
contextVk->getRenderer()->waitForCommandProcessorIdle(contextVk);
ANGLE_TRY(contextVk->getRenderer()->getNextSubmitFence(&mFence, false));
}
else
{
ANGLE_TRY(contextVk->getNextSubmitFence(&mFence));
}
mEvent = event.release();
......@@ -90,9 +106,17 @@ angle::Result SyncHelper::clientWait(Context *context,
ANGLE_TRY(contextVk->flushImpl(nullptr));
}
// If we are using worker need to wait for the commands to be issued before waiting on the
// fence.
if (contextVk->getRenderer()->getFeatures().enableCommandProcessingThread.enabled)
{
contextVk->getRenderer()->waitForCommandProcessorIdle(contextVk);
}
// Wait on the fence that's expected to be signaled on the first vkQueueSubmit after
// `initialize` was called. The first fence is the fence created to signal this sync.
ASSERT(mFence.get().valid());
// TODO: https://issuetracker.google.com/170312581 - Wait could be command to worker
VkResult status = mFence.get().wait(renderer->getDevice(), timeout);
// Check for errors, but don't consider timeout as such.
......@@ -189,11 +213,24 @@ angle::Result SyncHelperNativeFence::initializeWithFd(ContextVk *contextVk, int
retain(&contextVk->getResourceUseList());
Serial serialOut;
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
ANGLE_TRY(renderer->queueSubmit(contextVk, contextVk->getPriority(), submitInfo, nullptr,
&fence.get(), &serialOut));
if (renderer->getFeatures().enableCommandProcessingThread.enabled)
{
CommandProcessorTask oneOffQueueSubmit;
oneOffQueueSubmit.initOneOffQueueSubmit(VK_NULL_HANDLE, contextVk->getPriority(),
&fence.get());
renderer->queueCommand(contextVk, &oneOffQueueSubmit);
// TODO: https://issuetracker.google.com/170312581 - wait for now
renderer->waitForCommandProcessorIdle(contextVk);
}
else
{
Serial serialOut;
VkSubmitInfo submitInfo = {};
submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
ANGLE_TRY(renderer->queueSubmit(contextVk, contextVk->getPriority(), submitInfo,
nullptr, &fence.get(), &serialOut));
}
VkFenceGetFdInfoKHR fenceGetFdInfo = {};
fenceGetFdInfo.sType = VK_STRUCTURE_TYPE_FENCE_GET_FD_INFO_KHR;
......@@ -253,6 +290,14 @@ angle::Result SyncHelperNativeFence::clientWait(Context *context,
{
ANGLE_TRY(contextVk->flushImpl(nullptr));
}
// If we are using worker need to wait for the commands to be issued before waiting on the
// fence.
if (contextVk->getRenderer()->getFeatures().asynchronousCommandProcessing.enabled)
{
contextVk->getRenderer()->waitForCommandProcessorIdle(contextVk);
}
// Wait for mFenceWithFd to be signaled.
VkResult status = mFenceWithFd.wait(renderer->getDevice(), timeout);
......
......@@ -969,6 +969,7 @@ angle::Result UtilsVk::setupProgram(ContextVk *contextVk,
vk::PipelineAndSerial *pipelineAndSerial;
program->setShader(gl::ShaderType::Compute, fsCsShader);
ANGLE_TRY(program->getComputePipeline(contextVk, pipelineLayout.get(), &pipelineAndSerial));
// TODO: https://issuetracker.google.com/issues/169788986: Update serial handling.
pipelineAndSerial->updateSerial(serial);
commandBuffer->bindComputePipeline(pipelineAndSerial->get());
}
......
......@@ -916,7 +916,8 @@ void CommandBufferHelper::restoreStencilContent()
}
}
void CommandBufferHelper::executeBarriers(ContextVk *contextVk, PrimaryCommandBuffer *primary)
void CommandBufferHelper::executeBarriers(const angle::FeaturesVk &features,
PrimaryCommandBuffer *primary)
{
// make a local copy for faster access
PipelineStagesMask mask = mPipelineBarrierMask;
......@@ -925,7 +926,7 @@ void CommandBufferHelper::executeBarriers(ContextVk *contextVk, PrimaryCommandBu
return;
}
if (contextVk->getFeatures().preferAggregateBarrierCalls.enabled)
if (features.preferAggregateBarrierCalls.enabled)
{
PipelineStagesMask::Iterator iter = mask.begin();
PipelineBarrier &barrier = mPipelineBarriers[*iter];
......@@ -1152,24 +1153,31 @@ void CommandBufferHelper::endTransformFeedback()
mValidTransformFeedbackBufferCount = 0;
}
angle::Result CommandBufferHelper::flushToPrimary(ContextVk *contextVk,
PrimaryCommandBuffer *primary)
angle::Result CommandBufferHelper::getRenderPassWithOps(ContextVk *contextVk,
RenderPass **renderPass)
{
ANGLE_TRACE_EVENT0("gpu.angle", "CommandBufferHelper::flushToPrimary");
ASSERT(!empty());
if (kEnableCommandStreamDiagnostics)
*renderPass = nullptr;
if (mIsRenderPassCommandBuffer)
{
addCommandDiagnostics(contextVk);
ANGLE_TRY(contextVk->getRenderPassWithOps(mRenderPassDesc, mAttachmentOps, renderPass));
}
return angle::Result::Continue;
}
angle::Result CommandBufferHelper::flushToPrimary(const angle::FeaturesVk &features,
PrimaryCommandBuffer *primary,
RenderPass *renderPass)
{
ANGLE_TRACE_EVENT0("gpu.angle", "CommandBufferHelper::flushToPrimary");
ASSERT(!empty());
// Commands that are added to primary before beginRenderPass command
executeBarriers(contextVk, primary);
executeBarriers(features, primary);
if (mIsRenderPassCommandBuffer)
{
mCommandBuffer.executeQueuedResetQueryPoolCommands(primary->getHandle());
// Pull a RenderPass from the cache.
RenderPass *renderPass = nullptr;
ANGLE_TRY(contextVk->getRenderPassWithOps(mRenderPassDesc, mAttachmentOps, &renderPass));
ASSERT(renderPass != nullptr);
VkRenderPassBeginInfo beginInfo = {};
beginInfo.sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
......
......@@ -962,9 +962,12 @@ class CommandBufferHelper : angle::NonCopyable
CommandBuffer &getCommandBuffer() { return mCommandBuffer; }
angle::Result flushToPrimary(ContextVk *contextVk, PrimaryCommandBuffer *primary);
angle::Result getRenderPassWithOps(ContextVk *contextVk, RenderPass **renderPass);
angle::Result flushToPrimary(const angle::FeaturesVk &features,
PrimaryCommandBuffer *primary,
RenderPass *renderPass);
void executeBarriers(ContextVk *contextVk, PrimaryCommandBuffer *primary);
void executeBarriers(const angle::FeaturesVk &features, PrimaryCommandBuffer *primary);
void setHasRenderPass(bool hasRenderPass) { mIsRenderPassCommandBuffer = hasRenderPass; }
......@@ -1129,9 +1132,9 @@ class CommandBufferHelper : angle::NonCopyable
bool isReadOnlyDepthMode() const { return mReadOnlyDepthStencilMode; }
private:
void addCommandDiagnostics(ContextVk *contextVk);
private:
bool onDepthStencilAccess(ResourceAccess access,
uint32_t *cmdCountInvalidated,
uint32_t *cmdCountDisabled);
......
......@@ -943,15 +943,15 @@ gl::LevelIndex GetLevelIndex(vk::LevelIndex levelVk, gl::LevelIndex baseLevel);
} // namespace rx
#define ANGLE_VK_TRY(context, command) \
do \
{ \
auto ANGLE_LOCAL_VAR = command; \
if (ANGLE_UNLIKELY(ANGLE_LOCAL_VAR != VK_SUCCESS)) \
{ \
context->handleError(ANGLE_LOCAL_VAR, __FILE__, ANGLE_FUNCTION, __LINE__); \
return angle::Result::Stop; \
} \
// Evaluate a Vulkan |command| exactly once; on any result other than
// VK_SUCCESS, report the error through the context and stop.  Both macro
// parameters are parenthesized so that arguments containing operators (or
// commas inside parentheses) expand correctly.
#define ANGLE_VK_TRY(context, command)                                                   \
    do                                                                                   \
    {                                                                                    \
        auto ANGLE_LOCAL_VAR = (command);                                                \
        if (ANGLE_UNLIKELY(ANGLE_LOCAL_VAR != VK_SUCCESS))                               \
        {                                                                                \
            (context)->handleError(ANGLE_LOCAL_VAR, __FILE__, ANGLE_FUNCTION, __LINE__); \
            return angle::Result::Stop;                                                  \
        }                                                                                \
    } while (0)
#define ANGLE_VK_CHECK(context, test, error) ANGLE_VK_TRY(context, test ? VK_SUCCESS : error)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment