Commit 3f0c4a56 by Jamie Madill Committed by Commit Bot

Vulkan: Faster state transitions.

Implements a transition table from Pipeline Cache entry to state change neighbouring Pipeline Cache entries. We use a 64-bit mask to do a quick scan over the pipeline desc. This ends up being a lot faster than doing a full hash and memcmp over the pipeline description. Note that there could be future optimizations to this design. We might keep a hash map of the pipeline transitions instead of a list. Or use a sorted list. This could speed up the search when there are many transitions for cache entries. Also we could skip the transition table and opt to do a full hash when there are more than a configurable number of dirty states. This might be a bit faster in some cases. Likely this will be something we can add performance tests for in the future. Documentation is also added in a README file for the Vulkan back end. This will be extended over time. Improves performance about 30-35% on the VBO state change test. Bug: angleproject:3013 Change-Id: I793f9e3efd8887acf00ad60e4ac2502a54c95dee Reviewed-on: https://chromium-review.googlesource.com/c/1369287 Commit-Queue: Jamie Madill <jmadill@chromium.org> Reviewed-by: Yuly Novikov <ynovikov@chromium.org>
parent 80766cfa
......@@ -1271,6 +1271,11 @@ inline uint16_t RotR16(uint16_t x, int8_t r)
# define ANGLE_ROTR16(x, y) ::rx::RotR16(x, y)
#endif // namespace rx
// Returns floor(log2(bytes)): 0 for 1, 1 for 2..3, 2 for 4..7, and so on.
// Kept as a single return expression so it stays usable as a C++11 constexpr function.
// An input of zero yields zero: halving zero never reaches one, so without the
// "<= 1" guard the recursion would never terminate.
constexpr unsigned int Log2(unsigned int bytes)
{
    return bytes <= 1 ? 0 : (1 + Log2(bytes / 2));
}
} // namespace rx
#endif // COMMON_MATHUTIL_H_
......@@ -40,16 +40,11 @@ bool CheckedMathResult(const CheckedNumeric<GLuint> &value, GLuint *resultOut)
}
}
// Returns floor(log2(bytes)): 0 for 1, 1 for 2..3, 2 for 4..7, and so on.
// Kept as a single return expression so it stays usable as a C++11 constexpr function.
// An input of zero yields zero: halving zero never reaches one, so without the
// "<= 1" guard the recursion would never terminate.
constexpr GLuint Log2(GLuint bytes)
{
    return bytes <= 1 ? 0 : (1 + Log2(bytes / 2));
}
// Packs type information into one 32-bit value: the byte size OR'd with Log2 of the
// byte size shifted left by 8 and the `specialized` flag shifted left by 16.
constexpr uint32_t PackTypeInfo(GLuint bytes, bool specialized)
{
// static_assert within constexpr requires c++17
// static_assert(isPow2(bytes));
// NOTE(review): the two returns below look like the before/after lines of a diff hunk
// (unqualified Log2 vs. rx::Log2); as written only the first is reachable -- confirm
// which one is meant to remain.
return bytes | (Log2(bytes) << 8) | (specialized << 16);
return bytes | (rx::Log2(bytes) << 8) | (specialized << 16);
}
} // anonymous namespace
......
......@@ -179,7 +179,8 @@ class ContextVk : public ContextImpl, public vk::Context
GLuint relativeOffset)
{
invalidateVertexAndIndexBuffers();
mGraphicsPipelineDesc->updateVertexInput(static_cast<uint32_t>(attribIndex), stride,
mGraphicsPipelineDesc->updateVertexInput(&mGraphicsPipelineTransition,
static_cast<uint32_t>(attribIndex), stride,
divisor, format, relativeOffset);
}
......@@ -270,7 +271,6 @@ class ContextVk : public ContextImpl, public vk::Context
mDirtyBits.set(DIRTY_BIT_PIPELINE);
mDirtyBits.set(DIRTY_BIT_VIEWPORT);
mDirtyBits.set(DIRTY_BIT_SCISSOR);
mCurrentPipeline = nullptr;
}
void invalidateCurrentTextures();
......@@ -291,12 +291,13 @@ class ContextVk : public ContextImpl, public vk::Context
angle::Result handleDirtyViewport(const gl::Context *context, vk::CommandBuffer *commandBuffer);
angle::Result handleDirtyScissor(const gl::Context *context, vk::CommandBuffer *commandBuffer);
vk::PipelineAndSerial *mCurrentPipeline;
vk::PipelineHelper *mCurrentPipeline;
gl::PrimitiveMode mCurrentDrawMode;
// Keep a cached pipeline description structure that can be used to query the pipeline cache.
// Kept in a pointer so allocations can be aligned, and structs can be portably packed.
std::unique_ptr<vk::GraphicsPipelineDesc> mGraphicsPipelineDesc;
vk::GraphicsPipelineTransitionBits mGraphicsPipelineTransition;
// The descriptor pools are externally synchronized, so cannot be accessed from different
// threads simultaneously. Hence, we keep them in the ContextVk instead of the RendererVk.
......
......@@ -123,7 +123,8 @@ class ProgramVk : public ProgramImpl
gl::PrimitiveMode mode,
const vk::GraphicsPipelineDesc &desc,
const gl::AttributesMask &activeAttribLocations,
vk::PipelineAndSerial **pipelineOut)
const vk::GraphicsPipelineDesc **descPtrOut,
vk::PipelineHelper **pipelineOut)
{
vk::ShaderProgramHelper *shaderProgram;
ANGLE_TRY(initShaders(contextVk, mode, &shaderProgram));
......@@ -132,7 +133,7 @@ class ProgramVk : public ProgramImpl
return shaderProgram->getGraphicsPipeline(
contextVk, &renderer->getRenderPassCache(), renderer->getPipelineCache(),
renderer->getCurrentQueueSerial(), mPipelineLayout.get(), desc, activeAttribLocations,
pipelineOut);
descPtrOut, pipelineOut);
}
private:
......
# ANGLE: Vulkan Back-end
ANGLE's Vulkan back-end implementation lives in this folder.
[Vulkan](https://www.khronos.org/vulkan/) is an explicit graphics API. It has a lot in common with
other explicit APIs such as Microsoft's
[D3D12](https://docs.microsoft.com/en-us/windows/desktop/direct3d12/directx-12-programming-guide)
and Apple's [Metal](https://developer.apple.com/metal/). Compared to APIs like OpenGL or D3D11,
explicit APIs can offer a number of significant benefits:
* Lower API call CPU overhead.
* A smaller API surface with more direct hardware control.
* Better support for multi-core programming.
* Vulkan in particular has open-source tooling and tests.
## Back-end Design
The [RendererVk](RendererVk.cpp) is a singleton. RendererVk owns shared global resources like the
[VkDevice](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkDevice.html),
[VkQueue](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkQueue.html), the
[Vulkan format tables](vk_format_utils.h) and [internal Vulkan shaders](shaders). The back-end
creates a new [ContextVk](ContextVk.cpp) instance to manage each allocated OpenGL Context. ContextVk
processes state changes and handles action commands like `glDrawArrays` and `glDrawElements`.
### Fast OpenGL State Transitions
Typical OpenGL programs issue a few small state change commands between draw call commands. We want
the typical app's use case to be as fast as possible so this leads to unique performance challenges.
Vulkan is quite different from OpenGL because it requires a separate compiled
[VkPipeline](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkPipeline.html)
for each state vector. Compiling VkPipelines is multiple orders of magnitude slower than enabling or
disabling an OpenGL render state. To speed this up we use three levels of caching when transitioning
states in the Vulkan back-end.
The first level is the driver's
[VkPipelineCache](https://www.khronos.org/registry/vulkan/specs/1.1-extensions/man/html/VkPipelineCache.html). The driver cache reduces pipeline recompilation time
significantly. But even cached pipeline recompilations are orders of magnitude slower than OpenGL
state changes.
The second level cache is an ANGLE-owned hash map from OpenGL state vectors to compiled pipelines.
See
[GraphicsPipelineCache](https://chromium.googlesource.com/angle/angle/+/225f08bf85a368f905362cdd1366e4795680452c/src/libANGLE/renderer/vulkan/vk_cache_utils.h#498)
in [vk_cache_utils.h](vk_cache_utils.h). ANGLE's
[GraphicsPipelineDesc](https://chromium.googlesource.com/angle/angle/+/225f08bf85a368f905362cdd1366e4795680452c/src/libANGLE/renderer/vulkan/vk_cache_utils.h#244)
class is a tightly packed 256-byte description of the current OpenGL rendering state. We
also use [xxHash](https://github.com/Cyan4973/xxHash) for the fastest possible hash computation.
The hash map speeds up state changes considerably. But it is still significantly slower than OpenGL
implementations.
To get best performance we use a transition table from each OpenGL state vector to neighbouring
state vectors. The transition table points from GraphicsPipelineCache entries directly to
neighbouring VkPipeline objects. When the application changes state the state change bits are
recorded into a compact bit mask that covers the GraphicsPipelineDesc state vector. Then on the next
draw call we scan the transition bit mask and compare the GraphicsPipelineDesc of the current state
vector and the state vector of the cached transition. With the hash map we compute a hash over the
entire state vector and then do a 256-byte `memcmp` to guard against hash collisions. With the
transition table we will only compare as many bytes as were changed in the transition bit mask. By
skipping the expensive hashing and `memcmp` we can get performance as good as or better than native
OpenGL drivers.
Note that the current design of the transition table stores transitions in an unsorted list. If
applications map from one state to many this will slow down the transition time. This could be
improved in the future using a faster look up. For instance we could keep a sorted transition table
or use a small hash map for transitions.
......@@ -325,25 +325,30 @@ angle::Result UtilsVk::setupProgram(vk::Context *context,
Serial serial = renderer->getCurrentQueueSerial();
vk::PipelineAndSerial *pipelineAndSerial;
if (isCompute)
{
vk::PipelineAndSerial *pipelineAndSerial;
program->setShader(gl::ShaderType::Compute, fsCsShader);
ANGLE_TRY(program->getComputePipeline(context, pipelineLayout.get(), &pipelineAndSerial));
pipelineAndSerial->updateSerial(serial);
commandBuffer->bindPipeline(bindPoint, pipelineAndSerial->get());
}
else
{
program->setShader(gl::ShaderType::Vertex, vsShader);
program->setShader(gl::ShaderType::Fragment, fsCsShader);
// This value is not used but is passed to getGraphicsPipeline to avoid a nullptr check.
const vk::GraphicsPipelineDesc *descPtr;
vk::PipelineHelper *helper;
ANGLE_TRY(program->getGraphicsPipeline(
context, &renderer->getRenderPassCache(), renderer->getPipelineCache(), serial,
pipelineLayout.get(), *pipelineDesc, gl::AttributesMask(), &pipelineAndSerial));
pipelineLayout.get(), *pipelineDesc, gl::AttributesMask(), &descPtr, &helper));
helper->updateSerial(serial);
commandBuffer->bindPipeline(bindPoint, helper->getPipeline());
}
commandBuffer->bindPipeline(bindPoint, pipelineAndSerial->get());
pipelineAndSerial->updateSerial(serial);
if (descriptorSet != VK_NULL_HANDLE)
{
commandBuffer->bindDescriptorSets(bindPoint, pipelineLayout.get(), 0, 1, &descriptorSet, 0,
......@@ -619,8 +624,8 @@ angle::Result UtilsVk::clearImage(ContextVk *contextVk,
vk::GraphicsPipelineDesc pipelineDesc;
pipelineDesc.initDefaults();
pipelineDesc.updateColorWriteMask(params.colorMaskFlags, *params.alphaMask);
pipelineDesc.updateRenderPassDesc(*params.renderPassDesc);
pipelineDesc.setColorWriteMask(params.colorMaskFlags, *params.alphaMask);
pipelineDesc.setRenderPassDesc(*params.renderPassDesc);
vk::ShaderLibrary &shaderLibrary = renderer->getShaderLibrary();
vk::RefCounted<vk::ShaderAndSerial> *vertexShader = nullptr;
......@@ -707,7 +712,7 @@ angle::Result UtilsVk::copyImage(vk::Context *context,
vk::GraphicsPipelineDesc pipelineDesc;
pipelineDesc.initDefaults();
pipelineDesc.updateRenderPassDesc(renderPassDesc);
pipelineDesc.setRenderPassDesc(renderPassDesc);
gl::Rectangle renderArea;
renderArea.x = params.destOffset[0];
......
......@@ -641,7 +641,8 @@ class ShaderProgramHelper : angle::NonCopyable
const PipelineLayout &pipelineLayout,
const GraphicsPipelineDesc &pipelineDesc,
const gl::AttributesMask &activeAttribLocationsMask,
PipelineAndSerial **pipelineOut)
const vk::GraphicsPipelineDesc **descPtrOut,
PipelineHelper **pipelineOut)
{
// Pull in a compatible RenderPass.
vk::RenderPass *compatibleRenderPass = nullptr;
......@@ -651,7 +652,7 @@ class ShaderProgramHelper : angle::NonCopyable
return mGraphicsPipelines.getPipeline(
context, pipelineCache, *compatibleRenderPass, pipelineLayout,
activeAttribLocationsMask, mShaders[gl::ShaderType::Vertex].get().get(),
mShaders[gl::ShaderType::Fragment].get().get(), pipelineDesc, pipelineOut);
mShaders[gl::ShaderType::Fragment].get().get(), pipelineDesc, descPtrOut, pipelineOut);
}
angle::Result getComputePipeline(Context *context,
......@@ -661,6 +662,8 @@ class ShaderProgramHelper : angle::NonCopyable
private:
gl::ShaderMap<BindingPointer<ShaderAndSerial>> mShaders;
GraphicsPipelineCache mGraphicsPipelines;
// We should probably use PipelineHelper here so we can remove PipelineAndSerial.
PipelineAndSerial mComputePipeline;
};
} // namespace vk
......
......@@ -83,14 +83,15 @@ void VulkanPipelineCachePerfTest::step()
vk::PipelineLayout pl;
vk::PipelineCache pc;
vk::ShaderModule sm;
vk::PipelineAndSerial *result = nullptr;
const vk::GraphicsPipelineDesc *desc = nullptr;
vk::PipelineHelper *result = nullptr;
gl::AttributesMask am;
for (unsigned int iteration = 0; iteration < kIterationsPerStep; ++iteration)
{
for (const auto &hit : mCacheHits)
{
(void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, hit, &result);
(void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, hit, &desc, &result);
}
}
......@@ -98,7 +99,7 @@ void VulkanPipelineCachePerfTest::step()
++missCount, ++mMissIndex)
{
const auto &miss = mCacheMisses[mMissIndex];
(void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, miss, &result);
(void)mCache.getPipeline(VK_NULL_HANDLE, pc, rp, pl, am, sm, sm, miss, &desc, &result);
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment