Commit 745e0712 by Mohan Maiya Committed by Commit Bot

Vulkan: Enable CPU only buffers for PBOs

Add support for a CPU only buffer for PBOs that serves as the destination for all host operations like MapBuffer*. This removes the latency caused by waiting for the in-flight GPU commands to complete before handing over the buffer to the app. This change removes a ~6ms wait/sleep on the first call to MapBuffer* in each frame of Manhattan. Bug: angleproject:4339 Tests: angle_end2end_tests --gtest_filter=BufferDataTest*Vulkan Change-Id: I52016b160af8a670cc30f01c05e48f699521310f Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2116874 Commit-Queue: Mohan Maiya <m.maiya@samsung.com> Reviewed-by: Tobin Ehlis <tobine@google.com>
parent dcd98298
...@@ -100,13 +100,13 @@ class BufferVk : public BufferImpl ...@@ -100,13 +100,13 @@ class BufferVk : public BufferImpl
VkDeviceSize length, VkDeviceSize length,
GLbitfield access, GLbitfield access,
void **mapPtr); void **mapPtr);
void unmapImpl(ContextVk *contextVk); angle::Result unmapImpl(ContextVk *contextVk);
// Calls copyBuffer internally. // Calls copyBuffer internally.
angle::Result copyToBuffer(ContextVk *contextVk, angle::Result copyToBufferImpl(ContextVk *contextVk,
vk::BufferHelper *destBuffer, vk::BufferHelper *destBuffer,
uint32_t copyCount, uint32_t copyCount,
const VkBufferCopy *copies); const VkBufferCopy *copies);
ConversionBuffer *getVertexConversionBuffer(RendererVk *renderer, ConversionBuffer *getVertexConversionBuffer(RendererVk *renderer,
angle::FormatID formatID, angle::FormatID formatID,
...@@ -116,6 +116,29 @@ class BufferVk : public BufferImpl ...@@ -116,6 +116,29 @@ class BufferVk : public BufferImpl
private: private:
void initializeStagingBuffer(ContextVk *contextVk, gl::BufferBinding target, size_t size); void initializeStagingBuffer(ContextVk *contextVk, gl::BufferBinding target, size_t size);
angle::Result initializeShadowBuffer(ContextVk *contextVk,
gl::BufferBinding target,
size_t size);
// Returns a writable pointer located `offset` bytes past the start of the
// storage currently held by mShadowBuffer.
ANGLE_INLINE uint8_t *getShadowBuffer(size_t offset)
{
    uint8_t *shadowBase = mShadowBuffer.getCurrentBuffer();
    return shadowBase + offset;
}
// Read-only overload: returns a pointer located `offset` bytes past the start
// of the storage currently held by mShadowBuffer.
ANGLE_INLINE const uint8_t *getShadowBuffer(size_t offset) const
{
    const uint8_t *shadowBase = mShadowBuffer.getCurrentBuffer();
    return shadowBase + offset;
}
void updateShadowBuffer(const uint8_t *data, size_t size, size_t offset);
angle::Result directUpdate(ContextVk *contextVk,
const uint8_t *data,
size_t size,
size_t offset);
angle::Result stagedUpdate(ContextVk *contextVk,
const uint8_t *data,
size_t size,
size_t offset);
angle::Result setDataImpl(ContextVk *contextVk, angle::Result setDataImpl(ContextVk *contextVk,
const uint8_t *data, const uint8_t *data,
size_t size, size_t size,
...@@ -145,6 +168,13 @@ class BufferVk : public BufferImpl ...@@ -145,6 +168,13 @@ class BufferVk : public BufferImpl
// All staging buffer support is provided by a DynamicBuffer. // All staging buffer support is provided by a DynamicBuffer.
vk::DynamicBuffer mStagingBuffer; vk::DynamicBuffer mStagingBuffer;
// For GPU-read only buffers glMap* latency is reduced by maintaining a copy
// of the buffer which is writeable only by the CPU. The contents are updated on all
// glData/glSubData/glCopy calls. With this, a glMap* call becomes a non-blocking
// operation by eliminating the need to wait on any recorded or in-flight GPU commands.
// We use DynamicShadowBuffer class to encapsulate all the bookkeeping logic.
vk::DynamicShadowBuffer mShadowBuffer;
// A cache of converted vertex data. // A cache of converted vertex data.
std::vector<VertexConversionBuffer> mVertexConversionBuffers; std::vector<VertexConversionBuffer> mVertexConversionBuffers;
}; };
......
...@@ -1000,7 +1000,7 @@ angle::Result ContextVk::setupIndexedDraw(const gl::Context *context, ...@@ -1000,7 +1000,7 @@ angle::Result ContextVk::setupIndexedDraw(const gl::Context *context,
const size_t byteCount = static_cast<size_t>(elementArrayBuffer->getSize()) - const size_t byteCount = static_cast<size_t>(elementArrayBuffer->getSize()) -
reinterpret_cast<uintptr_t>(indices); reinterpret_cast<uintptr_t>(indices);
ANGLE_TRY(mVertexArray->convertIndexBufferCPU(this, indexType, byteCount, src)); ANGLE_TRY(mVertexArray->convertIndexBufferCPU(this, indexType, byteCount, src));
bufferVk->unmapImpl(this); ANGLE_TRY(bufferVk->unmapImpl(this));
} }
else else
{ {
......
...@@ -31,6 +31,25 @@ angle::Result Resource::finishRunningCommands(ContextVk *contextVk) ...@@ -31,6 +31,25 @@ angle::Result Resource::finishRunningCommands(ContextVk *contextVk)
return contextVk->finishToSerial(mUse.getSerial()); return contextVk->finishToSerial(mUse.getSerial());
} }
// Completes all recorded and in-flight commands that involve this resource.
// Returns early with the failing result if either the flush or the finish fails.
angle::Result Resource::waitForIdle(ContextVk *contextVk)
{
    // Work that has been recorded but not yet handed off has to be flushed
    // before there is anything to wait on.
    const bool hasRecordedWork = usedInRecordedCommands();
    if (hasRecordedWork)
    {
        ANGLE_TRY(contextVk->flushImpl(nullptr));
    }

    // The completed serial is queried only after the flush above, so the check
    // also covers the commands that were just flushed.
    const bool hasRunningWork = usedInRunningCommands(contextVk->getLastCompletedQueueSerial());
    if (hasRunningWork)
    {
        ANGLE_TRY(finishRunningCommands(contextVk));
    }

    // At this point nothing should reference the resource any more.
    ASSERT(!isCurrentlyInUse(contextVk->getLastCompletedQueueSerial()));

    return angle::Result::Continue;
}
// SharedGarbage implementation. // SharedGarbage implementation.
SharedGarbage::SharedGarbage() = default; SharedGarbage::SharedGarbage() = default;
......
...@@ -179,6 +179,9 @@ class Resource : angle::NonCopyable ...@@ -179,6 +179,9 @@ class Resource : angle::NonCopyable
// Ensures the driver is caught up to this resource and it is only in use by ANGLE. // Ensures the driver is caught up to this resource and it is only in use by ANGLE.
angle::Result finishRunningCommands(ContextVk *contextVk); angle::Result finishRunningCommands(ContextVk *contextVk);
// Complete all recorded and in-flight commands involving this resource
angle::Result waitForIdle(ContextVk *contextVk);
// Adds the resource to a resource use list. // Adds the resource to a resource use list.
void retain(ResourceUseList *resourceUseList); void retain(ResourceUseList *resourceUseList);
......
...@@ -283,7 +283,7 @@ angle::Result TextureVk::setSubImageImpl(const gl::Context *context, ...@@ -283,7 +283,7 @@ angle::Result TextureVk::setSubImageImpl(const gl::Context *context,
gl::Offset(area.x, area.y, area.z), formatInfo, unpack, type, source, vkFormat, gl::Offset(area.x, area.y, area.z), formatInfo, unpack, type, source, vkFormat,
inputRowPitch, inputDepthPitch, inputSkipBytes)); inputRowPitch, inputDepthPitch, inputSkipBytes));
unpackBufferVk->unmapImpl(contextVk); ANGLE_TRY(unpackBufferVk->unmapImpl(contextVk));
} }
} }
else if (pixels) else if (pixels)
......
...@@ -425,7 +425,7 @@ angle::Result VertexArrayVk::convertVertexBufferCPU(ContextVk *contextVk, ...@@ -425,7 +425,7 @@ angle::Result VertexArrayVk::convertVertexBufferCPU(ContextVk *contextVk,
0, numVertices, binding.getStride(), srcFormatSize, 0, numVertices, binding.getStride(), srcFormatSize,
vertexFormat.vertexLoadFunction, &mCurrentArrayBuffers[attribIndex], vertexFormat.vertexLoadFunction, &mCurrentArrayBuffers[attribIndex],
&conversion->lastAllocationOffset, 1)); &conversion->lastAllocationOffset, 1));
srcBuffer->unmapImpl(contextVk); ANGLE_TRY(srcBuffer->unmapImpl(contextVk));
ASSERT(conversion->dirty); ASSERT(conversion->dirty);
conversion->dirty = false; conversion->dirty = false;
...@@ -757,7 +757,7 @@ angle::Result VertexArrayVk::updateStreamedAttribs(const gl::Context *context, ...@@ -757,7 +757,7 @@ angle::Result VertexArrayVk::updateStreamedAttribs(const gl::Context *context,
&mCurrentArrayBufferOffsets[attribIndex], divisor)); &mCurrentArrayBufferOffsets[attribIndex], divisor));
if (bufferVk) if (bufferVk)
{ {
bufferVk->unmapImpl(contextVk); ANGLE_TRY(bufferVk->unmapImpl(contextVk));
} }
} }
else else
......
...@@ -744,6 +744,69 @@ void DynamicBuffer::reset() ...@@ -744,6 +744,69 @@ void DynamicBuffer::reset()
mLastFlushOrInvalidateOffset = 0; mLastFlushOrInvalidateOffset = 0;
} }
// DynamicShadowBuffer implementation.
// A default-constructed shadow buffer has both size fields zeroed, so valid()
// reports false until allocate() succeeds.
DynamicShadowBuffer::DynamicShadowBuffer()
    : mInitialSize(0),
      mSize(0)
{}
// Move constructor: takes over the size bookkeeping and the storage of `other`.
DynamicShadowBuffer::DynamicShadowBuffer(DynamicShadowBuffer &&other)
    : mInitialSize(other.mInitialSize), mSize(other.mSize), mBuffer(std::move(other.mBuffer))
{
    // The storage has been moved out of `other`, so it must stop reporting
    // itself as valid(); otherwise a later getCurrentBuffer()/map() on it would
    // operate on a buffer it no longer owns. This presumes the MemoryBuffer move
    // leaves `other.mBuffer` empty, which the destructor's ASSERT already
    // expects of a moved-from object.
    other.mSize = 0;
}
void DynamicShadowBuffer::init(size_t initialSize)
{
mInitialSize = initialSize;
}
// The destructor does not free anything itself. It only checks that the storage
// has already been emptied (release() and destroy() both do that) by the time
// the object goes away.
DynamicShadowBuffer::~DynamicShadowBuffer()
{
ASSERT(mBuffer.empty());
}
// (Re)allocates the CPU storage. The resulting size is the larger of the size
// given to init() and `sizeInBytes`. Any previous storage is dropped first, so
// old contents are not carried over.
// Returns angle::Result::Stop, with the buffer released, if any resize fails.
angle::Result DynamicShadowBuffer::allocate(size_t sizeInBytes)
{
    // Drop the current buffer, if there is one.
    bool succeeded = mBuffer.empty() ? true : mBuffer.resize(0);

    // Remember the size being allocated; release() clears it again on failure.
    mSize = std::max(mInitialSize, sizeInBytes);

    // The new allocation is attempted regardless of how the step above went.
    if (!mBuffer.resize(mSize))
    {
        succeeded = false;
    }

    if (succeeded)
    {
        return angle::Result::Continue;
    }

    // Something failed: leave the object empty and inactive.
    release();
    return angle::Result::Stop;
}
// Marks the shadow buffer inactive and drops its storage, if it holds any.
void DynamicShadowBuffer::release()
{
    // Clear the tracked size first so valid() reports false from here on.
    reset();

    if (mBuffer.empty())
    {
        return;
    }

    // Resizing to zero drops the storage; the result is deliberately ignored.
    (void)mBuffer.resize(0);
}
// Frees the storage by forwarding to release(). `device` is not used by this
// implementation.
void DynamicShadowBuffer::destroy(VkDevice device)
{
    (void)device;
    release();
}
void DynamicShadowBuffer::reset()
{
mSize = 0;
}
// DescriptorPoolHelper implementation. // DescriptorPoolHelper implementation.
DescriptorPoolHelper::DescriptorPoolHelper() : mFreeDescriptorSets(0) {} DescriptorPoolHelper::DescriptorPoolHelper() : mFreeDescriptorSets(0) {}
...@@ -1371,7 +1434,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con ...@@ -1371,7 +1434,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con
ANGLE_TRY(streamIndices(contextVk, glIndexType, indexCount, ANGLE_TRY(streamIndices(contextVk, glIndexType, indexCount,
static_cast<const uint8_t *>(srcDataMapping) + elementArrayOffset, static_cast<const uint8_t *>(srcDataMapping) + elementArrayOffset,
bufferOut, bufferOffsetOut, indexCountOut)); bufferOut, bufferOffsetOut, indexCountOut));
elementArrayBufferVk->unmapImpl(contextVk); ANGLE_TRY(elementArrayBufferVk->unmapImpl(contextVk));
return angle::Result::Continue; return angle::Result::Continue;
} }
...@@ -1396,7 +1459,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con ...@@ -1396,7 +1459,7 @@ angle::Result LineLoopHelper::getIndexBufferForElementArrayBuffer(ContextVk *con
if (contextVk->getRenderer()->getFeatures().extraCopyBufferRegion.enabled) if (contextVk->getRenderer()->getFeatures().extraCopyBufferRegion.enabled)
copies.push_back({sourceOffset, *bufferOffsetOut + (unitCount + 1) * unitSize, 1}); copies.push_back({sourceOffset, *bufferOffsetOut + (unitCount + 1) * unitSize, 1});
ANGLE_TRY(elementArrayBufferVk->copyToBuffer( ANGLE_TRY(elementArrayBufferVk->copyToBufferImpl(
contextVk, *bufferOut, static_cast<uint32_t>(copies.size()), copies.data())); contextVk, *bufferOut, static_cast<uint32_t>(copies.size()), copies.data()));
ANGLE_TRY(mDynamicIndexBuffer.flush(contextVk)); ANGLE_TRY(mDynamicIndexBuffer.flush(contextVk));
return angle::Result::Continue; return angle::Result::Continue;
...@@ -3594,7 +3657,7 @@ angle::Result ImageHelper::readPixels(ContextVk *contextVk, ...@@ -3594,7 +3657,7 @@ angle::Result ImageHelper::readPixels(ContextVk *contextVk,
uint8_t *dest = static_cast<uint8_t *>(mapPtr) + reinterpret_cast<ptrdiff_t>(pixels); uint8_t *dest = static_cast<uint8_t *>(mapPtr) + reinterpret_cast<ptrdiff_t>(pixels);
PackPixels(packPixelsParams, *readFormat, area.width * readFormat->pixelBytes, PackPixels(packPixelsParams, *readFormat, area.width * readFormat->pixelBytes,
readPixelBuffer, static_cast<uint8_t *>(dest)); readPixelBuffer, static_cast<uint8_t *>(dest));
packBufferVk->unmapImpl(contextVk); ANGLE_TRY(packBufferVk->unmapImpl(contextVk));
} }
else else
{ {
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#ifndef LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_ #ifndef LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_
#define LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_ #define LIBANGLE_RENDERER_VULKAN_VK_HELPERS_H_
#include "common/MemoryBuffer.h"
#include "libANGLE/renderer/vulkan/ResourceVk.h" #include "libANGLE/renderer/vulkan/ResourceVk.h"
#include "libANGLE/renderer/vulkan/vk_cache_utils.h" #include "libANGLE/renderer/vulkan/vk_cache_utils.h"
...@@ -121,6 +122,77 @@ class DynamicBuffer : angle::NonCopyable ...@@ -121,6 +122,77 @@ class DynamicBuffer : angle::NonCopyable
std::vector<BufferHelper *> mBufferFreeList; std::vector<BufferHelper *> mBufferFreeList;
}; };
// Based off of the DynamicBuffer class, DynamicShadowBuffer provides
// a similar conceptually infinitely long buffer that will only be written
// to and read by the CPU. This can be used to provide CPU cached copies of
// GPU-read only buffers. The value add here is that when an app requests
// CPU access to a buffer we can fulfill such a request in O(1) time since
// we don't need to wait for GPU to be done with in-flight commands.
//
// The hidden cost here is that any operation that updates a buffer, either
// through a buffer sub data update or a buffer-to-buffer copy, will have an
// additional overhead of having to update its CPU only buffer.
class DynamicShadowBuffer : public angle::NonCopyable
{
  public:
    DynamicShadowBuffer();
    DynamicShadowBuffer(DynamicShadowBuffer &&other);
    ~DynamicShadowBuffer();

    // Initialize the DynamicShadowBuffer with the minimum size that allocate() will use.
    void init(size_t initialSize);

    // Returns whether this DynamicShadowBuffer is active, i.e. its tracked size is non-zero.
    ANGLE_INLINE bool valid() const { return (mSize != 0); }

    // This call will actually allocate a new CPU only memory from the heap.
    // The size can be different than the one specified during `init`.
    angle::Result allocate(size_t sizeInBytes);

    // Copies `size` bytes from `data` into the buffer, starting `offset` bytes in.
    // The destination range must lie within the allocated size.
    ANGLE_INLINE void updateData(const uint8_t *data, size_t size, size_t offset)
    {
        ASSERT(!mBuffer.empty());
        // Written as two comparisons so that `offset + size` cannot wrap around.
        ASSERT(offset <= mSize && size <= mSize - offset);
        // Memcopy data into the buffer
        memcpy((mBuffer.data() + offset), data, size);
    }

    // Map the CPU only buffer: stores in `*mapPtr` a pointer `offset` bytes into the
    // storage. No copy or synchronization takes place.
    ANGLE_INLINE void map(size_t offset, void **mapPtr)
    {
        ASSERT(mapPtr);
        ASSERT(!mBuffer.empty());
        ASSERT(offset <= mSize);
        *mapPtr = mBuffer.data() + offset;
    }

    // Unmap the CPU only buffer, NOOP for now
    ANGLE_INLINE void unmap() {}

    // Marks the buffer inactive and drops its CPU storage.
    void release();

    // Frees resources immediately. `device` is part of the signature only; the
    // implementation forwards to release().
    void destroy(VkDevice device);

    // Returns the start of the CPU storage. The buffer must have been allocated.
    ANGLE_INLINE uint8_t *getCurrentBuffer()
    {
        ASSERT(!mBuffer.empty());
        return mBuffer.data();
    }

    ANGLE_INLINE const uint8_t *getCurrentBuffer() const
    {
        ASSERT(!mBuffer.empty());
        return mBuffer.data();
    }

  private:
    // Clears the tracked size without touching the storage.
    void reset();

    // Minimum allocation size, set by init().
    size_t mInitialSize;
    // Size of the current allocation; zero while inactive.
    size_t mSize;
    // The CPU only storage.
    angle::MemoryBuffer mBuffer;
};
// Uses DescriptorPool to allocate descriptor sets as needed. If a descriptor pool becomes full, we // Uses DescriptorPool to allocate descriptor sets as needed. If a descriptor pool becomes full, we
// allocate new pools internally as needed. RendererVk takes care of the lifetime of the discarded // allocate new pools internally as needed. RendererVk takes care of the lifetime of the discarded
// pools. Note that we used a fixed layout for descriptor pools in ANGLE. Uniform buffers must // pools. Note that we used a fixed layout for descriptor pools in ANGLE. Uniform buffers must
......
...@@ -411,6 +411,166 @@ TEST_P(BufferDataTestES3, BufferResizing) ...@@ -411,6 +411,166 @@ TEST_P(BufferDataTestES3, BufferResizing)
EXPECT_GL_NO_ERROR(); EXPECT_GL_NO_ERROR();
} }
// Test to verify that mapping a buffer for reading, right after copying into it with
// glCopyBufferSubData, exposes the copied (flushed/updated) data.
TEST_P(BufferDataTestES3, CopyBufferSubDataMapReadTest)
{
    const char simpleVertex[]   = R"(attribute vec2 position;
attribute vec4 color;
varying vec4 vColor;
void main()
{
gl_Position = vec4(position, 0, 1);
vColor = color;
}
)";
    const char simpleFragment[] = R"(precision mediump float;
varying vec4 vColor;
void main()
{
gl_FragColor = vColor;
}
)";

    const uint32_t numComponents = 3;
    const uint32_t width         = 4;
    const uint32_t height        = 4;
    const size_t numElements     = width * height * numComponents;

    // Fill through the vector constructors. A uint8_t loop counter compared against
    // size() would never terminate once the element count reaches 256.
    std::vector<uint8_t> srcData(numElements, 128);
    std::vector<uint8_t> dstData(numElements, 0);

    GLBuffer srcBuffer;
    GLBuffer dstBuffer;

    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glBufferData(GL_ARRAY_BUFFER, srcData.size(), srcData.data(), GL_STATIC_DRAW);
    ASSERT_GL_NO_ERROR();

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, dstBuffer);
    glBufferData(GL_PIXEL_UNPACK_BUFFER, dstData.size(), dstData.data(), GL_STATIC_READ);
    ASSERT_GL_NO_ERROR();

    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
    glUseProgram(program);

    GLint colorLoc = glGetAttribLocation(program, "color");
    ASSERT_NE(-1, colorLoc);

    // Draw with the source buffer as a vertex attribute before the copy is issued.
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    glEnableVertexAttribArray(colorLoc);

    drawQuad(program, "position", 0.5f, 1.0f, true);
    ASSERT_GL_NO_ERROR();

    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_PIXEL_UNPACK_BUFFER, 0, 0, numElements);

    // With GL_MAP_READ_BIT, we expect the data to be flushed and updated to match srcData
    uint8_t *data = reinterpret_cast<uint8_t *>(
        glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_READ_BIT));
    // Abort instead of dereferencing a null pointer if the map failed.
    ASSERT_GL_NO_ERROR();
    ASSERT_NE(nullptr, data);
    for (size_t i = 0; i < numElements; ++i)
    {
        EXPECT_EQ(srcData[i], data[i]);
    }
    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
    EXPECT_GL_NO_ERROR();
}
// Test to verify that data written through a GL_MAP_UNSYNCHRONIZED_BIT mapping, after
// a copy into the buffer has completed, is what a later read mapping returns.
TEST_P(BufferDataTestES3, MapBufferUnsynchronizedReadTest)
{
    const char simpleVertex[]   = R"(attribute vec2 position;
attribute vec4 color;
varying vec4 vColor;
void main()
{
gl_Position = vec4(position, 0, 1);
vColor = color;
}
)";
    const char simpleFragment[] = R"(precision mediump float;
varying vec4 vColor;
void main()
{
gl_FragColor = vColor;
}
)";

    const uint32_t numComponents = 3;
    const uint32_t width         = 4;
    const uint32_t height        = 4;
    const size_t numElements     = width * height * numComponents;

    // Fill through the vector constructors. A uint8_t loop counter compared against
    // size() would never terminate once the element count reaches 256.
    std::vector<uint8_t> srcData(numElements, 128);
    std::vector<uint8_t> dstData(numElements, 0);

    GLBuffer srcBuffer;
    GLBuffer dstBuffer;

    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glBufferData(GL_ARRAY_BUFFER, srcData.size(), srcData.data(), GL_STATIC_DRAW);
    ASSERT_GL_NO_ERROR();

    glBindBuffer(GL_PIXEL_UNPACK_BUFFER, dstBuffer);
    glBufferData(GL_PIXEL_UNPACK_BUFFER, dstData.size(), dstData.data(), GL_STATIC_READ);
    ASSERT_GL_NO_ERROR();

    ANGLE_GL_PROGRAM(program, simpleVertex, simpleFragment);
    glUseProgram(program);

    GLint colorLoc = glGetAttribLocation(program, "color");
    ASSERT_NE(-1, colorLoc);

    // Draw with the source buffer as a vertex attribute before the copy is issued.
    glBindBuffer(GL_ARRAY_BUFFER, srcBuffer);
    glVertexAttribPointer(colorLoc, 3, GL_UNSIGNED_BYTE, GL_TRUE, 0, nullptr);
    glEnableVertexAttribArray(colorLoc);

    drawQuad(program, "position", 0.5f, 1.0f, true);
    ASSERT_GL_NO_ERROR();

    glCopyBufferSubData(GL_ARRAY_BUFFER, GL_PIXEL_UNPACK_BUFFER, 0, 0, numElements);

    // Synchronize.
    glFinish();

    // Map with GL_MAP_UNSYNCHRONIZED_BIT and overwrite buffers data with srcData
    uint8_t *data = reinterpret_cast<uint8_t *>(glMapBufferRange(
        GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_WRITE_BIT | GL_MAP_UNSYNCHRONIZED_BIT));
    // Abort instead of writing through a null pointer if the map failed.
    ASSERT_GL_NO_ERROR();
    ASSERT_NE(nullptr, data);
    memcpy(data, srcData.data(), srcData.size());
    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
    EXPECT_GL_NO_ERROR();

    // Map without GL_MAP_UNSYNCHRONIZED_BIT and read data. We expect it to be srcData
    data = reinterpret_cast<uint8_t *>(
        glMapBufferRange(GL_PIXEL_UNPACK_BUFFER, 0, numElements, GL_MAP_READ_BIT));
    ASSERT_GL_NO_ERROR();
    ASSERT_NE(nullptr, data);
    for (size_t i = 0; i < numElements; ++i)
    {
        EXPECT_EQ(srcData[i], data[i]);
    }
    glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
    EXPECT_GL_NO_ERROR();
}
// Verify the functionality of glMapBufferRange()'s GL_MAP_UNSYNCHRONIZED_BIT // Verify the functionality of glMapBufferRange()'s GL_MAP_UNSYNCHRONIZED_BIT
// NOTE: On Vulkan, if we ever use memory that's not `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT`, then // NOTE: On Vulkan, if we ever use memory that's not `VK_MEMORY_PROPERTY_HOST_COHERENT_BIT`, then
// this could incorrectly pass. // this could incorrectly pass.
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment