Commit a2e6c1a1 by Nicolas Capens Committed by Nicolas Capens

Optimize multisample resolve with SSE2 instructions

Benchmark results:

Run on (48 X 2594 MHz CPU s)
CPU Caches:
  L1 Data 32 KiB (x24)
  L1 Instruction 32 KiB (x24)
  L2 Unified 256 KiB (x24)
  L3 Unified 30720 KiB (x2)
---------------------------------------------------------------
Benchmark                     Time             CPU   Iterations
---------------------------------------------------------------
(LLVM, before)
Triangle/Hello            0.845 ms        0.439 ms         1673
Triangle/Multisample       6.95 ms        0.781 ms         1000
(LLVM, after)
Triangle/Hello            0.861 ms        0.450 ms         1493
Triangle/Multisample       4.03 ms        0.753 ms          747
(Subzero, before)
Triangle/Hello             1.19 ms        0.474 ms         1120
Triangle/Multisample       11.8 ms        0.920 ms          747
(Subzero, after)
Triangle/Hello            0.907 ms        0.486 ms         1673
Triangle/Multisample       4.62 ms        0.781 ms         1000

Bug: b/147802090
Change-Id: Iea8498f2b745c86cf578db5c0f7ef2329b73c736
Reviewed-on: https://swiftshader-review.googlesource.com/c/SwiftShader/+/47970
Presubmit-Ready: Nicolas Capens <nicolascapens@google.com>
Tested-by: Nicolas Capens <nicolascapens@google.com>
Reviewed-by: Alexis Hétu <sugoi@google.com>
Kokoro-Result: kokoro <noreply+kokoro@google.com>
parent 4487e589
......@@ -16,6 +16,7 @@
#include "Pipeline/ShaderCore.hpp"
#include "Reactor/Reactor.hpp"
#include "System/CPUID.hpp"
#include "System/Debug.hpp"
#include "System/Half.hpp"
#include "System/Memory.hpp"
......@@ -24,6 +25,11 @@
#include <utility>
#if defined(__i386__) || defined(__x86_64__)
# include <xmmintrin.h>
# include <emmintrin.h>
#endif
namespace {
rr::RValue<rr::Int> PackFields(rr::Int4 const &ints, const sw::int4 shifts)
{
......@@ -1971,13 +1977,36 @@ bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve r
uint8_t *source2 = source1 + slice;
uint8_t *source3 = source2 + slice;
const bool SSE2 = CPUID::supportsSSE2();
if(format == VK_FORMAT_R8G8B8A8_UNORM || format == VK_FORMAT_B8G8R8A8_UNORM || format == VK_FORMAT_A8B8G8R8_UNORM_PACK32)
{
if(samples == 4)
{
for(int y = 0; y < height; y++)
{
for(int x = 0; x < width; x++)
int x = 0;
#if defined(__i386__) || defined(__x86_64__)
if(SSE2)
{
for(; (x + 3) < width; x += 4)
{
__m128i c0 = _mm_loadu_si128((__m128i *)(source0 + 4 * x));
__m128i c1 = _mm_loadu_si128((__m128i *)(source1 + 4 * x));
__m128i c2 = _mm_loadu_si128((__m128i *)(source2 + 4 * x));
__m128i c3 = _mm_loadu_si128((__m128i *)(source3 + 4 * x));
c0 = _mm_avg_epu8(c0, c1);
c2 = _mm_avg_epu8(c2, c3);
c0 = _mm_avg_epu8(c0, c2);
_mm_storeu_si128((__m128i *)(dest + 4 * x), c0);
}
}
#endif
for(; x < width; x++)
{
uint32_t c0 = *(uint32_t *)(source0 + 4 * x);
uint32_t c1 = *(uint32_t *)(source1 + 4 * x);
......@@ -1996,6 +2025,10 @@ bool Blitter::fastResolve(const vk::Image *src, vk::Image *dst, VkImageResolve r
source2 += pitch;
source3 += pitch;
dest += pitch;
ASSERT(source0 < src->end());
ASSERT(source3 < src->end());
ASSERT(dest < dst->end());
}
}
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment