Commit 86846e24 by Geoff Lang

Clean up the functions in loadimage.h/cpp.

* Capitalize the function names to fit the style guide.
* Use explicit sizes in the function names to avoid any confusion about input or output sizes.
* Use explicitly sized types in the functions to avoid potential issues on new platforms, since a lot of bit-twiddling is used.
* Use size_t for all sizes.
* Use uint8_t for all binary input and output data so that pointer arithmetic is much easier.
* Move templated function definitions into an .inl file so that loadimage.h looks as clean as possible.

BUG=angle:665
Change-Id: Id7173ed66d9e1b7ee3261eea11e77d838cbd2951
Reviewed-on: https://chromium-review.googlesource.com/202590
Reviewed-by: Brandon Jones <bajones@chromium.org>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Tested-by: Geoff Lang <geofflang@chromium.org>
parent 2a6564eb
...@@ -390,6 +390,7 @@ if (is_win) { ...@@ -390,6 +390,7 @@ if (is_win) {
"src/libGLESv2/renderer/imageformats.h", "src/libGLESv2/renderer/imageformats.h",
"src/libGLESv2/renderer/loadimage.cpp", "src/libGLESv2/renderer/loadimage.cpp",
"src/libGLESv2/renderer/loadimage.h", "src/libGLESv2/renderer/loadimage.h",
"src/libGLESv2/renderer/loadimage.inl",
"src/libGLESv2/renderer/loadimageSSE2.cpp", "src/libGLESv2/renderer/loadimageSSE2.cpp",
"src/libGLESv2/renderer/vertexconversion.h", "src/libGLESv2/renderer/vertexconversion.h",
"src/libGLESv2/resource.h", "src/libGLESv2/resource.h",
......
...@@ -113,6 +113,7 @@ ...@@ -113,6 +113,7 @@
<ItemGroup> <ItemGroup>
<None Include="..\..\src\angle.gyp"/> <None Include="..\..\src\angle.gyp"/>
<None Include="..\..\src\libGLESv2\libGLESv2.def"/> <None Include="..\..\src\libGLESv2\libGLESv2.def"/>
<None Include="..\..\src\libGLESv2\renderer\loadimage.inl"/>
<None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Swizzle11.hlsl"/> <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Swizzle11.hlsl"/>
<None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\generate_shaders.bat"/> <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\generate_shaders.bat"/>
<None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Passthrough3D11.hlsl"/> <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Passthrough3D11.hlsl"/>
......
...@@ -264,6 +264,9 @@ ...@@ -264,6 +264,9 @@
<ClInclude Include="..\..\src\libGLESv2\renderer\TextureStorage.h"> <ClInclude Include="..\..\src\libGLESv2\renderer\TextureStorage.h">
<Filter>src\libGLESv2\renderer</Filter> <Filter>src\libGLESv2\renderer</Filter>
</ClInclude> </ClInclude>
<None Include="..\..\src\libGLESv2\renderer\loadimage.inl">
<Filter>src\libGLESv2\renderer</Filter>
</None>
<ClInclude Include="..\..\src\libGLESv2\renderer\ShaderExecutable.h"> <ClInclude Include="..\..\src\libGLESv2\renderer\ShaderExecutable.h">
<Filter>src\libGLESv2\renderer</Filter> <Filter>src\libGLESv2\renderer</Filter>
</ClInclude> </ClInclude>
......
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
[ [
'<!@(python <(angle_path)/enumerate_files.py \ '<!@(python <(angle_path)/enumerate_files.py \
-dirs common libGLESv2 third_party/murmurhash ../include third_party/systeminfo \ -dirs common libGLESv2 third_party/murmurhash ../include third_party/systeminfo \
-types *.cpp *.h *.hlsl *.vs *.ps *.bat *.def *.rc \ -types *.cpp *.h *.inl *.hlsl *.vs *.ps *.bat *.def *.rc \
-excludes */d3d/*)', -excludes */d3d/*)',
], ],
'defines': 'defines':
...@@ -54,7 +54,7 @@ ...@@ -54,7 +54,7 @@
[ [
'<!@(python <(angle_path)/enumerate_files.py \ '<!@(python <(angle_path)/enumerate_files.py \
-dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d9 \ -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d9 \
-types *.cpp *.h *.vs *.ps *.bat)', -types *.cpp *.h *.inl *.vs *.ps *.bat)',
], ],
'defines': 'defines':
[ [
...@@ -77,7 +77,7 @@ ...@@ -77,7 +77,7 @@
[ [
'<!@(python <(angle_path)/enumerate_files.py \ '<!@(python <(angle_path)/enumerate_files.py \
-dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d/d3d11 \ -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d/d3d11 \
-types *.cpp *.h *.hlsl *.bat)', -types *.cpp *.h *.inl *.hlsl *.bat)',
], ],
'defines': 'defines':
[ [
......
...@@ -14,16 +14,18 @@ ...@@ -14,16 +14,18 @@
#include "libGLESv2/Caps.h" #include "libGLESv2/Caps.h"
#include "libGLESv2/angletypes.h" #include "libGLESv2/angletypes.h"
#include <cstddef>
typedef void (*MipGenerationFunction)(unsigned int sourceWidth, unsigned int sourceHeight, unsigned int sourceDepth, typedef void (*MipGenerationFunction)(unsigned int sourceWidth, unsigned int sourceHeight, unsigned int sourceDepth,
const unsigned char *sourceData, int sourceRowPitch, int sourceDepthPitch, const unsigned char *sourceData, int sourceRowPitch, int sourceDepthPitch,
unsigned char *destData, int destRowPitch, int destDepthPitch); unsigned char *destData, int destRowPitch, int destDepthPitch);
typedef void (*LoadImageFunction)(int width, int height, int depth, typedef void (*LoadImageFunction)(size_t width, size_t height, size_t depth,
const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch, const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch); uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
typedef void (*InitializeTextureDataFunction)(int width, int height, int depth, typedef void (*InitializeTextureDataFunction)(size_t width, size_t height, size_t depth,
void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch); uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
typedef void (*ColorReadFunction)(const void *source, void *dest); typedef void (*ColorReadFunction)(const void *source, void *dest);
typedef void (*ColorWriteFunction)(const void *source, void *dest); typedef void (*ColorWriteFunction)(const void *source, void *dest);
......
...@@ -168,8 +168,10 @@ void Image11::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei widt ...@@ -168,8 +168,10 @@ void Image11::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei widt
return; return;
} }
void* offsetMappedData = (void*)((BYTE *)mappedImage.pData + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch)); uint8_t* offsetMappedData = (reinterpret_cast<uint8_t*>(mappedImage.pData) + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch));
loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch, offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch); loadFunction(width, height, depth,
reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
unmap(); unmap();
} }
...@@ -198,11 +200,12 @@ void Image11::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GL ...@@ -198,11 +200,12 @@ void Image11::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GL
return; return;
} }
void* offsetMappedData = (void*)((BYTE*)mappedImage.pData + ((yoffset / outputBlockHeight) * mappedImage.RowPitch + uint8_t* offsetMappedData = reinterpret_cast<uint8_t*>(mappedImage.pData) + ((yoffset / outputBlockHeight) * mappedImage.RowPitch +
(xoffset / outputBlockWidth) * outputPixelSize + (xoffset / outputBlockWidth) * outputPixelSize +
zoffset * mappedImage.DepthPitch)); zoffset * mappedImage.DepthPitch);
loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch, loadFunction(width, height, depth,
reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch); offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
unmap(); unmap();
......
...@@ -208,7 +208,7 @@ void Image9::createSurface() ...@@ -208,7 +208,7 @@ void Image9::createSurface()
result = newSurface->LockRect(&lockedRect, &entireRect, 0); result = newSurface->LockRect(&lockedRect, &entireRect, 0);
ASSERT(SUCCEEDED(result)); ASSERT(SUCCEEDED(result));
initializeFunc(mWidth, mHeight, 1, lockedRect.pBits, lockedRect.Pitch, 0); initializeFunc(mWidth, mHeight, 1, reinterpret_cast<uint8_t*>(lockedRect.pBits), lockedRect.Pitch, 0);
result = newSurface->UnlockRect(); result = newSurface->UnlockRect();
ASSERT(SUCCEEDED(result)); ASSERT(SUCCEEDED(result));
...@@ -405,7 +405,9 @@ void Image9::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width ...@@ -405,7 +405,9 @@ void Image9::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width
return; return;
} }
loadFunction(width, height, depth, input, inputRowPitch, 0, locked.pBits, locked.Pitch, 0); loadFunction(width, height, depth,
reinterpret_cast<const uint8_t*>(input), inputRowPitch, 0,
reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
unlock(); unlock();
} }
...@@ -438,8 +440,9 @@ void Image9::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GLs ...@@ -438,8 +440,9 @@ void Image9::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GLs
return; return;
} }
loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch, loadFunction(width, height, depth,
locked.pBits, locked.Pitch, 0); reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
unlock(); unlock();
} }
......
//
// Copyright (c) 2014 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
#include "common/mathutil.h"
namespace rx
{
// Returns a T* to the start of row 'y' in depth slice 'z' of a mutable byte
// buffer. Both pitches are expressed in bytes; the offset is applied before
// the pointer is reinterpreted as T.
template <typename T>
inline T *OffsetDataPointer(uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
{
    uint8_t *const rowStart = data + (z * depthPitch) + (y * rowPitch);
    return reinterpret_cast<T*>(rowStart);
}
// Read-only counterpart: returns a const T* to the start of row 'y' in depth
// slice 'z' of a const byte buffer. Both pitches are expressed in bytes; the
// offset is applied before the pointer is reinterpreted as const T.
template <typename T>
inline const T *OffsetDataPointer(const uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
{
    const uint8_t *const rowStart = data + (z * depthPitch) + (y * rowPitch);
    return reinterpret_cast<const T*>(rowStart);
}
// Copies image data from 'input' to 'output' byte-for-byte, with no format
// conversion. A row holds width * componentCount elements of 'type'.
//
// The copy is done at the coarsest granularity the pitches allow:
//   1. both depth pitches equal the packed layer size -> one memcpy for the whole image;
//   2. both row pitches equal the packed row size     -> one memcpy per depth slice;
//   3. otherwise                                      -> one memcpy per row.
template <typename type, size_t componentCount>
inline void LoadToNative(size_t width, size_t height, size_t depth,
                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    const size_t rowSize = width * sizeof(type) * componentCount;
    const size_t bytesPerLayer = rowSize * height;
    const size_t bytesPerImage = bytesPerLayer * depth;

    const bool layersTightlyPacked = (inputDepthPitch == bytesPerLayer) && (outputDepthPitch == bytesPerLayer);
    if (layersTightlyPacked)
    {
        // Tightly packed layers are expected to imply tightly packed rows.
        ASSERT(rowSize == inputRowPitch && rowSize == outputRowPitch);
        memcpy(output, input, bytesPerImage);
        return;
    }

    const bool rowsTightlyPacked = (inputRowPitch == rowSize) && (outputRowPitch == rowSize);
    if (rowsTightlyPacked)
    {
        // Rows are contiguous within a slice, so each slice can be copied in one go.
        for (size_t slice = 0; slice < depth; ++slice)
        {
            const uint8_t *srcLayer = OffsetDataPointer<uint8_t>(input, 0, slice, inputRowPitch, inputDepthPitch);
            uint8_t *dstLayer = OffsetDataPointer<uint8_t>(output, 0, slice, outputRowPitch, outputDepthPitch);
            memcpy(dstLayer, srcLayer, bytesPerLayer);
        }
        return;
    }

    // General case: rows are padded on at least one side, so copy row by row.
    for (size_t slice = 0; slice < depth; ++slice)
    {
        for (size_t row = 0; row < height; ++row)
        {
            const uint8_t *srcRow = OffsetDataPointer<uint8_t>(input, row, slice, inputRowPitch, inputDepthPitch);
            uint8_t *dstRow = OffsetDataPointer<uint8_t>(output, row, slice, outputRowPitch, outputDepthPitch);
            memcpy(dstRow, srcRow, rowSize);
        }
    }
}
// Expands three-component pixels to four-component pixels. The first three
// components of every pixel are copied unchanged; the fourth is filled with
// the value obtained by bit-casting 'fourthComponentBits' to 'type'.
template <typename type, uint32_t fourthComponentBits>
inline void LoadToNative3To4(size_t width, size_t height, size_t depth,
                             const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                             uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    const type fillValue = gl::bitCast<type>(fourthComponentBits);

    for (size_t slice = 0; slice < depth; ++slice)
    {
        for (size_t row = 0; row < height; ++row)
        {
            const type *srcPixel = OffsetDataPointer<type>(input, row, slice, inputRowPitch, inputDepthPitch);
            type *dstPixel = OffsetDataPointer<type>(output, row, slice, outputRowPitch, outputDepthPitch);

            // Walk both rows in lock-step: 3 components in, 4 components out.
            for (size_t column = 0; column < width; ++column, srcPixel += 3, dstPixel += 4)
            {
                dstPixel[0] = srcPixel[0];
                dstPixel[1] = srcPixel[1];
                dstPixel[2] = srcPixel[2];
                dstPixel[3] = fillValue;
            }
        }
    }
}
// Converts rows of 32-bit float components into 16-bit values using
// gl::float32ToFloat16. Each row holds width * componentCount components;
// the results are stored as uint16_t in the output buffer.
template <size_t componentCount>
inline void Load32FTo16F(size_t width, size_t height, size_t depth,
                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    const size_t componentsPerRow = componentCount * width;

    for (size_t slice = 0; slice < depth; ++slice)
    {
        for (size_t row = 0; row < height; ++row)
        {
            const float *src = OffsetDataPointer<float>(input, row, slice, inputRowPitch, inputDepthPitch);
            uint16_t *dst = OffsetDataPointer<uint16_t>(output, row, slice, outputRowPitch, outputDepthPitch);

            // Components are converted independently, so pixel boundaries do not matter here.
            for (size_t component = 0; component < componentsPerRow; ++component)
            {
                const float value = src[component];
                dst[component] = gl::float32ToFloat16(value);
            }
        }
    }
}
// Copies block-compressed image data without modification. The image is
// treated as a grid of blockWidth x blockHeight blocks of 'blockSize' bytes
// each; partial blocks at the right and bottom edges count as whole blocks.
// The row pitches are applied per row of blocks, not per row of pixels.
template <size_t blockWidth, size_t blockHeight, size_t blockSize>
inline void LoadCompressedToNative(size_t width, size_t height, size_t depth,
                                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    // Round up so that partially covered blocks are included.
    const size_t blocksAcross = (width + (blockWidth - 1)) / blockWidth;
    const size_t blocksDown = (height + (blockHeight - 1)) / blockHeight;
    const size_t bytesPerBlockRow = blocksAcross * blockSize;

    for (size_t slice = 0; slice < depth; ++slice)
    {
        for (size_t blockRow = 0; blockRow < blocksDown; ++blockRow)
        {
            const uint8_t *src = OffsetDataPointer<uint8_t>(input, blockRow, slice, inputRowPitch, inputDepthPitch);
            uint8_t *dst = OffsetDataPointer<uint8_t>(output, blockRow, slice, outputRowPitch, outputDepthPitch);
            memcpy(dst, src, bytesPerBlockRow);
        }
    }
}
// Fills every pixel of a four-component image with one constant pixel. The
// four component values are obtained by bit-casting firstBits..fourthBits to
// 'type'.
template <typename type, uint32_t firstBits, uint32_t secondBits, uint32_t thirdBits, uint32_t fourthBits>
inline void Initialize4ComponentData(size_t width, size_t height, size_t depth,
                                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    // The constant pixel that is stamped into every destination pixel.
    const type fillPixel[4] =
    {
        gl::bitCast<type>(firstBits),
        gl::bitCast<type>(secondBits),
        gl::bitCast<type>(thirdBits),
        gl::bitCast<type>(fourthBits),
    };

    for (size_t slice = 0; slice < depth; ++slice)
    {
        for (size_t row = 0; row < height; ++row)
        {
            type *rowStart = OffsetDataPointer<type>(output, row, slice, outputRowPitch, outputDepthPitch);

            // Possible optimization: build one fully initialized row and copy it row
            // by row rather than writing pixel by pixel.
            for (size_t column = 0; column < width; ++column)
            {
                memcpy(rowStart + column * 4, fillPixel, sizeof(type) * 4);
            }
        }
    }
}
}
#include "precompiled.h" #include "precompiled.h"
// //
// Copyright (c) 2002-2012 The ANGLE Project Authors. All rights reserved. // Copyright (c) 2002-2014 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be // Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. // found in the LICENSE file.
// //
// loadimage.cpp: Defines image loading functions. It's // loadimageSSE2.cpp: Defines image loading functions. It's
// in a separated file for GCC, which can enable SSE usage only per-file, // in a separated file for GCC, which can enable SSE usage only per-file,
// not for code blocks that use SSE2 explicitly. // not for code blocks that use SSE2 explicitly.
...@@ -14,94 +14,92 @@ ...@@ -14,94 +14,92 @@
namespace rx namespace rx
{ {
void loadAlphaDataToBGRASSE2(int width, int height, int depth, void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch, const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch) uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{ {
const unsigned char *source = NULL; __m128i zeroWide = _mm_setzero_si128();
unsigned int *dest = NULL;
__m128i zeroWide = _mm_setzero_si128();
for (int z = 0; z < depth; z++) for (size_t z = 0; z < depth; z++)
{
for (size_t y = 0; y < height; y++)
{ {
for (int y = 0; y < height; y++) const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
size_t x = 0;
// Make output writes aligned
for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
{ {
source = static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch; dest[x] = static_cast<uint32_t>(source[x]) << 24;
dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch); }
int x; for (; x + 7 < width; x += 8)
// Make output writes aligned {
for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++) __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
{ // Interleave each byte to 16bit, make the lower byte to zero
dest[x] = static_cast<unsigned int>(source[x]) << 24; sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
} // Interleave each 16bit to 32bit, make the lower 16bit to zero
__m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
for (; x + 7 < width; x += 8) __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
{
__m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x])); _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
// Interleave each byte to 16bit, make the lower byte to zero _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
sourceData = _mm_unpacklo_epi8(zeroWide, sourceData); }
// Interleave each 16bit to 32bit, make the lower 16bit to zero
__m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData); // Handle the remainder
__m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData); for (; x < width; x++)
{
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo); dest[x] = static_cast<uint32_t>(source[x]) << 24;
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
}
// Handle the remainder
for (; x < width; x++)
{
dest[x] = static_cast<unsigned int>(source[x]) << 24;
}
} }
} }
} }
}
void loadRGBAUByteDataToBGRASSE2(int width, int height, int depth, void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch, const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch) uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{ {
const unsigned int *source = NULL; __m128i brMask = _mm_set1_epi32(0x00ff00ff);
unsigned int *dest = NULL;
__m128i brMask = _mm_set1_epi32(0x00ff00ff);
for (int z = 0; z < depth; z++) for (size_t z = 0; z < depth; z++)
{
for (size_t y = 0; y < height; y++)
{ {
for (int y = 0; y < height; y++) const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
size_t x = 0;
// Make output writes aligned
for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
{
uint32_t rgba = source[x];
dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
for (; x + 3 < width; x += 4)
{
__m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
// Mask out g and a, which don't change
__m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
// Mask out b and r
__m128i brComponents = _mm_and_si128(sourceData, brMask);
// Swap b and r
__m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
__m128i result = _mm_or_si128(gaComponents, brSwapped);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
}
// Perform leftover writes
for (; x < width; x++)
{ {
source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch); uint32_t rgba = source[x];
dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch); dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
int x = 0;
// Make output writes aligned
for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
{
unsigned int rgba = source[x];
dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
for (; x + 3 < width; x += 4)
{
__m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
// Mask out g and a, which don't change
__m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
// Mask out b and r
__m128i brComponents = _mm_and_si128(sourceData, brMask);
// Swap b and r
__m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
__m128i result = _mm_or_si128(gaComponents, brSwapped);
_mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
}
// Perform leftover writes
for (; x < width; x++)
{
unsigned int rgba = source[x];
dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
} }
} }
} }
}
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment