Clean up the functions in loadimage.h/cpp.

* Capitalize the function names to fit the style guide.
* Use explicit sizes in the function names to avoid any confusion about input or output sizes.
* Use explicitly sized types in the functions to avoid potential issues on new platforms since a lot of bit-twiddling is used.
* Use size_t for all sizes.
* Use uint8_t for all binary input and output data so that pointer arithmetic is much easier.
* Move templated function definitions into an .inl file so that loadimage.h looks as clean as possible.

BUG=angle:665

Change-Id: Id7173ed66d9e1b7ee3261eea11e77d838cbd2951
Reviewed-on: https://chromium-review.googlesource.com/202590
Reviewed-by: Brandon Jones <bajones@chromium.org>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Tested-by: Geoff Lang <geofflang@chromium.org>

Clean up the functions in loadimage.h/cpp.
86846e24 · Geoff Lang · 2a6564eb · 86846e24 · 86846e24 · 86846e24
Commit 86846e24 authored Jun 03, 2014 by Geoff Lang
13 changed files
--- a/BUILD.gn
+++ b/BUILD.gn
@@ -390,6 +390,7 @@ if (is_win) {
      "src/libGLESv2/renderer/imageformats.h",
      "src/libGLESv2/renderer/loadimage.cpp",
      "src/libGLESv2/renderer/loadimage.h",
+      "src/libGLESv2/renderer/loadimage.inl",
      "src/libGLESv2/renderer/loadimageSSE2.cpp",
      "src/libGLESv2/renderer/vertexconversion.h",
      "src/libGLESv2/resource.h",

--- a/projects/src/libGLESv2.vcxproj
+++ b/projects/src/libGLESv2.vcxproj
@@ -113,6 +113,7 @@
  <ItemGroup>
    <None Include="..\..\src\angle.gyp"/>
    <None Include="..\..\src\libGLESv2\libGLESv2.def"/>
+    <None Include="..\..\src\libGLESv2\renderer\loadimage.inl"/>
    <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Swizzle11.hlsl"/>
    <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\generate_shaders.bat"/>
    <None Include="..\..\src\libGLESv2\renderer\d3d\d3d11\shaders\Passthrough3D11.hlsl"/>

--- a/projects/src/libGLESv2.vcxproj.filters
+++ b/projects/src/libGLESv2.vcxproj.filters
@@ -264,6 +264,9 @@
    <ClInclude Include="..\..\src\libGLESv2\renderer\TextureStorage.h">
      <Filter>src\libGLESv2\renderer</Filter>
    </ClInclude>
+    <None Include="..\..\src\libGLESv2\renderer\loadimage.inl">
+      <Filter>src\libGLESv2\renderer</Filter>
+    </None>
    <ClInclude Include="..\..\src\libGLESv2\renderer\ShaderExecutable.h">
      <Filter>src\libGLESv2\renderer</Filter>
    </ClInclude>

--- a/src/libGLESv2.gypi
+++ b/src/libGLESv2.gypi
@@ -37,7 +37,7 @@
                    [
                        '<!@(python <(angle_path)/enumerate_files.py \
                             -dirs common libGLESv2 third_party/murmurhash ../include third_party/systeminfo \
-                             -types *.cpp *.h *.hlsl *.vs *.ps *.bat *.def *.rc \
+                             -types *.cpp *.h *.inl *.hlsl *.vs *.ps *.bat *.def *.rc \
                             -excludes */d3d/*)',
                    ],
                    'defines':
@@ -54,7 +54,7 @@
                            [
                                '<!@(python <(angle_path)/enumerate_files.py \
                                     -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d9 \
-                                     -types *.cpp *.h *.vs *.ps *.bat)',
+                                     -types *.cpp *.h *.inl *.vs *.ps *.bat)',
                            ],
                            'defines':
                            [
@@ -77,7 +77,7 @@
                            [
                                '<!@(python <(angle_path)/enumerate_files.py \
                                     -dirs libGLESv2/renderer/d3d libGLESv2/renderer/d3d/d3d11 \
-                                     -types *.cpp *.h *.hlsl *.bat)',
+                                     -types *.cpp *.h *.inl *.hlsl *.bat)',
                            ],
                            'defines':
                            [

--- a/src/libGLESv2/formatutils.h
+++ b/src/libGLESv2/formatutils.h
@@ -14,16 +14,18 @@
 #include "libGLESv2/Caps.h"
 #include "libGLESv2/angletypes.h"
+#include <cstddef>
 typedef void (*MipGenerationFunction)(unsigned int sourceWidth, unsigned int sourceHeight, unsigned int sourceDepth,
                                      const unsigned char *sourceData, int sourceRowPitch, int sourceDepthPitch,
                                      unsigned char *destData, int destRowPitch, int destDepthPitch);
-typedef void (*LoadImageFunction)(int width, int height, int depth,
+typedef void (*LoadImageFunction)(size_t width, size_t height, size_t depth,
-                                  const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
+                                  const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
-                                  void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
+                                  uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
-typedef void (*InitializeTextureDataFunction)(int width, int height, int depth,
+typedef void (*InitializeTextureDataFunction)(size_t width, size_t height, size_t depth,
-                                              void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch);
+                                              uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch);
 typedef void (*ColorReadFunction)(const void *source, void *dest);
 typedef void (*ColorWriteFunction)(const void *source, void *dest);

--- a/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d11/Image11.cpp
@@ -168,8 +168,10 @@ void Image11::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei widt
        return;
    }
-    void* offsetMappedData = (void*)((BYTE *)mappedImage.pData + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch));
+    uint8_t* offsetMappedData = (reinterpret_cast<uint8_t*>(mappedImage.pData) + (yoffset * mappedImage.RowPitch + xoffset * outputPixelSize + zoffset * mappedImage.DepthPitch));
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch, offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
+                 offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
    unmap();
 }
@@ -198,11 +200,12 @@ void Image11::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GL
        return;
    }
-    void* offsetMappedData = (void*)((BYTE*)mappedImage.pData + ((yoffset / outputBlockHeight) * mappedImage.RowPitch +
+    uint8_t* offsetMappedData = reinterpret_cast<uint8_t*>(mappedImage.pData) + ((yoffset / outputBlockHeight) * mappedImage.RowPitch +
-                                                                 (xoffset / outputBlockWidth) * outputPixelSize +
+                                                                           (xoffset / outputBlockWidth) * outputPixelSize +
-                                                                 zoffset * mappedImage.DepthPitch));
+                                                                           zoffset * mappedImage.DepthPitch);
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch,
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
                 offsetMappedData, mappedImage.RowPitch, mappedImage.DepthPitch);
    unmap();

--- a/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d11/formatutils11.cpp
--- a/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d9/Image9.cpp
@@ -208,7 +208,7 @@ void Image9::createSurface()
            result = newSurface->LockRect(&lockedRect, &entireRect, 0);
            ASSERT(SUCCEEDED(result));
-            initializeFunc(mWidth, mHeight, 1, lockedRect.pBits, lockedRect.Pitch, 0);
+            initializeFunc(mWidth, mHeight, 1, reinterpret_cast<uint8_t*>(lockedRect.pBits), lockedRect.Pitch, 0);
            result = newSurface->UnlockRect();
            ASSERT(SUCCEEDED(result));
@@ -405,7 +405,9 @@ void Image9::loadData(GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width
        return;
    }
-    loadFunction(width, height, depth, input, inputRowPitch, 0, locked.pBits, locked.Pitch, 0);
+    loadFunction(width, height, depth,
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, 0,
+                 reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
    unlock();
 }
@@ -438,8 +440,9 @@ void Image9::loadCompressedData(GLint xoffset, GLint yoffset, GLint zoffset, GLs
        return;
    }
-    loadFunction(width, height, depth, input, inputRowPitch, inputDepthPitch,
+    loadFunction(width, height, depth,
-                 locked.pBits, locked.Pitch, 0);
+                 reinterpret_cast<const uint8_t*>(input), inputRowPitch, inputDepthPitch,
+                 reinterpret_cast<uint8_t*>(locked.pBits), locked.Pitch, 0);
    unlock();
 }

--- a/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp
+++ b/src/libGLESv2/renderer/d3d/d3d9/formatutils9.cpp
--- a/src/libGLESv2/renderer/loadimage.cpp
+++ b/src/libGLESv2/renderer/loadimage.cpp
--- a/src/libGLESv2/renderer/loadimage.h
+++ b/src/libGLESv2/renderer/loadimage.h
--- a/src/libGLESv2/renderer/loadimage.inl
+++ b/src/libGLESv2/renderer/loadimage.inl
+//
+// Copyright (c) 2014 The ANGLE Project Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+#include "common/mathutil.h"
+namespace rx
+{
+template <typename T>
+inline T *OffsetDataPointer(uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
+{
+    return reinterpret_cast<T*>(data + (y * rowPitch) + (z * depthPitch));
+}
+template <typename T>
+inline const T *OffsetDataPointer(const uint8_t *data, size_t y, size_t z, size_t rowPitch, size_t depthPitch)
+{
+    return reinterpret_cast<const T*>(data + (y * rowPitch) + (z * depthPitch));
+}
+template <typename type, size_t componentCount>
+inline void LoadToNative(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t rowSize = width * sizeof(type) * componentCount;
+    const size_t layerSize = rowSize * height;
+    const size_t imageSize = layerSize * depth;
+    if (layerSize == inputDepthPitch && layerSize == outputDepthPitch)
+    {
+        ASSERT(rowSize == inputRowPitch && rowSize == outputRowPitch);
+        memcpy(output, input, imageSize);
+    }
+    else if (rowSize == inputRowPitch && rowSize == outputRowPitch)
+    {
+        for (size_t z = 0; z < depth; z++)
+        {
+            const type *source = OffsetDataPointer<type>(input, 0, z, inputRowPitch, inputDepthPitch);
+            type *dest = OffsetDataPointer<type>(output, 0, z, outputRowPitch, outputDepthPitch);
+            memcpy(dest, source, layerSize);
+        }
+    }
+    else
+    {
+        for (size_t z = 0; z < depth; z++)
+        {
+            for (size_t y = 0; y < height; y++)
+            {
+                const type *source = OffsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
+                type *dest = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+                memcpy(dest, source, width * sizeof(type) * componentCount);
+            }
+        }
+    }
+}
+template <typename type, uint32_t fourthComponentBits>
+inline void LoadToNative3To4(size_t width, size_t height, size_t depth,
+                             const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                             uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const type fourthValue = gl::bitCast<type>(fourthComponentBits);
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const type *source = OffsetDataPointer<type>(input, y, z, inputRowPitch, inputDepthPitch);
+            type *dest = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                dest[x * 4 + 0] = source[x * 3 + 0];
+                dest[x * 4 + 1] = source[x * 3 + 1];
+                dest[x * 4 + 2] = source[x * 3 + 2];
+                dest[x * 4 + 3] = fourthValue;
+            }
+        }
+    }
+}
+template <size_t componentCount>
+inline void Load32FTo16F(size_t width, size_t height, size_t depth,
+                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t elementWidth = componentCount * width;
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            const float *source = OffsetDataPointer<float>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint16_t *dest = OffsetDataPointer<uint16_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < elementWidth; x++)
+            {
+                dest[x] = gl::float32ToFloat16(source[x]);
+            }
+        }
+    }
+}
+template <size_t blockWidth, size_t blockHeight, size_t blockSize>
+inline void LoadCompressedToNative(size_t width, size_t height, size_t depth,
+                                   const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
+                                   uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    const size_t columns = (width + (blockWidth - 1)) / blockWidth;
+    const size_t rows = (height + (blockHeight - 1)) / blockHeight;
+    for (size_t z = 0; z < depth; ++z)
+    {
+        for (size_t y = 0; y < rows; ++y)
+        {
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint8_t *dest = OffsetDataPointer<uint8_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            memcpy(dest, source, columns * blockSize);
+        }
+    }
+}
+template <typename type, uint32_t firstBits, uint32_t secondBits, uint32_t thirdBits, uint32_t fourthBits>
+inline void Initialize4ComponentData(size_t width, size_t height, size_t depth,
+                                     uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
+{
+    type writeValues[4] =
+    {
+        gl::bitCast<type>(firstBits),
+        gl::bitCast<type>(secondBits),
+        gl::bitCast<type>(thirdBits),
+        gl::bitCast<type>(fourthBits),
+    };
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
+        {
+            type *destRow = OffsetDataPointer<type>(output, y, z, outputRowPitch, outputDepthPitch);
+            for (size_t x = 0; x < width; x++)
+            {
+                type* destPixel = destRow + x * 4;
+                // This could potentially be optimized by generating an entire row of initialization
+                // data and copying row by row instead of pixel by pixel.
+                memcpy(destPixel, writeValues, sizeof(type) * 4);
+            }
+        }
+    }
+}
+}
--- a/src/libGLESv2/renderer/loadimageSSE2.cpp
+++ b/src/libGLESv2/renderer/loadimageSSE2.cpp
 #include "precompiled.h"
 //
-// Copyright (c) 2002-2012 The ANGLE Project Authors. All rights reserved.
+// Copyright (c) 2002-2014 The ANGLE Project Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 //
-// loadimage.cpp: Defines image loading functions. It's
+// loadimageSSE2.cpp: Defines image loading functions. It's
 // in a separated file for GCC, which can enable SSE usage only per-file,
 // not for code blocks that use SSE2 explicitly.
@@ -14,94 +14,92 @@
 namespace rx
 {
-    void loadAlphaDataToBGRASSE2(int width, int height, int depth,
+void LoadA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
-        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
+                        const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
-        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+                        uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
-    {
+{
-        const unsigned char *source = NULL;
+    __m128i zeroWide = _mm_setzero_si128();
-        unsigned int *dest = NULL;
-        __m128i zeroWide = _mm_setzero_si128();
-        for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
        {
-            for (int y = 0; y < height; y++)
+            const uint8_t *source = OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            size_t x = 0;
+            // Make output writes aligned
+            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
            {
-                source = static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch;
+                dest[x] = static_cast<uint32_t>(source[x]) << 24;
-                dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch);
+            }
-                int x;
+            for (; x + 7 < width; x += 8)
-                // Make output writes aligned
+            {
-                for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
+                __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
-                {
+                // Interleave each byte to 16bit, make the lower byte to zero
-                    dest[x] = static_cast<unsigned int>(source[x]) << 24;
+                sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
-                }
+                // Interleave each 16bit to 32bit, make the lower 16bit to zero
+                __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
-                for (; x + 7 < width; x += 8)
+                __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
-                {
-                    __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&source[x]));
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
-                    // Interleave each byte to 16bit, make the lower byte to zero
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
-                    sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
+            }
-                    // Interleave each 16bit to 32bit, make the lower 16bit to zero
-                    __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
+            // Handle the remainder
-                    __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
+            for (; x < width; x++)
+            {
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), lo);
+                dest[x] = static_cast<uint32_t>(source[x]) << 24;
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x + 4]), hi);
-                }
-                // Handle the remainder
-                for (; x < width; x++)
-                {
-                    dest[x] = static_cast<unsigned int>(source[x]) << 24;
-                }
            }
        }
    }
+}
-    void loadRGBAUByteDataToBGRASSE2(int width, int height, int depth,
+void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
-        const void *input, unsigned int inputRowPitch, unsigned int inputDepthPitch,
+                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
-        void *output, unsigned int outputRowPitch, unsigned int outputDepthPitch)
+                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
-    {
+{
-        const unsigned int *source = NULL;
+    __m128i brMask = _mm_set1_epi32(0x00ff00ff);
-        unsigned int *dest = NULL;
-        __m128i brMask = _mm_set1_epi32(0x00ff00ff);
-        for (int z = 0; z < depth; z++)
+    for (size_t z = 0; z < depth; z++)
+    {
+        for (size_t y = 0; y < height; y++)
        {
-            for (int y = 0; y < height; y++)
+            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
+            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);
+            size_t x = 0;
+            // Make output writes aligned
+            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
+            {
+                uint32_t rgba = source[x];
+                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
+            }
+            for (; x + 3 < width; x += 4)
+            {
+                __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
+                // Mask out g and a, which don't change
+                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
+                // Mask out b and r
+                __m128i brComponents = _mm_and_si128(sourceData, brMask);
+                // Swap b and r
+                __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
+                __m128i result = _mm_or_si128(gaComponents, brSwapped);
+                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
+            }
+            // Perform leftover writes
+            for (; x < width; x++)
            {
-                source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputRowPitch + z * inputDepthPitch);
+                uint32_t rgba = source[x];
-                dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputRowPitch + z * outputDepthPitch);
+                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
-                int x = 0;
-                // Make output writes aligned
-                for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
-                {
-                    unsigned int rgba = source[x];
-                    dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
-                }
-                for (; x + 3 < width; x += 4)
-                {
-                    __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
-                    // Mask out g and a, which don't change
-                    __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
-                    // Mask out b and r
-                    __m128i brComponents = _mm_and_si128(sourceData, brMask);
-                    // Swap b and r
-                    __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
-                    __m128i result = _mm_or_si128(gaComponents, brSwapped);
-                    _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
-                }
-                // Perform leftover writes
-                for (; x < width; x++)
-                {
-                    unsigned int rgba = source[x];
-                    dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
-                }
            }
        }
    }
+}
 }