Commit 5695fc99 by Geoff Lang Committed by Commit Bot

Clean up the SSE detection logic.

* Include the correct header right in the SSE check in platform.h.
* Don't use separate SSE versions of the load functions, have them use SSE automatically.

BUG=612205
Change-Id: I70f9a5513e144db4d16c1f3ad922debeb6c50268
Reviewed-on: https://chromium-review.googlesource.com/358108
Reviewed-by: Corentin Wallez <cwallez@chromium.org>
Reviewed-by: Jamie Madill <jmadill@chromium.org>
Commit-Queue: Geoff Lang <geofflang@chromium.org>
parent e92507bc
...@@ -135,7 +135,7 @@ inline unsigned int unorm(float x) ...@@ -135,7 +135,7 @@ inline unsigned int unorm(float x)
inline bool supportsSSE2() inline bool supportsSSE2()
{ {
#if defined(ANGLE_PLATFORM_WINDOWS) && !defined(_M_ARM) #if defined(ANGLE_USE_SSE)
static bool checked = false; static bool checked = false;
static bool supports = false; static bool supports = false;
...@@ -144,21 +144,22 @@ inline bool supportsSSE2() ...@@ -144,21 +144,22 @@ inline bool supportsSSE2()
return supports; return supports;
} }
int info[4]; #if defined(ANGLE_PLATFORM_WINDOWS) && !defined(_M_ARM)
__cpuid(info, 0);
if (info[0] >= 1)
{ {
__cpuid(info, 1); int info[4];
__cpuid(info, 0);
supports = (info[3] >> 26) & 1; if (info[0] >= 1)
} {
__cpuid(info, 1);
supports = (info[3] >> 26) & 1;
}
}
#endif // defined(ANGLE_PLATFORM_WINDOWS) && !defined(_M_ARM)
checked = true; checked = true;
return supports; return supports;
#else #else // defined(ANGLE_USE_SSE)
UNIMPLEMENTED();
return false; return false;
#endif #endif
} }
......
...@@ -77,8 +77,12 @@ ...@@ -77,8 +77,12 @@
# undef far # undef far
#endif #endif
#if !defined(_M_ARM) && !defined(ANGLE_PLATFORM_ANDROID) #if defined(_MSC_VER) && !defined(_M_ARM)
# define ANGLE_USE_SSE #include <intrin.h>
#define ANGLE_USE_SSE
#elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#include <x86intrin.h>
#define ANGLE_USE_SSE
#endif #endif
#endif // COMMON_PLATFORM_H_ #endif // COMMON_PLATFORM_H_
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#include "image_util/loadimage.h" #include "image_util/loadimage.h"
#include "common/mathutil.h" #include "common/mathutil.h"
#include "common/platform.h"
#include "image_util/imageformats.h" #include "image_util/imageformats.h"
namespace angle namespace angle
...@@ -24,6 +25,54 @@ void LoadA8ToRGBA8(size_t width, ...@@ -24,6 +25,54 @@ void LoadA8ToRGBA8(size_t width,
size_t outputRowPitch, size_t outputRowPitch,
size_t outputDepthPitch) size_t outputDepthPitch)
{ {
#if defined(ANGLE_USE_SSE)
if (gl::supportsSSE2())
{
__m128i zeroWide = _mm_setzero_si128();
for (size_t z = 0; z < depth; z++)
{
for (size_t y = 0; y < height; y++)
{
const uint8_t *source =
priv::OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
uint32_t *dest = priv::OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch,
outputDepthPitch);
size_t x = 0;
// Make output writes aligned
for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
{
dest[x] = static_cast<uint32_t>(source[x]) << 24;
}
for (; x + 7 < width; x += 8)
{
__m128i sourceData =
_mm_loadl_epi64(reinterpret_cast<const __m128i *>(&source[x]));
// Interleave each byte to 16bit, make the lower byte to zero
sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
// Interleave each 16bit to 32bit, make the lower 16bit to zero
__m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
__m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
_mm_store_si128(reinterpret_cast<__m128i *>(&dest[x]), lo);
_mm_store_si128(reinterpret_cast<__m128i *>(&dest[x + 4]), hi);
}
// Handle the remainder
for (; x < width; x++)
{
dest[x] = static_cast<uint32_t>(source[x]) << 24;
}
}
}
return;
}
#endif
for (size_t z = 0; z < depth; z++) for (size_t z = 0; z < depth; z++)
{ {
for (size_t y = 0; y < height; y++) for (size_t y = 0; y < height; y++)
...@@ -545,6 +594,58 @@ void LoadRGBA8ToBGRA8(size_t width, ...@@ -545,6 +594,58 @@ void LoadRGBA8ToBGRA8(size_t width,
size_t outputRowPitch, size_t outputRowPitch,
size_t outputDepthPitch) size_t outputDepthPitch)
{ {
#if defined(ANGLE_USE_SSE)
if (gl::supportsSSE2())
{
__m128i brMask = _mm_set1_epi32(0x00ff00ff);
for (size_t z = 0; z < depth; z++)
{
for (size_t y = 0; y < height; y++)
{
const uint32_t *source =
priv::OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
uint32_t *dest = priv::OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch,
outputDepthPitch);
size_t x = 0;
// Make output writes aligned
for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
{
uint32_t rgba = source[x];
dest[x] = (ANGLE_ROTL(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
for (; x + 3 < width; x += 4)
{
__m128i sourceData =
_mm_loadu_si128(reinterpret_cast<const __m128i *>(&source[x]));
// Mask out g and a, which don't change
__m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
// Mask out b and r
__m128i brComponents = _mm_and_si128(sourceData, brMask);
// Swap b and r
__m128i brSwapped = _mm_shufflehi_epi16(
_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)),
_MM_SHUFFLE(2, 3, 0, 1));
__m128i result = _mm_or_si128(gaComponents, brSwapped);
_mm_store_si128(reinterpret_cast<__m128i *>(&dest[x]), result);
}
// Perform leftover writes
for (; x < width; x++)
{
uint32_t rgba = source[x];
dest[x] = (ANGLE_ROTL(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
}
}
}
return;
}
#endif
for (size_t z = 0; z < depth; z++) for (size_t z = 0; z < depth; z++)
{ {
for (size_t y = 0; y < height; y++) for (size_t y = 0; y < height; y++)
......
...@@ -35,16 +35,6 @@ void LoadA8ToBGRA8(size_t width, ...@@ -35,16 +35,6 @@ void LoadA8ToBGRA8(size_t width,
size_t outputRowPitch, size_t outputRowPitch,
size_t outputDepthPitch); size_t outputDepthPitch);
void LoadA8ToBGRA8_SSE2(size_t width,
size_t height,
size_t depth,
const uint8_t *input,
size_t inputRowPitch,
size_t inputDepthPitch,
uint8_t *output,
size_t outputRowPitch,
size_t outputDepthPitch);
void LoadA32FToRGBA32F(size_t width, void LoadA32FToRGBA32F(size_t width,
size_t height, size_t height,
size_t depth, size_t depth,
...@@ -215,16 +205,6 @@ void LoadR5G6B5ToRGBA8(size_t width, ...@@ -215,16 +205,6 @@ void LoadR5G6B5ToRGBA8(size_t width,
size_t outputRowPitch, size_t outputRowPitch,
size_t outputDepthPitch); size_t outputDepthPitch);
void LoadRGBA8ToBGRA8_SSE2(size_t width,
size_t height,
size_t depth,
const uint8_t *input,
size_t inputRowPitch,
size_t inputDepthPitch,
uint8_t *output,
size_t outputRowPitch,
size_t outputDepthPitch);
void LoadRGBA8ToBGRA8(size_t width, void LoadRGBA8ToBGRA8(size_t width,
size_t height, size_t height,
size_t depth, size_t depth,
......
//
// Copyright (c) 2002-2015 The ANGLE Project Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// loadimageSSE2.cpp: Defines image loading functions. It's
// in a separate file for GCC, which can enable SSE usage only per-file,
// not for code blocks that use SSE2 explicitly.
#include "image_util/loadimage.h"
#include "common/mathutil.h"
#include "common/platform.h"
#ifdef ANGLE_USE_SSE
#include <emmintrin.h>
#endif
namespace angle
{
// Expands an 8-bit alpha (A8) image into 32-bit BGRA8 pixels using SSE2:
// each source byte becomes the top (alpha) byte of a 32-bit output pixel,
// with the remaining three channels set to zero.
//
// width/height/depth give the image extent in pixels; inputRowPitch /
// inputDepthPitch and outputRowPitch / outputDepthPitch are the byte strides
// between consecutive rows and depth slices of input and output respectively.
//
// When ANGLE_USE_SSE is not defined (e.g. ARM builds), the function reports
// UNIMPLEMENTED and returns without writing any output.
void LoadA8ToBGRA8_SSE2(size_t width,
                        size_t height,
                        size_t depth,
                        const uint8_t *input,
                        size_t inputRowPitch,
                        size_t inputDepthPitch,
                        uint8_t *output,
                        size_t outputRowPitch,
                        size_t outputDepthPitch)
{
#if defined(ANGLE_USE_SSE)
    // All-zero vector used to supply the zero B/G/R bytes during unpacking.
    __m128i zeroWide = _mm_setzero_si128();

    for (size_t z = 0; z < depth; z++)
    {
        for (size_t y = 0; y < height; y++)
        {
            const uint8_t *source =
                priv::OffsetDataPointer<uint8_t>(input, y, z, inputRowPitch, inputDepthPitch);
            uint32_t *dest =
                priv::OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);

            size_t x = 0;

            // Make output writes aligned: copy scalar pixels until &dest[x]
            // reaches a 16-byte boundary (required by _mm_store_si128 below).
            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 0xF) != 0 && x < width); x++)
            {
                dest[x] = static_cast<uint32_t>(source[x]) << 24;
            }

            // Vector path: 8 pixels per iteration. Load 8 alpha bytes, then
            // widen each byte to a 32-bit pixel with alpha in the top byte.
            for (; x + 7 < width; x += 8)
            {
                __m128i sourceData = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&source[x]));
                // Interleave each byte to 16bit, make the lower byte to zero
                sourceData = _mm_unpacklo_epi8(zeroWide, sourceData);
                // Interleave each 16bit to 32bit, make the lower 16bit to zero
                __m128i lo = _mm_unpacklo_epi16(zeroWide, sourceData);
                __m128i hi = _mm_unpackhi_epi16(zeroWide, sourceData);
                _mm_store_si128(reinterpret_cast<__m128i *>(&dest[x]), lo);
                _mm_store_si128(reinterpret_cast<__m128i *>(&dest[x + 4]), hi);
            }

            // Handle the remainder (fewer than 8 pixels) with scalar writes.
            for (; x < width; x++)
            {
                dest[x] = static_cast<uint32_t>(source[x]) << 24;
            }
        }
    }
#else
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#endif
}
// Converts 32-bit RGBA8 pixels to BGRA8 by swapping the red and blue bytes of
// every pixel (green and alpha are preserved unchanged), using SSE2.
//
// width/height/depth give the image extent in pixels; inputRowPitch /
// inputDepthPitch and outputRowPitch / outputDepthPitch are the byte strides
// between consecutive rows and depth slices of input and output respectively.
//
// When ANGLE_USE_SSE is not defined (e.g. ARM builds), the function reports
// UNIMPLEMENTED and returns without writing any output.
void LoadRGBA8ToBGRA8_SSE2(size_t width,
                           size_t height,
                           size_t depth,
                           const uint8_t *input,
                           size_t inputRowPitch,
                           size_t inputDepthPitch,
                           uint8_t *output,
                           size_t outputRowPitch,
                           size_t outputDepthPitch)
{
#if defined(ANGLE_USE_SSE)
    // Per-pixel mask selecting the b and r bytes (bytes 0 and 2 of each
    // 32-bit lane); the complementary bytes are g and a.
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t z = 0; z < depth; z++)
    {
        for (size_t y = 0; y < height; y++)
        {
            const uint32_t *source =
                priv::OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
            uint32_t *dest =
                priv::OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);

            size_t x = 0;

            // Make output writes aligned: scalar-swap pixels until &dest[x]
            // reaches a 16-byte boundary (required by _mm_store_si128 below).
            // ROTL by 16 swaps the two halves of the pixel; masking with
            // 0x00ff00ff keeps only the (now swapped) b/r bytes.
            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (ANGLE_ROTL(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }

            // Vector path: 4 pixels per iteration. Source may be unaligned,
            // hence the loadu; dest is aligned by the loop above.
            for (; x + 3 < width; x += 4)
            {
                __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i *>(&source[x]));
                // Mask out g and a, which don't change
                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
                // Mask out b and r
                __m128i brComponents = _mm_and_si128(sourceData, brMask);
                // Swap b and r
                __m128i brSwapped =
                    _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)),
                                        _MM_SHUFFLE(2, 3, 0, 1));
                __m128i result = _mm_or_si128(gaComponents, brSwapped);
                _mm_store_si128(reinterpret_cast<__m128i *>(&dest[x]), result);
            }

            // Perform leftover writes
            for (; x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (ANGLE_ROTL(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }
        }
    }
#else
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#endif
}
}
...@@ -179,28 +179,6 @@ static InternalFormatInitialzerMap BuildInternalFormatInitialzerMap() ...@@ -179,28 +179,6 @@ static InternalFormatInitialzerMap BuildInternalFormatInitialzerMap()
return map; return map;
} }
// Each GL internal format corresponds to one D3D format and data loading function.
// Due to not all formats being available all the time, some of the function/format types are wrapped
// in templates that perform format support queries on a Renderer9 object which is supplied
// when requesting the function or format.

// Predicate evaluated at load time to choose between two load functions
// (e.g. gl::supportsSSE2 selecting an SSE2 path vs. a scalar fallback).
typedef bool(*FallbackPredicateFunction)();

// Compile-time dispatcher with a LoadImageFunction-compatible signature: calls
// `prefered` when `pred()` is true at runtime, otherwise calls `fallback`.
// All image parameters are forwarded unchanged to the selected function.
template <FallbackPredicateFunction pred, LoadImageFunction prefered, LoadImageFunction fallback>
static void FallbackLoad(size_t width, size_t height, size_t depth,
                         const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                         uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
    if (pred())
    {
        prefered(width, height, depth, input, inputRowPitch, inputDepthPitch, output, outputRowPitch, outputDepthPitch);
    }
    else
    {
        fallback(width, height, depth, input, inputRowPitch, inputDepthPitch, output, outputRowPitch, outputDepthPitch);
    }
}
static void UnreachableLoad(size_t width, size_t height, size_t depth, static void UnreachableLoad(size_t width, size_t height, size_t depth,
const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch, const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch) uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
...@@ -241,6 +219,7 @@ static D3D9FormatMap BuildD3D9FormatMap() ...@@ -241,6 +219,7 @@ static D3D9FormatMap BuildD3D9FormatMap()
D3D9FormatMap map; D3D9FormatMap map;
// clang-format off
// | Internal format | Texture format | Render format | Load function | // | Internal format | Texture format | Render format | Load function |
InsertD3D9FormatInfo(&map, GL_NONE, D3DFMT_NULL, D3DFMT_NULL, UnreachableLoad ); InsertD3D9FormatInfo(&map, GL_NONE, D3DFMT_NULL, D3DFMT_NULL, UnreachableLoad );
...@@ -274,11 +253,11 @@ static D3D9FormatMap BuildD3D9FormatMap() ...@@ -274,11 +253,11 @@ static D3D9FormatMap BuildD3D9FormatMap()
InsertD3D9FormatInfo(&map, GL_LUMINANCE16F_EXT, D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN, LoadL16FToRGBA16F ); InsertD3D9FormatInfo(&map, GL_LUMINANCE16F_EXT, D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN, LoadL16FToRGBA16F );
InsertD3D9FormatInfo(&map, GL_LUMINANCE_ALPHA16F_EXT, D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN, LoadLA16FToRGBA16F ); InsertD3D9FormatInfo(&map, GL_LUMINANCE_ALPHA16F_EXT, D3DFMT_A16B16G16R16F, D3DFMT_UNKNOWN, LoadLA16FToRGBA16F );
InsertD3D9FormatInfo(&map, GL_ALPHA8_EXT, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, FallbackLoad<gl::supportsSSE2, LoadA8ToBGRA8_SSE2, LoadA8ToBGRA8>); InsertD3D9FormatInfo(&map, GL_ALPHA8_EXT, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadA8ToBGRA8 );
InsertD3D9FormatInfo(&map, GL_RGB8_OES, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadRGB8ToBGRX8 ); InsertD3D9FormatInfo(&map, GL_RGB8_OES, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadRGB8ToBGRX8 );
InsertD3D9FormatInfo(&map, GL_RGB565, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadR5G6B5ToBGRA8 ); InsertD3D9FormatInfo(&map, GL_RGB565, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadR5G6B5ToBGRA8 );
InsertD3D9FormatInfo(&map, GL_RGBA8_OES, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, FallbackLoad<gl::supportsSSE2, LoadRGBA8ToBGRA8_SSE2, LoadRGBA8ToBGRA8>); InsertD3D9FormatInfo(&map, GL_RGBA8_OES, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadRGBA8ToBGRA8 );
InsertD3D9FormatInfo(&map, GL_RGBA4, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadRGBA4ToBGRA8 ); InsertD3D9FormatInfo(&map, GL_RGBA4, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadRGBA4ToBGRA8 );
InsertD3D9FormatInfo(&map, GL_RGB5_A1, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadRGB5A1ToBGRA8 ); InsertD3D9FormatInfo(&map, GL_RGB5_A1, D3DFMT_A8R8G8B8, D3DFMT_A8R8G8B8, LoadRGB5A1ToBGRA8 );
InsertD3D9FormatInfo(&map, GL_R8_EXT, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadR8ToBGRX8 ); InsertD3D9FormatInfo(&map, GL_R8_EXT, D3DFMT_X8R8G8B8, D3DFMT_X8R8G8B8, LoadR8ToBGRX8 );
...@@ -297,6 +276,7 @@ static D3D9FormatMap BuildD3D9FormatMap() ...@@ -297,6 +276,7 @@ static D3D9FormatMap BuildD3D9FormatMap()
// then changing the format and loading function appropriately. // then changing the format and loading function appropriately.
InsertD3D9FormatInfo(&map, GL_LUMINANCE8_EXT, D3DFMT_L8, D3DFMT_UNKNOWN, LoadToNative<GLubyte, 1> ); InsertD3D9FormatInfo(&map, GL_LUMINANCE8_EXT, D3DFMT_L8, D3DFMT_UNKNOWN, LoadToNative<GLubyte, 1> );
InsertD3D9FormatInfo(&map, GL_LUMINANCE8_ALPHA8_EXT, D3DFMT_A8L8, D3DFMT_UNKNOWN, LoadToNative<GLubyte, 2> ); InsertD3D9FormatInfo(&map, GL_LUMINANCE8_ALPHA8_EXT, D3DFMT_A8L8, D3DFMT_UNKNOWN, LoadToNative<GLubyte, 2> );
// clang-format on
return map; return map;
} }
......
...@@ -51,7 +51,6 @@ ...@@ -51,7 +51,6 @@
'image_util/loadimage.h', 'image_util/loadimage.h',
'image_util/loadimage.inl', 'image_util/loadimage.inl',
'image_util/loadimage_etc.cpp', 'image_util/loadimage_etc.cpp',
'image_util/loadimage_sse2.cpp',
], ],
'libangle_includes': 'libangle_includes':
[ [
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment