Commit f1f28c80 by jbauman@chromium.org

Use SSE2 to swizzle RGBA to BGRA

Using SSE2 can drastically reduce the amount of time it takes to do glTexImage2D. I've also added a plain-C path that's much faster than the one that was there before.

BUG=151
TEST=
Review URL: http://codereview.appspot.com/4465052
git-svn-id: https://angleproject.googlecode.com/svn/trunk@649 736b8ea6-26fd-11df-bfd4-992fa37f6226
parent 73bec982
#define MAJOR_VERSION 0 #define MAJOR_VERSION 0
#define MINOR_VERSION 0 #define MINOR_VERSION 0
#define BUILD_VERSION 0 #define BUILD_VERSION 0
#define BUILD_REVISION 648 #define BUILD_REVISION 649
#define STRINGIFY(x) #x #define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x) #define MACRO_STRINGIFY(x) STRINGIFY(x)
......
...@@ -13,6 +13,7 @@ ...@@ -13,6 +13,7 @@
#include <d3dx9tex.h> #include <d3dx9tex.h>
#include <algorithm> #include <algorithm>
#include <intrin.h>
#include "common/debug.h" #include "common/debug.h"
...@@ -252,7 +253,14 @@ void Texture::loadImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei ...@@ -252,7 +253,14 @@ void Texture::loadImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei
loadRGBUByteImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output); loadRGBUByteImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output);
break; break;
case GL_RGBA: case GL_RGBA:
loadRGBAUByteImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output); if (supportsSSE2())
{
loadRGBAUByteImageDataSSE2(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output);
}
else
{
loadRGBAUByteImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output);
}
break; break;
case GL_BGRA_EXT: case GL_BGRA_EXT:
loadBGRAImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output); loadBGRAImageData(xoffset, yoffset, width, height, inputPitch, input, outputPitch, output);
...@@ -614,22 +622,62 @@ void Texture::loadRGBHalfFloatImageData(GLint xoffset, GLint yoffset, GLsizei wi ...@@ -614,22 +622,62 @@ void Texture::loadRGBHalfFloatImageData(GLint xoffset, GLint yoffset, GLsizei wi
} }
} }
// Copies 'height' rows of 'width' 32-bit pixels from 'input' to 'output',
// exchanging bytes 0 and 2 of every pixel (the r and b channels) while leaving
// bytes 1 and 3 (g and a) untouched, using SSE2 intrinsics for the bulk of
// each row.
//
// Source rows start at 'input' and are 'inputPitch' bytes apart. Destination
// rows are 'outputPitch' bytes apart; writing begins at row 'yoffset' and
// pixel column 'xoffset' of 'output'.
void Texture::loadRGBAUByteImageDataSSE2(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
                                         int inputPitch, const void *input, size_t outputPitch, void *output) const
{
    const unsigned char *sourceBytes = static_cast<const unsigned char*>(input);
    unsigned char *destBytes = static_cast<unsigned char*>(output);

    // Selects bytes 0 and 2 (r and b) of each of the four pixels in a vector.
    const __m128i redBlueMask = _mm_set1_epi32(0x00ff00ff);

    for (int row = 0; row < height; row++)
    {
        const unsigned int *sourceRow = reinterpret_cast<const unsigned int*>(sourceBytes + row * inputPitch);
        unsigned int *destRow = reinterpret_cast<unsigned int*>(destBytes + (row + yoffset) * outputPitch + xoffset * 4);

        int column = 0;

        // Head: convert one pixel at a time until the destination address is
        // 16-byte aligned, so the vector loop below can use aligned stores.
        while (column < width && (reinterpret_cast<intptr_t>(destRow + column) & 15) != 0)
        {
            const unsigned int pixel = sourceRow[column];
            const unsigned int halvesSwapped = (pixel << 16) | (pixel >> 16);   // rotate by 16 bits
            destRow[column] = (pixel & 0xff00ff00) | (halvesSwapped & 0x00ff00ff);
            column++;
        }

        // Body: four pixels per iteration. The source may be unaligned, so it
        // is read with an unaligned load; the destination is aligned by now.
        while (column + 3 < width)
        {
            const __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(sourceRow + column));

            // Keep g and a exactly where they are.
            const __m128i greenAlpha = _mm_andnot_si128(redBlueMask, pixels);

            // Isolate r and b; each now occupies its own 16-bit lane.
            const __m128i redBlue = _mm_and_si128(pixels, redBlueMask);

            // Swap neighbouring 16-bit lanes, first in the low 64 bits and then
            // in the high 64 bits, which exchanges r and b within every pixel.
            const __m128i lowSwapped = _mm_shufflelo_epi16(redBlue, _MM_SHUFFLE(2, 3, 0, 1));
            const __m128i blueRed = _mm_shufflehi_epi16(lowSwapped, _MM_SHUFFLE(2, 3, 0, 1));

            _mm_store_si128(reinterpret_cast<__m128i*>(destRow + column), _mm_or_si128(greenAlpha, blueRed));
            column += 4;
        }

        // Tail: fewer than four pixels remain; finish them one at a time.
        while (column < width)
        {
            const unsigned int pixel = sourceRow[column];
            const unsigned int halvesSwapped = (pixel << 16) | (pixel >> 16);   // rotate by 16 bits
            destRow[column] = (pixel & 0xff00ff00) | (halvesSwapped & 0x00ff00ff);
            column++;
        }
    }
}
void Texture::loadRGBAUByteImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, void Texture::loadRGBAUByteImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const int inputPitch, const void *input, size_t outputPitch, void *output) const
{ {
const unsigned char *source = NULL; const unsigned int *source = NULL;
unsigned char *dest = NULL; unsigned int *dest = NULL;
for (int y = 0; y < height; y++) for (int y = 0; y < height; y++)
{ {
source = static_cast<const unsigned char*>(input) + y * inputPitch; source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputPitch);
dest = static_cast<unsigned char*>(output) + (y + yoffset) * outputPitch + xoffset * 4; dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + (y + yoffset) * outputPitch + xoffset * 4);
for (int x = 0; x < width; x++) for (int x = 0; x < width; x++)
{ {
dest[4 * x + 0] = source[x * 4 + 2]; unsigned int rgba = source[x];
dest[4 * x + 1] = source[x * 4 + 1]; dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
dest[4 * x + 2] = source[x * 4 + 0];
dest[4 * x + 3] = source[x * 4 + 3];
} }
} }
} }
......
...@@ -169,6 +169,8 @@ class Texture : public RefCountObject ...@@ -169,6 +169,8 @@ class Texture : public RefCountObject
int inputPitch, const void *input, size_t outputPitch, void *output) const; int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadRGBHalfFloatImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, void loadRGBHalfFloatImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const; int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadRGBAUByteImageDataSSE2(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadRGBAUByteImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, void loadRGBAUByteImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const; int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadRGBA4444ImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height, void loadRGBA4444ImageData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height,
......
...@@ -9,6 +9,7 @@ ...@@ -9,6 +9,7 @@
#ifndef LIBGLESV2_MATHUTIL_H_ #ifndef LIBGLESV2_MATHUTIL_H_
#define LIBGLESV2_MATHUTIL_H_ #define LIBGLESV2_MATHUTIL_H_
#include <intrin.h>
#include <math.h> #include <math.h>
#include <windows.h> #include <windows.h>
...@@ -88,6 +89,31 @@ inline GLenum adjustWinding(GLenum winding) ...@@ -88,6 +89,31 @@ inline GLenum adjustWinding(GLenum winding)
ASSERT(winding == GL_CW || winding == GL_CCW); ASSERT(winding == GL_CW || winding == GL_CCW);
return winding == GL_CW ? GL_CCW : GL_CW; return winding == GL_CW ? GL_CCW : GL_CW;
} }
// Returns true when CPUID reports SSE2 support (bit 26 of the fourth register
// returned by CPUID function 1). The CPU is queried on the first call only;
// the answer is remembered in function-local statics and returned directly on
// later calls.
inline bool supportsSSE2()
{
    static bool queried = false;
    static bool hasSSE2 = false;

    if (!queried)
    {
        int registers[4];

        // Function 0 reports the highest supported CPUID function in the
        // first register; function 1 is only queried if it is available.
        __cpuid(registers, 0);
        const int highestFunction = registers[0];

        if (highestFunction >= 1)
        {
            __cpuid(registers, 1);
            hasSSE2 = ((registers[3] >> 26) & 1) != 0;
        }

        queried = true;
    }

    return hasSSE2;
}
} }
#endif // LIBGLESV2_MATHUTIL_H_ #endif // LIBGLESV2_MATHUTIL_H_
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment