Implement SSE2 version of loadAlphaData

http://codereview.appspot.com/6050054/ Signed-off-by: Nicolas Capens Signed-off-by: Daniel Koch Authored-by: Jin Yang With this patch, my HTML5 2D canvas benchmark running in Chromium on Windows 7 with GPU acceleration enabled is about 4% faster, even though most of the execution time is spent on the GPU. git-svn-id: https://angleproject.googlecode.com/svn/trunk@1067 736b8ea6-26fd-11df-bfd4-992fa37f6226
parent 98eec912
...@@ -48,4 +48,5 @@ Mark Callow ...@@ -48,4 +48,5 @@ Mark Callow
Yuriy O'Donnell Yuriy O'Donnell
Sam Hocevar Sam Hocevar
Pierre Leveille Pierre Leveille
Jin Yang
#define MAJOR_VERSION 1 #define MAJOR_VERSION 1
#define MINOR_VERSION 0 #define MINOR_VERSION 0
#define BUILD_VERSION 0 #define BUILD_VERSION 0
#define BUILD_REVISION 1064 #define BUILD_REVISION 1067
#define STRINGIFY(x) #x #define STRINGIFY(x) #x
#define MACRO_STRINGIFY(x) STRINGIFY(x) #define MACRO_STRINGIFY(x) STRINGIFY(x)
......
...@@ -306,7 +306,14 @@ void Image::loadData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height ...@@ -306,7 +306,14 @@ void Image::loadData(GLint xoffset, GLint yoffset, GLsizei width, GLsizei height
switch (mFormat) switch (mFormat)
{ {
case GL_ALPHA: case GL_ALPHA:
loadAlphaData(width, height, inputPitch, input, locked.Pitch, locked.pBits); if (supportsSSE2())
{
loadAlphaDataSSE2(width, height, inputPitch, input, locked.Pitch, locked.pBits);
}
else
{
loadAlphaData(width, height, inputPitch, input, locked.Pitch, locked.pBits);
}
break; break;
case GL_LUMINANCE: case GL_LUMINANCE:
loadLuminanceData(width, height, inputPitch, input, locked.Pitch, locked.pBits, getD3DFormat() == D3DFMT_L8); loadLuminanceData(width, height, inputPitch, input, locked.Pitch, locked.pBits, getD3DFormat() == D3DFMT_L8);
...@@ -430,6 +437,46 @@ void Image::loadAlphaData(GLsizei width, GLsizei height, ...@@ -430,6 +437,46 @@ void Image::loadAlphaData(GLsizei width, GLsizei height,
} }
} }
// Expands 8-bit alpha texels into 32-bit texels using SSE2 intrinsics.
//
// Each source byte A is written as the 32-bit value (A << 24); the low 24 bits
// of every output texel are zero. Rows are located with the byte pitches
// `inputPitch` (source) and `outputPitch` (destination).
//
// Every row is processed in three phases:
//   1. one texel at a time until the destination address is 16-byte aligned,
//   2. eight texels at a time using aligned 128-bit stores,
//   3. one texel at a time for whatever is left.
void Image::loadAlphaDataSSE2(GLsizei width, GLsizei height,
                              int inputPitch, const void *input, size_t outputPitch, void *output) const
{
    const unsigned char *const srcBytes = static_cast<const unsigned char*>(input);
    unsigned char *const dstBytes = static_cast<unsigned char*>(output);
    const __m128i zeros = _mm_setzero_si128();

    for (int row = 0; row < height; ++row)
    {
        const unsigned char *const srcRow = srcBytes + row * inputPitch;
        unsigned int *const dstRow = reinterpret_cast<unsigned int*>(dstBytes + row * outputPitch);

        int col = 0;

        // Phase 1: scalar writes until the destination is 16-byte aligned,
        // which _mm_store_si128 below requires.
        while (col < width && (reinterpret_cast<intptr_t>(dstRow + col) & 0xF) != 0)
        {
            dstRow[col] = static_cast<unsigned int>(srcRow[col]) << 24;
            ++col;
        }

        // Phase 2: convert eight texels per iteration.
        while (col + 7 < width)
        {
            // Fetch eight alpha bytes into the low 64 bits (no alignment needed).
            const __m128i alphaBytes = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(srcRow + col));

            // Pair each byte with a zero byte below it: 16-bit lanes hold (A << 8).
            const __m128i alphaWords = _mm_unpacklo_epi8(zeros, alphaBytes);

            // Pair each 16-bit lane with a zero lane below it: 32-bit lanes hold (A << 24).
            const __m128i firstFour = _mm_unpacklo_epi16(zeros, alphaWords);
            const __m128i lastFour = _mm_unpackhi_epi16(zeros, alphaWords);

            _mm_store_si128(reinterpret_cast<__m128i*>(dstRow + col), firstFour);
            _mm_store_si128(reinterpret_cast<__m128i*>(dstRow + col + 4), lastFour);

            col += 8;
        }

        // Phase 3: fewer than eight texels remain; finish them one by one.
        while (col < width)
        {
            dstRow[col] = static_cast<unsigned int>(srcRow[col]) << 24;
            ++col;
        }
    }
}
void Image::loadAlphaFloatData(GLsizei width, GLsizei height, void Image::loadAlphaFloatData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const int inputPitch, const void *input, size_t outputPitch, void *output) const
{ {
...@@ -3066,4 +3113,4 @@ TextureStorage *TextureCubeMap::getStorage(bool renderTarget) ...@@ -3066,4 +3113,4 @@ TextureStorage *TextureCubeMap::getStorage(bool renderTarget)
return mTexStorage; return mTexStorage;
} }
} }
\ No newline at end of file
...@@ -71,6 +71,8 @@ class Image ...@@ -71,6 +71,8 @@ class Image
void loadAlphaData(GLsizei width, GLsizei height, void loadAlphaData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const; int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadAlphaDataSSE2(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadAlphaFloatData(GLsizei width, GLsizei height, void loadAlphaFloatData(GLsizei width, GLsizei height,
int inputPitch, const void *input, size_t outputPitch, void *output) const; int inputPitch, const void *input, size_t outputPitch, void *output) const;
void loadAlphaHalfFloatData(GLsizei width, GLsizei height, void loadAlphaHalfFloatData(GLsizei width, GLsizei height,
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment