Micro-optimize math in IndexDataManager

Use bitwise operations instead of division, which is expensive on multiple CPU architectures.

BUG=angleproject:956
TEST=angle_end2end_tests

Change-Id: I57ab540d447c03dae5a96bafb4975fc37e310261
Reviewed-on: https://chromium-review.googlesource.com/262181
Tested-by: Olli Etuaho <oetuaho@nvidia.com>
Reviewed-by: Nicolas Capens <capn@chromium.org>
Tested-by: Jamie Madill <jmadill@chromium.org>

Micro-optimize math in IndexDataManager
11ffe1b8 · Olli Etuaho · 5e5c826c · 11ffe1b8 · 11ffe1b8 · 11ffe1b8
Commit 11ffe1b8 authored Mar 24, 2015 by Olli Etuaho
Show whitespace changes
Inline Side-by-side

Showing with 23 additions and 7 deletions

formatutils.cpp src/libANGLE/formatutils.cpp +8 -0

formatutils.h src/libANGLE/formatutils.h +1 -0

IndexDataManager.cpp src/libANGLE/renderer/d3d/IndexDataManager.cpp +14 -7

No files found.
--- a/src/libANGLE/formatutils.cpp
+++ b/src/libANGLE/formatutils.cpp
@@ -132,6 +132,7 @@ FormatMap BuildFormatMap()

 Type::Type()
    : bytes(0),
+      bytesShift(0),
      specialInterpretation(false)
 {
 }
@@ -140,6 +141,13 @@ static Type GenTypeInfo(GLuint bytes, bool specialInterpretation)
 {
    Type info;
    info.bytes = bytes;
+    GLuint i = 0;
+    while ((1u << i) < bytes)
+    {
+        ++i;
+    }
+    info.bytesShift = i;
+    ASSERT((1u << info.bytesShift) == bytes);
    info.specialInterpretation = specialInterpretation;
    return info;
 }

--- a/src/libANGLE/formatutils.h
+++ b/src/libANGLE/formatutils.h
@@ -25,6 +25,7 @@ struct Type
    Type();

    GLuint bytes;
+    GLuint bytesShift; // Bit shift by this value to effectively divide/multiply by "bytes" in a more optimal way
    bool specialInterpretation;
 };
 const Type &GetTypeInfo(GLenum type);

--- a/src/libANGLE/renderer/d3d/IndexDataManager.cpp
+++ b/src/libANGLE/renderer/d3d/IndexDataManager.cpp
@@ -86,6 +86,8 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf

        storage = GetImplAs<BufferD3D>(buffer);

+        // We'll trust that the compiler will optimize the % below:
+        // the operands are unsigned and the divisor is a constant.
        switch (type)
        {
          case GL_UNSIGNED_BYTE:  alignedOffset = (offset % sizeof(GLubyte) == 0);  break;
@@ -127,7 +129,8 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf

        if (!staticBuffer->getIndexRangeCache()->findRange(type, offset, count, NULL, &streamOffset))
        {
-            streamOffset = (offset / typeInfo.bytes) * gl::GetTypeInfo(destinationIndexType).bytes;
+            // Using bit-shift here is faster than using division.
+            streamOffset = (offset >> typeInfo.bytesShift) << gl::GetTypeInfo(destinationIndexType).bytesShift;
            staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
        }
        if (!buffer->getIndexRangeCache()->findRange(type, offset, count, nullptr, nullptr))
@@ -162,7 +165,8 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf
            if (staticBuffer->getBufferSize() == 0 && alignedOffset)
            {
                indexBuffer = staticBuffer;
-                convertCount = storage->getSize() / typeInfo.bytes;
+                // Using bit-shift here is faster than using division.
+                convertCount = storage->getSize() >> typeInfo.bytesShift;
            }
            else
            {
@@ -173,13 +177,14 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf

        ASSERT(indexBuffer);

-        if (convertCount > std::numeric_limits<unsigned int>::max() / destTypeInfo.bytes)
+        // Using bit-shift here is faster than using division.
+        if (convertCount > (std::numeric_limits<unsigned int>::max() >> destTypeInfo.bytesShift))
        {
            return gl::Error(GL_OUT_OF_MEMORY, "Reserving %u indices of %u bytes each exceeds the maximum buffer size.",
                             convertCount, destTypeInfo.bytes);
        }

-        unsigned int bufferSizeRequired = convertCount * destTypeInfo.bytes;
+        unsigned int bufferSizeRequired = convertCount << destTypeInfo.bytesShift;
        error = indexBuffer->reserveBufferSpace(bufferSizeRequired, type);
        if (error.isError())
        {
@@ -212,7 +217,8 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf

        if (staticBuffer)
        {
-            streamOffset = (offset / typeInfo.bytes) * destTypeInfo.bytes;
+            // Using bit-shift here is faster than using division.
+            streamOffset = (offset >> typeInfo.bytesShift) << destTypeInfo.bytesShift;
            staticBuffer->getIndexRangeCache()->addRange(type, offset, count, translated->indexRange, streamOffset);
        }
    }
@@ -220,13 +226,14 @@ gl::Error IndexDataManager::prepareIndexData(GLenum type, GLsizei count, gl::Buf
    translated->storage = directStorage ? storage : NULL;
    translated->indexBuffer = indexBuffer ? indexBuffer->getIndexBuffer() : NULL;
    translated->serial = directStorage ? storage->getSerial() : indexBuffer->getSerial();
-    translated->startIndex = streamOffset / destTypeInfo.bytes;
+    // Using bit-shift here is faster than using division.
+    translated->startIndex = (streamOffset >> destTypeInfo.bytesShift);
    translated->startOffset = streamOffset;
    translated->indexType = destinationIndexType;

    if (storage)
    {
-        storage->promoteStaticUsage(count * typeInfo.bytes);
+        storage->promoteStaticUsage(count << typeInfo.bytesShift);
    }

    return gl::Error(GL_NO_ERROR);