D3D11: Refactor InputLayout cache.

*re-land with fix for cache overflowing* Using a much more compact input layout structure allows us to save quite a bit of time comparing input layouts, or computing hashes. A subsequent patch shrinks the size of the structure further. BUG=angleproject:959 Change-Id: If240bb7c84d78fc8c9fb6f9049bf71d8a81c97c6 Reviewed-on: https://chromium-review.googlesource.com/283227 Reviewed-by: Geoff Lang <geofflang@chromium.org> Tested-by: Jamie Madill <jmadill@chromium.org>

D3D11: Refactor InputLayout cache.
531d5f45 · Jamie Madill · 01e4bfe7 · 531d5f45 · 531d5f45 · 531d5f45
Commit 531d5f45 authored Jul 06, 2015 by Jamie Madill
5 changed files
--- a/src/libANGLE/renderer/d3d/d3d11/InputLayoutCache.cpp
+++ b/src/libANGLE/renderer/d3d/d3d11/InputLayoutCache.cpp
@@ -42,11 +42,44 @@ void GetInputLayout(const TranslatedAttribute *translatedAttributes[gl::MAX_VERT
    }
 }
+const unsigned int kDefaultCacheSize = 1024;
 } // anonymous namespace
-const unsigned int InputLayoutCache::kMaxInputLayouts = 1024;
+bool InputLayoutCache::PackedAttributeComparator::operator()(const PackedAttributeLayout &a,
+                                                             const PackedAttributeLayout &b) const
+{  // "Less than" ordering over packed layouts; used as the comparator of the mLayoutMap std::map.
+    if (a.numAttributes != b.numAttributes)  // Order first by attribute count.
+    {
+        return a.numAttributes < b.numAttributes;
+    }
+    if (a.flags != b.flags)  // Then by the flags bitfield.
+    {
+        return a.flags < b.flags;
+    }
+    for (size_t attribIndex = 0; attribIndex < a.numAttributes; attribIndex++)  // Counts are equal here, so this bound holds for both keys.
+    {
+        const auto &attribA = a.attributeData[attribIndex];
+        const auto &attribB = b.attributeData[attribIndex];
+        if (attribA.glType != attribB.glType)
+            return attribA.glType < attribB.glType;
+        if (attribA.semanticIndex != attribB.semanticIndex)
+            return attribA.semanticIndex < attribB.semanticIndex;
+        if (attribA.dxgiFormat != attribB.dxgiFormat)
+            return attribA.dxgiFormat < attribB.dxgiFormat;
+        if (attribA.divisor != attribB.divisor)
+            return attribA.divisor < attribB.divisor;
+    }
-InputLayoutCache::InputLayoutCache() : mInputLayoutMap(kMaxInputLayouts, hashInputLayout, compareInputLayouts)
+    // Every compared field matches: the keys are equivalent, so neither orders before the other.
+    return false;
+}
+InputLayoutCache::InputLayoutCache()
+    : mCacheSize(kDefaultCacheSize)
 {
    mCounter = 0;
    mDevice = NULL;
@@ -77,11 +110,11 @@ void InputLayoutCache::initialize(ID3D11Device *device, ID3D11DeviceContext *con
 void InputLayoutCache::clear()
 {
-    for (InputLayoutMap::iterator i = mInputLayoutMap.begin(); i != mInputLayoutMap.end(); i++)
+    for (auto &layout : mLayoutMap)
    {
-        SafeRelease(i->second.inputLayout);
+        SafeRelease(layout.second);
    }
-    mInputLayoutMap.clear();
+    mLayoutMap.clear();
    SafeRelease(mPointSpriteVertexBuffer);
    SafeRelease(mPointSpriteIndexBuffer);
    markDirty();
@@ -115,7 +148,10 @@ gl::Error InputLayoutCache::applyVertexBuffers(const std::vector<TranslatedAttri
        return gl::Error(GL_OUT_OF_MEMORY, "Internal input layout cache is not initialized.");
    }
-    InputLayoutKey ilKey = { 0 };
+    InputLayoutKey ilKey;
+    ilKey.elementCount = 0;
+    PackedAttributeLayout layout;
    static const char* semanticName = "TEXCOORD";
@@ -158,6 +194,11 @@ gl::Error InputLayoutCache::applyVertexBuffers(const std::vector<TranslatedAttri
            ilKey.elementCount++;
            nextAvailableInputSlot = i + 1;
+            // elementCount was incremented just above, so the element recorded for this
+            // attribute now lives at index elementCount - 1. Indexing with elementCount would
+            // read the next, not-yet-written slot (ilKey is no longer zero-initialized) and
+            // feed indeterminate data into the cache key.
+            // NOTE(review): assumes glslElementType is filled in earlier in this iteration,
+            // outside the lines visible in this hunk -- confirm.
+            layout.addAttributeData(ilKey.elements[ilKey.elementCount - 1].glslElementType,
+                                    sortedSemanticIndices[i],
+                                    vertexFormatInfo.nativeFormat,
+                                    sortedAttributes[i]->divisor);
        }
    }
@@ -215,20 +256,34 @@ gl::Error InputLayoutCache::applyVertexBuffers(const std::vector<TranslatedAttri
        }
    }
-    ID3D11InputLayout *inputLayout = NULL;
+    if (programUsesInstancedPointSprites)
+    {
+        layout.flags |= PackedAttributeLayout::FLAG_USES_INSTANCED_SPRITES;
+    }
-    InputLayoutMap::iterator keyIter = mInputLayoutMap.find(ilKey);
+    if (moveFirstIndexedIntoSlotZero)
-    if (keyIter != mInputLayoutMap.end())
    {
-        inputLayout = keyIter->second.inputLayout;
+        layout.flags |= PackedAttributeLayout::FLAG_MOVE_FIRST_INDEXED;
-        keyIter->second.lastUsedTime = mCounter++;
+    }
+    if (instancedPointSpritesActive)
+    {
+        layout.flags |= PackedAttributeLayout::FLAG_INSTANCED_SPRITES_ACTIVE;
+    }
+    ID3D11InputLayout *inputLayout = nullptr;
+    auto layoutMapIt = mLayoutMap.find(layout);
+    if (layoutMapIt != mLayoutMap.end())
+    {
+        inputLayout = layoutMapIt->second;
    }
    else
    {
        gl::VertexFormat shaderInputLayout[gl::MAX_VERTEX_ATTRIBS];
        GetInputLayout(sortedAttributes, unsortedAttributes.size(), shaderInputLayout);
-        ShaderExecutableD3D *shader = NULL;
+        ShaderExecutableD3D *shader = nullptr;
        gl::Error error = programD3D->getVertexExecutableForInputLayout(shaderInputLayout, &shader, nullptr);
        if (error.isError())
        {
@@ -249,28 +304,26 @@ gl::Error InputLayoutCache::applyVertexBuffers(const std::vector<TranslatedAttri
            return gl::Error(GL_OUT_OF_MEMORY, "Failed to create internal input layout, HRESULT: 0x%08x", result);
        }
-        if (mInputLayoutMap.size() >= kMaxInputLayouts)
+        if (mLayoutMap.size() >= mCacheSize)
        {
-            TRACE("Overflowed the limit of %u input layouts, removing the least recently used "
+            TRACE("Overflowed the limit of %u input layouts, purging half the cache.", mCacheSize);
-                  "to make room.", kMaxInputLayouts);
-            InputLayoutMap::iterator leastRecentlyUsed = mInputLayoutMap.begin();
+            // Release every second element, walking the map in key order (deterministic, not random).
-            for (InputLayoutMap::iterator i = mInputLayoutMap.begin(); i != mInputLayoutMap.end(); i++)
+            auto it = mLayoutMap.begin();
+            while (it != mLayoutMap.end())
            {
-                if (i->second.lastUsedTime < leastRecentlyUsed->second.lastUsedTime)
+                it++;
+                if (it != mLayoutMap.end())
                {
-                    leastRecentlyUsed = i;
+                    // std::map::erase invalidates the erased iterator, so advance 'it' first and erase the saved copy.
+                    auto eraseIt = it++;
+                    SafeRelease(eraseIt->second);
+                    mLayoutMap.erase(eraseIt);
                }
            }
-            SafeRelease(leastRecentlyUsed->second.inputLayout);
-            mInputLayoutMap.erase(leastRecentlyUsed);
        }
-        InputLayoutCounterPair inputCounterPair;
+        mLayoutMap[layout] = inputLayout;
-        inputCounterPair.inputLayout = inputLayout;
-        inputCounterPair.lastUsedTime = mCounter++;
-        mInputLayoutMap.insert(std::make_pair(ilKey, inputCounterPair));
    }
    if (inputLayout != mCurrentIL)
@@ -433,23 +486,4 @@ gl::Error InputLayoutCache::applyVertexBuffers(const std::vector<TranslatedAttri
    return gl::Error(GL_NO_ERROR);
 }
-std::size_t InputLayoutCache::hashInputLayout(const InputLayoutKey &inputLayout)
-{
-    static const unsigned int seed = 0xDEADBEEF;
-    std::size_t hash = 0;
-    MurmurHash3_x86_32(inputLayout.begin(), static_cast<int>(inputLayout.end() - inputLayout.begin()), seed, &hash);
-    return hash;
-}
-bool InputLayoutCache::compareInputLayouts(const InputLayoutKey &a, const InputLayoutKey &b)
-{
-    if (a.elementCount != b.elementCount)
-    {
-        return false;
-    }
-    return std::equal(a.begin(), a.end(), b.begin());
-}
 }
--- a/src/libANGLE/renderer/d3d/d3d11/InputLayoutCache.h
+++ b/src/libANGLE/renderer/d3d/d3d11/InputLayoutCache.h
@@ -17,6 +17,7 @@
 #include <GLES2/gl2.h>
 #include <cstddef>
+#include <map>
 #include <unordered_map>
 namespace gl
@@ -43,6 +44,9 @@ class InputLayoutCache : angle::NonCopyable
    gl::Error applyVertexBuffers(const std::vector<TranslatedAttribute> &attributes,
                                 GLenum mode, gl::Program *program, SourceIndexData *sourceInfo);
+    // Useful for testing
+    void setCacheSize(unsigned int cacheSize) { mCacheSize = cacheSize; }
  private:
    struct InputLayoutElement
    {
@@ -66,12 +70,53 @@ class InputLayoutCache : angle::NonCopyable
        }
    };
-    struct InputLayoutCounterPair
+    struct PackedAttributeLayout  // Compact description of the bound attributes; key type of mLayoutMap.
    {
-        ID3D11InputLayout *inputLayout;
+        PackedAttributeLayout()
-        unsigned long long lastUsedTime;
+            : numAttributes(0),
+              flags(0)
+        {
+        }
+        void addAttributeData(GLenum glType,
+                              UINT semanticIndex,
+                              DXGI_FORMAT dxgiFormat,
+                              unsigned int divisor)
+        {
+            attributeData[numAttributes].glType = glType;  // NOTE(review): no bounds check against gl::MAX_VERTEX_ATTRIBS before this write.
+            attributeData[numAttributes].semanticIndex = semanticIndex;
+            attributeData[numAttributes].dxgiFormat = dxgiFormat;
+            attributeData[numAttributes].divisor = divisor;
+            ++numAttributes;  // Appends: the next call fills the following slot.
+        }
+        struct PackedAttribute
+        {
+            GLenum glType;
+            UINT semanticIndex;
+            DXGI_FORMAT dxgiFormat;
+            unsigned int divisor;
+        };
+        enum Flags
+        {
+            FLAG_USES_INSTANCED_SPRITES = 0x1,
+            FLAG_MOVE_FIRST_INDEXED = 0x2,
+            FLAG_INSTANCED_SPRITES_ACTIVE = 0x4,
+        };
+        size_t numAttributes;  // Number of entries of attributeData filled in so far.
+        unsigned int flags;  // Bitwise OR of Flags values.
+        PackedAttribute attributeData[gl::MAX_VERTEX_ATTRIBS];  // Only the first numAttributes entries are written by addAttributeData.
+    };
+    struct PackedAttributeComparator
+    {
+        bool operator()(const PackedAttributeLayout &a, const PackedAttributeLayout &b) const;
    };
+    std::map<PackedAttributeLayout, ID3D11InputLayout *, PackedAttributeComparator> mLayoutMap;
    ID3D11InputLayout *mCurrentIL;
    ID3D11Buffer *mCurrentBuffers[gl::MAX_VERTEX_ATTRIBS];
    UINT mCurrentVertexStrides[gl::MAX_VERTEX_ATTRIBS];
@@ -80,19 +125,7 @@ class InputLayoutCache : angle::NonCopyable
    ID3D11Buffer *mPointSpriteVertexBuffer;
    ID3D11Buffer *mPointSpriteIndexBuffer;
-    static std::size_t hashInputLayout(const InputLayoutKey &inputLayout);
+    unsigned int mCacheSize;
-    static bool compareInputLayouts(const InputLayoutKey &a, const InputLayoutKey &b);
-    typedef std::size_t (*InputLayoutHashFunction)(const InputLayoutKey &);
-    typedef bool (*InputLayoutEqualityFunction)(const InputLayoutKey &, const InputLayoutKey &);
-    typedef std::unordered_map<InputLayoutKey,
-                               InputLayoutCounterPair,
-                               InputLayoutHashFunction,
-                               InputLayoutEqualityFunction> InputLayoutMap;
-    InputLayoutMap mInputLayoutMap;
-    static const unsigned int kMaxInputLayouts;
    unsigned long long mCounter;
    ID3D11Device *mDevice;

--- a/src/libANGLE/renderer/d3d/d3d11/Renderer11.h
+++ b/src/libANGLE/renderer/d3d/d3d11/Renderer11.h
@@ -264,6 +264,7 @@ class Renderer11 : public RendererD3D
    const Renderer11DeviceCaps &getRenderer11DeviceCaps() { return mRenderer11DeviceCaps; };
    RendererClass getRendererClass() const override { return RENDERER_D3D11; }
+    InputLayoutCache *getInputLayoutCache() { return &mInputLayoutCache; }
  protected:
    void createAnnotator() override;

--- a/src/tests/angle_end2end_tests.gypi
+++ b/src/tests/angle_end2end_tests.gypi
@@ -69,6 +69,7 @@
        [
            '<(angle_path)/src/tests/gl_tests/D3D11EmulatedIndexedBufferTest.cpp',
            '<(angle_path)/src/tests/gl_tests/D3D11FormatTablesTest.cpp',
+            '<(angle_path)/src/tests/gl_tests/D3D11InputLayoutCacheTest.cpp',
            '<(angle_path)/src/tests/gl_tests/QueryDisplayAttribTest.cpp',
            # TODO(cwallez) for Linux, requires a portable implementation of threads
            '<(angle_path)/src/tests/egl_tests/EGLThreadTest.cpp',

--- a/src/tests/gl_tests/D3D11InputLayoutCacheTest.cpp
+++ b/src/tests/gl_tests/D3D11InputLayoutCacheTest.cpp
+//
+// Copyright 2015 The ANGLE Project Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+// D3D11InputLayoutCacheTest:
+//   Stress test to reproduce a bug where we weren't flushing the cache correctly.
+//
+#include <sstream>
+#include "libANGLE/Context.h"
+#include "libANGLE/renderer/d3d/d3d11/Renderer11.h"
+#include "test_utils/ANGLETest.h"
+#include "test_utils/angle_test_instantiate.h"
+using namespace angle;
+namespace
+{
+class D3D11InputLayoutCacheTest : public ANGLETest
+{
+  protected:
+    GLuint makeProgramWithAttribCount(unsigned int attribCount)  // Builds a program with a vec2 'position' plus attribCount float attributes a0, a1, ...
+    {
+        std::stringstream strstr;
+        strstr << "attribute vec2 position;" << std::endl;
+        for (unsigned int attribIndex = 0; attribIndex < attribCount; ++attribIndex)
+        {
+            strstr << "attribute float a" << attribIndex << ";" << std::endl;
+        }
+        strstr << "varying float v;" << std::endl
+               << "void main() {" << std::endl
+               << "    v = 0.0;" << std::endl;
+        for (unsigned int attribIndex = 0; attribIndex < attribCount; ++attribIndex)
+        {
+            strstr << "    v += a" << attribIndex << ";" << std::endl;  // The vertex shader sums every extra attribute into varying v.
+        }
+        strstr << "    gl_Position = vec4(position, 0.0, 1.0);" << std::endl
+               << "}" << std::endl;
+        const std::string basicFragmentShader =  // Writes v / 255.0 to the red channel.
+            "varying highp float v;\n"
+            "void main() {"
+            "   gl_FragColor = vec4(v / 255.0, 0.0, 0.0, 1.0);\n"
+            "}\n";
+        return CompileProgram(strstr.str(), basicFragmentShader);
+    }
+};
+// Stress the cache by setting a small cache size and drawing with a bunch of shaders
+// with different input signatures.
+TEST_P(D3D11InputLayoutCacheTest, StressTest)
+{
+    // Reach into ANGLE internals: reinterpret the EGL context handle as a gl::Context to get at the D3D11 input layout cache.
+    gl::Context *context = reinterpret_cast<gl::Context *>(getEGLWindow()->getContext());
+    rx::Renderer11 *renderer11 = rx::GetAs<rx::Renderer11>(context->getRenderer());
+    rx::InputLayoutCache *inputLayoutCache = renderer11->getInputLayoutCache();
+    // Shrink the cache to 4 entries so that it overflows quickly.
+    inputLayoutCache->setCacheSize(4);
+    GLint maxAttribs = 0;
+    context->getIntegerv(GL_MAX_VERTEX_ATTRIBS, &maxAttribs);
+    // Each program uses 'position' plus up to maxInputs float attribs. NOTE(review): subtracting 2 leaves one slot spare beyond position -- confirm intent.
+    unsigned int maxInputs = static_cast<unsigned int>(maxAttribs) - 2;
+    std::vector<GLuint> programs;
+    for (unsigned int attribCount = 0; attribCount <= maxInputs; ++attribCount)
+    {
+        GLuint program = makeProgramWithAttribCount(attribCount);
+        ASSERT_NE(0u, program);
+        programs.push_back(program);
+    }
+    // Iteratively do a simple draw operation, trying every attribute count from 0..maxInputs.
+    // This should thrash the cache.
+    for (unsigned int iterationCount = 0; iterationCount < 10; ++iterationCount)
+    {
+        ASSERT_GL_NO_ERROR();
+        for (unsigned int attribCount = 0; attribCount <= maxInputs; ++attribCount)
+        {
+            GLuint program = programs[attribCount];
+            glUseProgram(program);
+            for (unsigned int attribIndex = 0; attribIndex < attribCount; ++attribIndex)
+            {
+                std::stringstream attribNameStr;
+                attribNameStr << "a" << attribIndex;
+                std::string attribName = attribNameStr.str();
+                GLint location = glGetAttribLocation(program, attribName.c_str());
+                ASSERT_NE(-1, location);
+                glVertexAttrib1f(location, 1.0f);  // Constant value 1.0 for this attribute; the array is disabled on the next line.
+                glDisableVertexAttribArray(location);
+            }
+            drawQuad(program, "position", 0.5f);
+            EXPECT_PIXEL_EQ(0, 0, attribCount, 0, 0, 255u);  // Expected red == attribCount: attribCount attribs of 1.0 summed, written as v / 255.
+        }
+    }
+    for (GLuint program : programs)
+    {
+        glDeleteProgram(program);
+    }
+}
+ANGLE_INSTANTIATE_TEST(D3D11InputLayoutCacheTest, ES2_D3D11());
+}  // anonymous namespace