Metal: autogen for 3D texture's mipmap generating shader.

Bug: angleproject:4921 Bug: angleproject:2634 Change-Id: I5c379d750114e2ca1c5dd0203e94bb63dac1e0bf Reviewed-on: https://chromium-review.googlesource.com/c/angle/angle/+/2336125 Commit-Queue: Le Hoang Quyen <le.hoang.q@gmail.com> Reviewed-by: Jamie Madill <jmadill@chromium.org> Reviewed-by: Jonah Ryan-Davis <jonahr@google.com>

Metal: autogen for 3D texture's mipmap generating shader.
7ce9947d · Le Hoang Quyen · Commit Bot · 09b5e6b1 · 7ce9947d · 7ce9947d
Commit 7ce9947d authored Aug 04, 2020 by Le Hoang Quyen Committed by Commit Bot Aug 24, 2020
17 changed files
--- a/scripts/code_generation_hashes/Metal_default_shaders.json
+++ b/scripts/code_generation_hashes/Metal_default_shaders.json
@@ -4,37 +4,39 @@
  "src/libANGLE/renderer/metal/shaders/clear.metal":
    "67da9886363c530132e5bc2199bab2db",
  "src/libANGLE/renderer/metal/shaders/common.h":
-    "5888cfe052e6e6332e8c7c8949e888fb",
+    "4260719e5f35107db1029d2fe6e1a732",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_autogen.inc":
-    "39f2302c254c8490b0f5b4782355a9e4",
+    "56c8d0decc672ce442ec4d179d889a90",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_sim_autogen.inc":
-    "ab9be1624a4f15fc12647771727302b0",
+    "4b582bf07c3729a99a64aaf4016ecdd4",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_mac_autogen.inc":
-    "35f17267fa21f1f96d8a781d45485e1a",
+    "0088f4968365c1297e67f53732245401",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_autogen.inc":
-    "81c4b4d1012dbcb7aa738f49bbed1645",
+    "958a164f2ceeae4e4ff04e0225139b7d",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_sim_autogen.inc":
-    "3446979c2d6504cb421e1f0004f5f15f",
+    "bc5cfe5f1d0c27e771965f426cdb3b8d",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_mac_autogen.inc":
-    "7839cf16e8e45e6dffa9454dd5d7d3dc",
+    "642c73f07c80a3e776f050bc330de3d9",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_autogen.inc":
-    "82ccf14797364f2c3c5dee14227b2c24",
+    "b37f61dfcbbed927d631a97ed2733382",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_sim_autogen.inc":
-    "28f453c423b4029ab876668e4f1e8b21",
+    "17937de14c1146f972184d67fe17b670",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_mac_autogen.inc":
-    "3f402a605d97be11664ee0bb642e616a",
+    "e819d0a8fb87db32e84946319b394524",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_autogen.inc":
-    "52c9275b8582f9e7c4a6b0b77c857a72",
+    "d7d307d96b4cb4ce970d2b6438dfd3f4",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_sim_autogen.inc":
-    "259bdeb1bc17ab5098e2be4032f5d608",
+    "1943dad6dd5b28630ff1992fcef75da9",
  "src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_mac_autogen.inc":
-    "e16bd5da4f614789be9e2545f2af4237",
+    "a46aee7517016919d468bb780fd33f6a",
  "src/libANGLE/renderer/metal/shaders/compiled/mtl_default_shaders_autogen.inc":
    "634a127f4e94f6bc3123e89850d010ee",
  "src/libANGLE/renderer/metal/shaders/constants.h":
-    "9bb6e63bf2b48a7a56978c787bde4850",
+    "dad1a869a1095be669b7da5651901d38",
  "src/libANGLE/renderer/metal/shaders/gen_indices.metal":
    "87a76d5e12825111c0595f69e79f5d20",
+  "src/libANGLE/renderer/metal/shaders/gen_mipmap.metal":
+    "54dca94c48bead446624079070b9b309",
  "src/libANGLE/renderer/metal/shaders/gen_mtl_internal_shaders.py":
-    "962d0f3229d91ab71ad555f9fa3fe0c3"
+    "566ddbcfb95ddc845da9ab94eed8640b"
 }
\ No newline at end of file
--- a/src/libANGLE/renderer/metal/shaders/common.h
+++ b/src/libANGLE/renderer/metal/shaders/common.h
@@ -107,6 +107,33 @@ static inline vec<T, 4> resolveTextureMS(texture2d_ms<T> srcTexture, uint2 coord
    return output;
 }
+// Converts the RGB channels of an sRGB-encoded color to linear values.
+// The alpha channel is passed through unchanged.
+static inline float4 sRGBtoLinear(float4 color)
+{
+    float3 srgb = color.rgb;
+    // Result of the linear segment of the transfer curve.
+    float3 lowSegment = srgb / 12.92;
+    // Result of the power segment of the transfer curve.
+    float3 highSegment = pow((srgb + float3(0.055)) / 1.055, 2.4);
+    // 1.0 for channels at or below the 0.04045 threshold, 0.0 otherwise.
+    float3 useLow = float3(srgb <= float3(0.04045));
+    // Arithmetic per-channel blend between the two segments.
+    float3 rgb = useLow * lowSegment + float3(1.0 - useLow) * highSegment;
+    return float4(rgb, color.a);
+}
+// Encodes a single linear channel value as sRGB.
+// Values <= 0 map to 0 and values >= 1 (or that fail every comparison) map to 1.
+static inline float linearToSRGB(float color)
+{
+    if (color <= 0.0f)
+    {
+        return 0.0f;
+    }
+    // Linear segment of the transfer curve.
+    if (color < 0.0031308f)
+    {
+        return 12.92f * color;
+    }
+    // Power segment of the transfer curve.
+    if (color < 1.0f)
+    {
+        return 1.055f * pow(color, 0.41666f) - 0.055f;
+    }
+    return 1.0f;
+}
+// Encodes the RGB channels of a linear color as sRGB, one channel at a time.
+// The alpha channel is passed through unchanged.
+static inline float4 linearToSRGB(float4 color)
+{
+    float red   = linearToSRGB(color.r);
+    float green = linearToSRGB(color.g);
+    float blue  = linearToSRGB(color.b);
+    return float4(red, green, blue, color.a);
+}
 }  // namespace mtl_shader
 }  // namespace rx

--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_sim_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_ios_sim_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_mac_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_debug_mac_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_sim_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_ios_sim_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_mac_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_2_1_mac_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_sim_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_ios_sim_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_mac_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_debug_mac_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_sim_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_ios_sim_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_mac_autogen.inc
+++ b/src/libANGLE/renderer/metal/shaders/compiled/compiled_default_metallib_mac_autogen.inc
--- a/src/libANGLE/renderer/metal/shaders/constants.h
+++ b/src/libANGLE/renderer/metal/shaders/constants.h
@@ -23,6 +23,9 @@ enum
    kTextureTypeCount         = 5,
 };
+// Metal does not allow a constexpr value to be used as an array size, so a macro is used instead.
+#define kGenerateMipThreadGroupSizePerDim 8
 }  // namespace mtl_shader
 }  // namespace rx

--- a/src/libANGLE/renderer/metal/shaders/gen_mipmap.metal
+++ b/src/libANGLE/renderer/metal/shaders/gen_mipmap.metal
+//
+// Copyright 2020 The ANGLE Project. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+//
+#include "common.h"
+using namespace rx::mtl_shader;
+// Element count of each shared array: kGenerateMipThreadGroupSizePerDim cubed.
+#define kThreadGroupXYZ                                                      \
+    (kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim * \
+     kGenerateMipThreadGroupSizePerDim)
+// Index offset used to reach the (z+1) neighbour in the shared arrays.
+#define kThreadGroupXY (kGenerateMipThreadGroupSizePerDim * kGenerateMipThreadGroupSizePerDim)
+// Index offset used to reach the (y+1) neighbour in the shared arrays.
+#define kThreadGroupX kGenerateMipThreadGroupSizePerDim
+// Stores the four channels of `texel` at `index` in the sR/sG/sB/sA arrays.
+// Wrapped in do/while(0) so the macro behaves as a single statement, e.g. under an unbraced if.
+#define TEXEL_STORE(index, texel) \
+    do                            \
+    {                             \
+        sR[index] = (texel).r;    \
+        sG[index] = (texel).g;    \
+        sB[index] = (texel).b;    \
+        sA[index] = (texel).a;    \
+    } while (0)
+// Rebuilds a float4 from the sR/sG/sB/sA arrays at `index`.
+#define TEXEL_LOAD(index) float4(sR[index], sG[index], sB[index], sA[index])
+// Applies sRGBtoLinear to `texel` when options.sRGB is set, otherwise yields `texel` unchanged.
+#define TO_LINEAR(texel) (options.sRGB ? sRGBtoLinear(texel) : (texel))
+// Yields `edgeValue` when `condition` is true, otherwise `targetValue`. Only the selected
+// operand is evaluated. The expansion is fully parenthesized so it can be used safely inside
+// larger expressions.
+#define OUT_OF_BOUND_CHECK(edgeValue, targetValue, condition) \
+    ((condition) ? (edgeValue) : (targetValue))
+// Options read by generate3DMipmaps through its buffer(0) argument.
+// NOTE(review): the layout presumably has to match the structure filled in by the host code
+// -- confirm against the caller.
+struct GenMipParams
+{
+    // Mip level of the source texture that is sampled to produce the first generated level.
+    uint srcLevel;
+    // Number of levels to generate; the kernel returns early after writing 1, 2 or 3 levels.
+    uint numMipLevelsToGen;
+    // When true, texels are passed through linearToSRGB before being averaged for the 2nd and
+    // later levels, and through sRGBtoLinear before those levels are written.
+    bool sRGB;
+};
+// Generates up to four consecutive mip levels of a 3D texture in one dispatch.
+//
+// Each thread samples srcTexture at level options.srcLevel and writes one texel of dstMip1.
+// The following levels are produced by repeatedly averaging 2x2x2 neighbourhoods that are
+// exchanged through threadgroup memory, and are written to dstMip2, dstMip3 and dstMip4.
+// Only the first options.numMipLevelsToGen destinations are written.
+//
+// The neighbour offsets below (1, kThreadGroupX, kThreadGroupXY) treat lIndex as
+// x + y * kThreadGroupX + z * kThreadGroupXY.
+// NOTE(review): this presumably requires a dispatch whose threadgroup spans
+// kGenerateMipThreadGroupSizePerDim threads in each dimension -- confirm against the host code.
+//
+// NOTE(hqle): For numMipLevelsToGen > 1, this function assumes the texture is power of two. If it
+// is not, quality will not be good.
+kernel void generate3DMipmaps(uint lIndex [[thread_index_in_threadgroup]],
+                              ushort3 gIndices [[thread_position_in_grid]],
+                              texture3d<float> srcTexture [[texture(0)]],
+                              texture3d<float, access::write> dstMip1 [[texture(1)]],
+                              texture3d<float, access::write> dstMip2 [[texture(2)]],
+                              texture3d<float, access::write> dstMip3 [[texture(3)]],
+                              texture3d<float, access::write> dstMip4 [[texture(4)]],
+                              constant GenMipParams &options [[buffer(0)]])
+{
+    ushort3 mipSize  = ushort3(dstMip1.get_width(), dstMip1.get_height(), dstMip1.get_depth());
+    // Threads whose grid position falls outside the first destination level do no texel work.
+    bool validThread = gIndices.x < mipSize.x && gIndices.y < mipSize.y && gIndices.z < mipSize.z;
+    constexpr sampler textureSampler(mag_filter::linear, min_filter::linear, mip_filter::linear);
+    // NOTE(hqle): Use simd_group function whenever available. That could avoid barrier use.
+    // Use struct of array style to avoid bank conflict.
+    threadgroup float sR[kThreadGroupXYZ];
+    threadgroup float sG[kThreadGroupXYZ];
+    threadgroup float sB[kThreadGroupXYZ];
+    threadgroup float sA[kThreadGroupXYZ];
+    // ----- First mip level -------
+    // Initialized so that out-of-range threads never carry an indeterminate value.
+    float4 texel1 = float4(0.0);
+    if (validThread)
+    {
+        // Sample at the centre of this thread's destination texel, in normalized coordinates.
+        float3 texCoords = (float3(gIndices) + float3(0.5, 0.5, 0.5)) / float3(mipSize);
+        texel1           = srcTexture.sample(textureSampler, texCoords, level(options.srcLevel));
+        // Write to texture
+        dstMip1.write(texel1, gIndices);
+    }
+    else
+    {
+        // This will invalidate all subsequent checks: every mask test on lIndex below fails.
+        lIndex = 0xffffffff;
+    }
+    if (options.numMipLevelsToGen == 1)
+    {
+        return;
+    }
+    // ---- Second mip level --------
+    // Write to shared memory. Out-of-range threads must not store: their lIndex has been
+    // overwritten with 0xffffffff, which is far outside the shared arrays. They still reach the
+    // barrier below together with the rest of the threadgroup.
+    if (validThread)
+    {
+        if (options.sRGB)
+        {
+            texel1 = linearToSRGB(texel1);
+        }
+        TEXEL_STORE(lIndex, texel1);
+    }
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    // Index must be even
+    if ((lIndex & 0x49) == 0)  // (lIndex & b1001001) == 0
+    {
+        // At the last texel of a dimension there is no +1 neighbour; the edge value is reused.
+        bool3 atEdge = gIndices == (mipSize - ushort3(1));
+        // (x+1, y, z)
+        // If the width of mip is 1, texel2 will equal to texel1:
+        float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 1), atEdge.x);
+        // (x, y+1, z)
+        float4 texel3 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupX), atEdge.y);
+        // (x, y, z+1)
+        float4 texel4 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + kThreadGroupXY), atEdge.z);
+        // (x+1, y+1, z)
+        float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupX + 1)),
+                                           atEdge.x | atEdge.y);
+        // (x+1, y, z+1)
+        float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (kThreadGroupXY + 1)),
+                                           atEdge.x | atEdge.z);
+        // (x, y+1, z+1)
+        float4 texel7 = OUT_OF_BOUND_CHECK(
+            texel3, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX)), atEdge.y | atEdge.z);
+        // (x+1, y+1, z+1)
+        float4 texel8 =
+            OUT_OF_BOUND_CHECK(texel5, TEXEL_LOAD(lIndex + (kThreadGroupXY + kThreadGroupX + 1)),
+                               atEdge.x | atEdge.y | atEdge.z);
+        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
+        dstMip2.write(TO_LINEAR(texel1), gIndices >> 1);
+        // Write to shared memory
+        TEXEL_STORE(lIndex, texel1);
+    }
+    if (options.numMipLevelsToGen == 2)
+    {
+        return;
+    }
+    // ---- 3rd mip level --------
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    // Index must be multiple of 4
+    if ((lIndex & 0xdb) == 0)  // (lIndex & b11011011) == 0
+    {
+        // Size of the level averaged here (the 2nd generated level), clamped to at least 1.
+        mipSize      = max(mipSize >> 1, ushort3(1));
+        bool3 atEdge = (gIndices >> 1) == (mipSize - ushort3(1));
+        // (x+1, y, z)
+        // If the width of mip is 1, texel2 will equal to texel1:
+        float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 2), atEdge.x);
+        // (x, y+1, z)
+        float4 texel3 =
+            OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupX)), atEdge.y);
+        // (x, y, z+1)
+        float4 texel4 =
+            OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY)), atEdge.z);
+        // (x+1, y+1, z)
+        float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupX + 2)),
+                                           atEdge.x | atEdge.y);
+        // (x+1, y, z+1)
+        float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2)),
+                                           atEdge.x | atEdge.z);
+        // (x, y+1, z+1)
+        float4 texel7 = OUT_OF_BOUND_CHECK(
+            texel3, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX)),
+            atEdge.y | atEdge.z);
+        // (x+1, y+1, z+1)
+        float4 texel8 = OUT_OF_BOUND_CHECK(
+            texel5, TEXEL_LOAD(lIndex + (2 * kThreadGroupXY + 2 * kThreadGroupX + 2)),
+            atEdge.x | atEdge.y | atEdge.z);
+        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
+        dstMip3.write(TO_LINEAR(texel1), gIndices >> 2);
+        // Write to shared memory
+        TEXEL_STORE(lIndex, texel1);
+    }
+    if (options.numMipLevelsToGen == 3)
+    {
+        return;
+    }
+    // ---- 4th mip level --------
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+    // Index must be multiple of 8
+    if ((lIndex & 0x1ff) == 0)  // (lIndex & b111111111) == 0
+    {
+        // Size of the level averaged here (the 3rd generated level), clamped to at least 1.
+        mipSize      = max(mipSize >> 1, ushort3(1));
+        bool3 atEdge = (gIndices >> 2) == (mipSize - ushort3(1));
+        // (x+1, y, z)
+        // If the width of mip is 1, texel2 will equal to texel1:
+        float4 texel2 = OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + 4), atEdge.x);
+        // (x, y+1, z)
+        float4 texel3 =
+            OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupX)), atEdge.y);
+        // (x, y, z+1)
+        float4 texel4 =
+            OUT_OF_BOUND_CHECK(texel1, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY)), atEdge.z);
+        // (x+1, y+1, z)
+        float4 texel5 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupX + 4)),
+                                           atEdge.x | atEdge.y);
+        // (x+1, y, z+1)
+        float4 texel6 = OUT_OF_BOUND_CHECK(texel2, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4)),
+                                           atEdge.x | atEdge.z);
+        // (x, y+1, z+1)
+        float4 texel7 = OUT_OF_BOUND_CHECK(
+            texel3, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX)),
+            atEdge.y | atEdge.z);
+        // (x+1, y+1, z+1)
+        float4 texel8 = OUT_OF_BOUND_CHECK(
+            texel5, TEXEL_LOAD(lIndex + (4 * kThreadGroupXY + 4 * kThreadGroupX + 4)),
+            atEdge.x | atEdge.y | atEdge.z);
+        texel1 = (texel1 + texel2 + texel3 + texel4 + texel5 + texel6 + texel7 + texel8) / 8.0;
+        dstMip4.write(TO_LINEAR(texel1), gIndices >> 3);
+    }
+}
--- a/src/libANGLE/renderer/metal/shaders/gen_mtl_internal_shaders.py
+++ b/src/libANGLE/renderer/metal/shaders/gen_mtl_internal_shaders.py
@@ -162,7 +162,7 @@ def gen_precompiled_shaders(mac_version, ios_version, variable_name, additional_
 def main():
-    src_files = ['blit.metal', 'clear.metal', 'gen_indices.metal']
+    src_files = ['blit.metal', 'clear.metal', 'gen_indices.metal', 'gen_mipmap.metal']
    # yapf: disable
    os_specific_autogen_files = [