Commit ffdd61c3 by John Kessenich

Merge branch 'support_latest_sm_60_ops'

Please enter a commit message to explain why this merge is necessary,
parents d487d4d0 ce443b3a
......@@ -1110,6 +1110,42 @@ local_size = (32, 16, 1)
0:52 1 (const int)
0:52 Constant:
0:52 2 (const int)
0:54 move second child to first child ( temp uint)
0:54 direct index ( temp uint)
0:54 u: direct index for structure ( temp 4-component vector of uint)
0:54 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:54 Constant:
0:54 0 (const uint)
0:54 direct index ( temp uint)
0:54 'dti' ( in 3-component vector of uint)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 subgroupBallotInclusiveBitCount ( temp uint)
0:54 subgroupBallot ( temp 4-component vector of uint)
0:54 Compare Equal ( temp bool)
0:54 direct index ( temp uint)
0:54 u: direct index for structure ( temp 4-component vector of uint)
0:54 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:54 Constant:
0:54 0 (const uint)
0:54 direct index ( temp uint)
0:54 'dti' ( in 3-component vector of uint)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const uint)
0:13 Function Definition: CSMain( ( temp void)
0:13 Function Parameters:
0:? Sequence
......@@ -2237,6 +2273,42 @@ local_size = (32, 16, 1)
0:52 1 (const int)
0:52 Constant:
0:52 2 (const int)
0:54 move second child to first child ( temp uint)
0:54 direct index ( temp uint)
0:54 u: direct index for structure ( temp 4-component vector of uint)
0:54 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:54 Constant:
0:54 0 (const uint)
0:54 direct index ( temp uint)
0:54 'dti' ( in 3-component vector of uint)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 subgroupBallotInclusiveBitCount ( temp uint)
0:54 subgroupBallot ( temp 4-component vector of uint)
0:54 Compare Equal ( temp bool)
0:54 direct index ( temp uint)
0:54 u: direct index for structure ( temp 4-component vector of uint)
0:54 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:54 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:54 Constant:
0:54 0 (const uint)
0:54 direct index ( temp uint)
0:54 'dti' ( in 3-component vector of uint)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const int)
0:54 Constant:
0:54 0 (const uint)
0:13 Function Definition: CSMain( ( temp void)
0:13 Function Parameters:
0:? Sequence
......@@ -2251,15 +2323,16 @@ local_size = (32, 16, 1)
// Module Version 10000
// Generated by (magic number): 80003
// Id's are bound by 358
// Id's are bound by 369
Capability Shader
Capability Float64
Capability GroupNonUniform
Capability GroupNonUniformArithmetic
Capability GroupNonUniformBallot
1: ExtInstImport "GLSL.std.450"
MemoryModel Logical GLSL450
EntryPoint GLCompute 4 "CSMain" 353
EntryPoint GLCompute 4 "CSMain" 364
ExecutionMode 4 LocalSize 32 16 1
Source HLSL 500
Name 4 "CSMain"
......@@ -2273,9 +2346,9 @@ local_size = (32, 16, 1)
Name 22 "data"
MemberName 22(data) 0 "@data"
Name 24 "data"
Name 351 "dti"
Name 353 "dti"
Name 355 "param"
Name 362 "dti"
Name 364 "dti"
Name 366 "param"
MemberDecorate 20(Types) 0 Offset 0
MemberDecorate 20(Types) 1 Offset 16
MemberDecorate 20(Types) 2 Offset 32
......@@ -2284,7 +2357,7 @@ local_size = (32, 16, 1)
MemberDecorate 22(data) 0 Offset 0
Decorate 22(data) BufferBlock
Decorate 24(data) DescriptorSet 0
Decorate 353(dti) BuiltIn GlobalInvocationId
Decorate 364(dti) BuiltIn GlobalInvocationId
2: TypeVoid
3: TypeFunction 2
6: TypeInt 32 0
......@@ -2325,17 +2398,18 @@ local_size = (32, 16, 1)
170: TypePointer Uniform 18(float)
179: TypeVector 18(float) 2
191: TypeVector 18(float) 3
352: TypePointer Input 7(ivec3)
353(dti): 352(ptr) Variable Input
357: TypeBool
363: TypePointer Input 7(ivec3)
364(dti): 363(ptr) Variable Input
4(CSMain): 2 Function None 3
5: Label
351(dti): 8(ptr) Variable Function
355(param): 8(ptr) Variable Function
354: 7(ivec3) Load 353(dti)
Store 351(dti) 354
356: 7(ivec3) Load 351(dti)
Store 355(param) 356
357: 2 FunctionCall 11(@CSMain(vu3;) 355(param)
362(dti): 8(ptr) Variable Function
366(param): 8(ptr) Variable Function
365: 7(ivec3) Load 364(dti)
Store 362(dti) 365
367: 7(ivec3) Load 362(dti)
Store 366(param) 367
368: 2 FunctionCall 11(@CSMain(vu3;) 366(param)
Return
FunctionEnd
11(@CSMain(vu3;): 2 Function None 9
......@@ -2677,5 +2751,16 @@ local_size = (32, 16, 1)
349: 19(fvec4) Load 348
350: 19(fvec4) VectorShuffle 349 347 4 5 6 3
Store 348 350
351: 27(ptr) AccessChain 10(dti) 26
352: 6(int) Load 351
353: 27(ptr) AccessChain 10(dti) 26
354: 6(int) Load 353
355: 42(ptr) AccessChain 24(data) 25 354 25 26
356: 6(int) Load 355
358: 357(bool) IEqual 356 26
359: 13(ivec4) GroupNonUniformBallot 35 358
360: 6(int) GroupNonUniformBallotBitCount 35 InclusiveScan 359
361: 42(ptr) AccessChain 24(data) 25 352 25 26
Store 361 360
Return
FunctionEnd
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -7,7 +7,7 @@ gl_FragCoord origin is upper left
0:? Sequence
0:3 Test condition and select ( temp void)
0:3 Condition
0:3 '@gl_HelperInvocation' ( in bool HelperInvocation)
0:3 subgroupElect ( temp bool)
0:3 true case
0:? Sequence
0:5 Branch: Return with expression
......@@ -45,7 +45,7 @@ gl_FragCoord origin is upper left
0:? Sequence
0:3 Test condition and select ( temp void)
0:3 Condition
0:3 '@gl_HelperInvocation' ( in bool HelperInvocation)
0:3 subgroupElect ( temp bool)
0:3 true case
0:? Sequence
0:5 Branch: Return with expression
......@@ -76,16 +76,15 @@ gl_FragCoord origin is upper left
// Id's are bound by 30
Capability Shader
Capability GroupNonUniform
1: ExtInstImport "GLSL.std.450"
MemoryModel Logical GLSL450
EntryPoint Fragment 4 "PixelShaderFunction" 13 28
EntryPoint Fragment 4 "PixelShaderFunction" 28
ExecutionMode 4 OriginUpperLeft
Source HLSL 500
Name 4 "PixelShaderFunction"
Name 9 "@PixelShaderFunction("
Name 13 "@gl_HelperInvocation"
Name 28 "@entryPointOutput"
Decorate 13(@gl_HelperInvocation) BuiltIn HelperInvocation
Decorate 28(@entryPointOutput) Location 0
2: TypeVoid
3: TypeFunction 2
......@@ -93,8 +92,8 @@ gl_FragCoord origin is upper left
7: TypeVector 6(float) 4
8: TypeFunction 7(fvec4)
11: TypeBool
12: TypePointer Input 11(bool)
13(@gl_HelperInvocation): 12(ptr) Variable Input
12: TypeInt 32 0
13: 12(int) Constant 3
17: 6(float) Constant 1065353216
18: 6(float) Constant 1073741824
19: 6(float) Constant 1077936128
......@@ -111,7 +110,7 @@ gl_FragCoord origin is upper left
FunctionEnd
9(@PixelShaderFunction(): 7(fvec4) Function None 8
10: Label
14: 11(bool) Load 13(@gl_HelperInvocation)
14: 11(bool) GroupNonUniformElect 13
SelectionMerge 16 None
BranchConditional 14 15 23
15: Label
......
......@@ -3042,6 +3042,42 @@ local_size = (32, 16, 1)
0:122 1 (const int)
0:122 Constant:
0:122 2 (const int)
0:124 move second child to first child ( temp uint)
0:124 direct index ( temp uint)
0:124 u: direct index for structure ( temp 4-component vector of uint)
0:124 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:124 Constant:
0:124 0 (const uint)
0:124 direct index ( temp uint)
0:124 'dti' ( in 3-component vector of uint)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 subgroupBallotBitCount ( temp uint)
0:124 subgroupBallot ( temp 4-component vector of uint)
0:124 Compare Equal ( temp bool)
0:124 direct index ( temp uint)
0:124 u: direct index for structure ( temp 4-component vector of uint)
0:124 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:124 Constant:
0:124 0 (const uint)
0:124 direct index ( temp uint)
0:124 'dti' ( in 3-component vector of uint)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const uint)
0:13 Function Definition: CSMain( ( temp void)
0:13 Function Parameters:
0:? Sequence
......@@ -6101,6 +6137,42 @@ local_size = (32, 16, 1)
0:122 1 (const int)
0:122 Constant:
0:122 2 (const int)
0:124 move second child to first child ( temp uint)
0:124 direct index ( temp uint)
0:124 u: direct index for structure ( temp 4-component vector of uint)
0:124 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:124 Constant:
0:124 0 (const uint)
0:124 direct index ( temp uint)
0:124 'dti' ( in 3-component vector of uint)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 subgroupBallotBitCount ( temp uint)
0:124 subgroupBallot ( temp 4-component vector of uint)
0:124 Compare Equal ( temp bool)
0:124 direct index ( temp uint)
0:124 u: direct index for structure ( temp 4-component vector of uint)
0:124 indirect index (layout( row_major std430) buffer structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d})
0:124 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of structure{ temp 4-component vector of uint u, temp 4-component vector of int i, temp 4-component vector of float f, temp 4-component vector of double d} @data})
0:124 Constant:
0:124 0 (const uint)
0:124 direct index ( temp uint)
0:124 'dti' ( in 3-component vector of uint)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const int)
0:124 Constant:
0:124 0 (const uint)
0:13 Function Definition: CSMain( ( temp void)
0:13 Function Parameters:
0:? Sequence
......@@ -6115,15 +6187,16 @@ local_size = (32, 16, 1)
// Module Version 10000
// Generated by (magic number): 80003
// Id's are bound by 890
// Id's are bound by 901
Capability Shader
Capability Float64
Capability GroupNonUniform
Capability GroupNonUniformArithmetic
Capability GroupNonUniformBallot
1: ExtInstImport "GLSL.std.450"
MemoryModel Logical GLSL450
EntryPoint GLCompute 4 "CSMain" 885
EntryPoint GLCompute 4 "CSMain" 896
ExecutionMode 4 LocalSize 32 16 1
Source HLSL 500
Name 4 "CSMain"
......@@ -6137,9 +6210,9 @@ local_size = (32, 16, 1)
Name 22 "data"
MemberName 22(data) 0 "@data"
Name 24 "data"
Name 883 "dti"
Name 885 "dti"
Name 887 "param"
Name 894 "dti"
Name 896 "dti"
Name 898 "param"
MemberDecorate 20(Types) 0 Offset 0
MemberDecorate 20(Types) 1 Offset 16
MemberDecorate 20(Types) 2 Offset 32
......@@ -6148,7 +6221,7 @@ local_size = (32, 16, 1)
MemberDecorate 22(data) 0 Offset 0
Decorate 22(data) BufferBlock
Decorate 24(data) DescriptorSet 0
Decorate 885(dti) BuiltIn GlobalInvocationId
Decorate 896(dti) BuiltIn GlobalInvocationId
2: TypeVoid
3: TypeFunction 2
6: TypeInt 32 0
......@@ -6189,17 +6262,18 @@ local_size = (32, 16, 1)
170: TypePointer Uniform 18(float)
179: TypeVector 18(float) 2
191: TypeVector 18(float) 3
884: TypePointer Input 7(ivec3)
885(dti): 884(ptr) Variable Input
889: TypeBool
895: TypePointer Input 7(ivec3)
896(dti): 895(ptr) Variable Input
4(CSMain): 2 Function None 3
5: Label
883(dti): 8(ptr) Variable Function
887(param): 8(ptr) Variable Function
886: 7(ivec3) Load 885(dti)
Store 883(dti) 886
888: 7(ivec3) Load 883(dti)
Store 887(param) 888
889: 2 FunctionCall 11(@CSMain(vu3;) 887(param)
894(dti): 8(ptr) Variable Function
898(param): 8(ptr) Variable Function
897: 7(ivec3) Load 896(dti)
Store 894(dti) 897
899: 7(ivec3) Load 894(dti)
Store 898(param) 899
900: 2 FunctionCall 11(@CSMain(vu3;) 898(param)
Return
FunctionEnd
11(@CSMain(vu3;): 2 Function None 9
......@@ -7129,5 +7203,16 @@ local_size = (32, 16, 1)
881: 15(ivec4) Load 880
882: 15(ivec4) VectorShuffle 881 879 4 5 6 3
Store 880 882
883: 27(ptr) AccessChain 10(dti) 26
884: 6(int) Load 883
885: 27(ptr) AccessChain 10(dti) 26
886: 6(int) Load 885
887: 42(ptr) AccessChain 24(data) 25 886 25 26
888: 6(int) Load 887
890: 889(bool) IEqual 888 26
891: 13(ivec4) GroupNonUniformBallot 35 890
892: 6(int) GroupNonUniformBallotBitCount 35 Reduce 891
893: 42(ptr) AccessChain 24(data) 25 884 25 26
Store 893 892
Return
FunctionEnd
......@@ -16,8 +16,8 @@ local_size = (32, 16, 1)
0:6 'dti' ( in 3-component vector of uint)
0:6 Constant:
0:6 0 (const int)
0:6 packUint2x32 ( temp uint64_t)
0:6 vector swizzle ( temp 2-component vector of uint)
0:6 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:6 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:6 subgroupBallot ( temp 4-component vector of uint)
0:6 subgroupAny ( temp bool)
0:6 Compare Equal ( temp bool)
......@@ -27,11 +27,6 @@ local_size = (32, 16, 1)
0:6 0 (const int)
0:6 Constant:
0:6 0 (const uint)
0:6 Sequence
0:6 Constant:
0:6 0 (const int)
0:6 Constant:
0:6 1 (const int)
0:7 move second child to first child ( temp uint64_t)
0:7 indirect index (layout( row_major std430) buffer uint64_t)
0:7 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
......@@ -42,8 +37,8 @@ local_size = (32, 16, 1)
0:7 'dti' ( in 3-component vector of uint)
0:7 Constant:
0:7 1 (const int)
0:7 packUint2x32 ( temp uint64_t)
0:7 vector swizzle ( temp 2-component vector of uint)
0:7 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:7 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:7 subgroupBallot ( temp 4-component vector of uint)
0:7 subgroupAll ( temp bool)
0:7 Compare Equal ( temp bool)
......@@ -53,11 +48,6 @@ local_size = (32, 16, 1)
0:7 1 (const int)
0:7 Constant:
0:7 0 (const uint)
0:7 Sequence
0:7 Constant:
0:7 0 (const int)
0:7 Constant:
0:7 1 (const int)
0:8 move second child to first child ( temp uint64_t)
0:8 indirect index (layout( row_major std430) buffer uint64_t)
0:8 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
......@@ -68,8 +58,8 @@ local_size = (32, 16, 1)
0:8 'dti' ( in 3-component vector of uint)
0:8 Constant:
0:8 2 (const int)
0:8 packUint2x32 ( temp uint64_t)
0:8 vector swizzle ( temp 2-component vector of uint)
0:8 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:8 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:8 subgroupBallot ( temp 4-component vector of uint)
0:8 subgroupAllEqual ( temp bool)
0:8 Compare Equal ( temp bool)
......@@ -79,11 +69,24 @@ local_size = (32, 16, 1)
0:8 2 (const int)
0:8 Constant:
0:8 0 (const uint)
0:8 Sequence
0:8 Constant:
0:8 0 (const int)
0:8 Constant:
0:8 1 (const int)
0:9 move second child to first child ( temp uint64_t)
0:9 indirect index (layout( row_major std430) buffer uint64_t)
0:9 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
0:9 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of uint64_t @data})
0:9 Constant:
0:9 0 (const uint)
0:9 direct index ( temp uint)
0:9 'dti' ( in 3-component vector of uint)
0:9 Constant:
0:9 2 (const int)
0:9 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:9 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:9 subgroupBallot ( temp 4-component vector of uint)
0:9 subgroupAllEqual ( temp bool)
0:9 direct index ( temp uint)
0:9 'dti' ( in 3-component vector of uint)
0:9 Constant:
0:9 2 (const int)
0:5 Function Definition: CSMain( ( temp void)
0:5 Function Parameters:
0:? Sequence
......@@ -117,8 +120,8 @@ local_size = (32, 16, 1)
0:6 'dti' ( in 3-component vector of uint)
0:6 Constant:
0:6 0 (const int)
0:6 packUint2x32 ( temp uint64_t)
0:6 vector swizzle ( temp 2-component vector of uint)
0:6 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:6 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:6 subgroupBallot ( temp 4-component vector of uint)
0:6 subgroupAny ( temp bool)
0:6 Compare Equal ( temp bool)
......@@ -128,11 +131,6 @@ local_size = (32, 16, 1)
0:6 0 (const int)
0:6 Constant:
0:6 0 (const uint)
0:6 Sequence
0:6 Constant:
0:6 0 (const int)
0:6 Constant:
0:6 1 (const int)
0:7 move second child to first child ( temp uint64_t)
0:7 indirect index (layout( row_major std430) buffer uint64_t)
0:7 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
......@@ -143,8 +141,8 @@ local_size = (32, 16, 1)
0:7 'dti' ( in 3-component vector of uint)
0:7 Constant:
0:7 1 (const int)
0:7 packUint2x32 ( temp uint64_t)
0:7 vector swizzle ( temp 2-component vector of uint)
0:7 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:7 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:7 subgroupBallot ( temp 4-component vector of uint)
0:7 subgroupAll ( temp bool)
0:7 Compare Equal ( temp bool)
......@@ -154,11 +152,6 @@ local_size = (32, 16, 1)
0:7 1 (const int)
0:7 Constant:
0:7 0 (const uint)
0:7 Sequence
0:7 Constant:
0:7 0 (const int)
0:7 Constant:
0:7 1 (const int)
0:8 move second child to first child ( temp uint64_t)
0:8 indirect index (layout( row_major std430) buffer uint64_t)
0:8 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
......@@ -169,8 +162,8 @@ local_size = (32, 16, 1)
0:8 'dti' ( in 3-component vector of uint)
0:8 Constant:
0:8 2 (const int)
0:8 packUint2x32 ( temp uint64_t)
0:8 vector swizzle ( temp 2-component vector of uint)
0:8 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:8 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:8 subgroupBallot ( temp 4-component vector of uint)
0:8 subgroupAllEqual ( temp bool)
0:8 Compare Equal ( temp bool)
......@@ -180,11 +173,24 @@ local_size = (32, 16, 1)
0:8 2 (const int)
0:8 Constant:
0:8 0 (const uint)
0:8 Sequence
0:8 Constant:
0:8 0 (const int)
0:8 Constant:
0:8 1 (const int)
0:9 move second child to first child ( temp uint64_t)
0:9 indirect index (layout( row_major std430) buffer uint64_t)
0:9 @data: direct index for structure (layout( row_major std430) buffer implicitly-sized array of uint64_t)
0:9 'data' (layout( row_major std430) buffer block{layout( row_major std430) buffer implicitly-sized array of uint64_t @data})
0:9 Constant:
0:9 0 (const uint)
0:9 direct index ( temp uint)
0:9 'dti' ( in 3-component vector of uint)
0:9 Constant:
0:9 2 (const int)
0:9 Construct uint64 (layout( row_major std430) buffer uint64_t)
0:9 Convert uint to uint64 ( temp 4-component vector of uint64_t)
0:9 subgroupBallot ( temp 4-component vector of uint)
0:9 subgroupAllEqual ( temp bool)
0:9 direct index ( temp uint)
0:9 'dti' ( in 3-component vector of uint)
0:9 Constant:
0:9 2 (const int)
0:5 Function Definition: CSMain( ( temp void)
0:5 Function Parameters:
0:? Sequence
......@@ -199,7 +205,7 @@ local_size = (32, 16, 1)
// Module Version 10000
// Generated by (magic number): 80003
// Id's are bound by 66
// Id's are bound by 75
Capability Shader
Capability Int64
......@@ -208,7 +214,7 @@ local_size = (32, 16, 1)
Capability GroupNonUniformBallot
1: ExtInstImport "GLSL.std.450"
MemoryModel Logical GLSL450
EntryPoint GLCompute 4 "CSMain" 61
EntryPoint GLCompute 4 "CSMain" 70
ExecutionMode 4 LocalSize 32 16 1
Source HLSL 500
Name 4 "CSMain"
......@@ -217,14 +223,14 @@ local_size = (32, 16, 1)
Name 15 "data"
MemberName 15(data) 0 "@data"
Name 17 "data"
Name 59 "dti"
Name 61 "dti"
Name 63 "param"
Name 68 "dti"
Name 70 "dti"
Name 72 "param"
Decorate 14 ArrayStride 8
MemberDecorate 15(data) 0 Offset 0
Decorate 15(data) BufferBlock
Decorate 17(data) DescriptorSet 0
Decorate 61(dti) BuiltIn GlobalInvocationId
Decorate 70(dti) BuiltIn GlobalInvocationId
2: TypeVoid
3: TypeFunction 2
6: TypeInt 32 0
......@@ -243,21 +249,21 @@ local_size = (32, 16, 1)
26: TypeBool
28: 6(int) Constant 3
30: TypeVector 6(int) 4
32: TypeVector 6(int) 2
32: TypeVector 13(int) 4
35: TypePointer Uniform 13(int)
37: 6(int) Constant 1
48: 6(int) Constant 2
60: TypePointer Input 7(ivec3)
61(dti): 60(ptr) Variable Input
69: TypePointer Input 7(ivec3)
70(dti): 69(ptr) Variable Input
4(CSMain): 2 Function None 3
5: Label
59(dti): 8(ptr) Variable Function
63(param): 8(ptr) Variable Function
62: 7(ivec3) Load 61(dti)
Store 59(dti) 62
64: 7(ivec3) Load 59(dti)
Store 63(param) 64
65: 2 FunctionCall 11(@CSMain(vu3;) 63(param)
68(dti): 8(ptr) Variable Function
72(param): 8(ptr) Variable Function
71: 7(ivec3) Load 70(dti)
Store 68(dti) 71
73: 7(ivec3) Load 68(dti)
Store 72(param) 73
74: 2 FunctionCall 11(@CSMain(vu3;) 72(param)
Return
FunctionEnd
11(@CSMain(vu3;): 2 Function None 9
......@@ -270,8 +276,8 @@ local_size = (32, 16, 1)
27: 26(bool) IEqual 25 20
29: 26(bool) GroupNonUniformAny 28 27
31: 30(ivec4) GroupNonUniformBallot 28 29
33: 32(ivec2) VectorShuffle 31 31 0 1
34: 13(int) Bitcast 33
33: 32(ivec4) UConvert 31
34: 13(int) CompositeExtract 33 0
36: 35(ptr) AccessChain 17(data) 19 23
Store 36 34
38: 21(ptr) AccessChain 10(dti) 37
......@@ -281,8 +287,8 @@ local_size = (32, 16, 1)
42: 26(bool) IEqual 41 20
43: 26(bool) GroupNonUniformAll 28 42
44: 30(ivec4) GroupNonUniformBallot 28 43
45: 32(ivec2) VectorShuffle 44 44 0 1
46: 13(int) Bitcast 45
45: 32(ivec4) UConvert 44
46: 13(int) CompositeExtract 45 0
47: 35(ptr) AccessChain 17(data) 19 39
Store 47 46
49: 21(ptr) AccessChain 10(dti) 48
......@@ -292,9 +298,19 @@ local_size = (32, 16, 1)
53: 26(bool) IEqual 52 20
54: 26(bool) GroupNonUniformAllEqual 28 53
55: 30(ivec4) GroupNonUniformBallot 28 54
56: 32(ivec2) VectorShuffle 55 55 0 1
57: 13(int) Bitcast 56
56: 32(ivec4) UConvert 55
57: 13(int) CompositeExtract 56 0
58: 35(ptr) AccessChain 17(data) 19 50
Store 58 57
59: 21(ptr) AccessChain 10(dti) 48
60: 6(int) Load 59
61: 21(ptr) AccessChain 10(dti) 48
62: 6(int) Load 61
63: 26(bool) GroupNonUniformAllEqual 28 62
64: 30(ivec4) GroupNonUniformBallot 28 63
65: 32(ivec4) UConvert 64
66: 13(int) CompositeExtract 65 0
67: 35(ptr) AccessChain 17(data) 19 60
Store 67 66
Return
FunctionEnd
RWStructuredBuffer<uint> data;
[numthreads(32, 16, 1)] // thread-group dimensions declared for this entry point: 32 x 16 x 1
void CSMain() // takes no parameters and returns nothing; its only effect is the buffer write below
{
data[WaveGetOrderedIndex()] = 1; // writes 1 into `data` at the index returned by WaveGetOrderedIndex(); NOTE(review): presumably exercises compiler handling of this intrinsic - confirm against the expected output
}
float4 PixelShaderFunction() : COLOR0 // returns one of two constant float4 values, bound to the COLOR0 semantic
{
if (0 == WaveGetOrderedIndex()) // branch on whether WaveGetOrderedIndex() returns 0
{
return float4(1, 2, 3, 4); // value returned when the ordered index equals 0
}
else
{
return float4(4, 3, 2, 1); // value returned for any other ordered index
}
}
RWStructuredBuffer<uint> data;
[numthreads(32, 16, 1)] // thread-group dimensions declared for this entry point: 32 x 16 x 1
void CSMain() // takes no parameters and returns nothing; its only effect is the buffer write below
{
uint i = 42; // constant argument passed to GlobalOrderedCountIncrement
data[GlobalOrderedCountIncrement(i)] = 1; // writes 1 into `data` at the index returned by GlobalOrderedCountIncrement(i); NOTE(review): presumably exercises compiler handling of this intrinsic - confirm against the expected output
}
float4 PixelShaderFunction() : COLOR0 // returns one of two constant float4 values, bound to the COLOR0 semantic
{
uint i = 42; // constant argument passed to GlobalOrderedCountIncrement
if (0 == GlobalOrderedCountIncrement(i)) // branch on whether the intrinsic returns 0
{
return float4(1, 2, 3, 4); // value returned when the intrinsic's result equals 0
}
else
{
return float4(4, 3, 2, 1); // value returned for any other result
}
}
......@@ -50,4 +50,6 @@ void CSMain(uint3 dti : SV_DispatchThreadID)
data[dti.x].d.x = WavePrefixProduct(data[dti.x].d.x);
data[dti.x].d.xy = WavePrefixProduct(data[dti.x].d.xy);
data[dti.x].d.xyz = WavePrefixProduct(data[dti.x].d.xyz);
data[dti.x].u.x = WavePrefixCountBits(data[dti.x].u.x == 0);
}
......@@ -91,43 +91,63 @@ void CSMain(uint3 dti : SV_DispatchThreadID)
data[dti.x].d.xy = QuadReadLaneAt(data[dti.x].d.xy, 3);
data[dti.x].d.xyz = QuadReadLaneAt(data[dti.x].d.xyz, 3);
data[dti.x].u = QuadSwapX(data[dti.x].u);
data[dti.x].u.x = QuadSwapX(data[dti.x].u.x);
data[dti.x].u.xy = QuadSwapX(data[dti.x].u.xy);
data[dti.x].u.xyz = QuadSwapX(data[dti.x].u.xyz);
data[dti.x].i = QuadSwapX(data[dti.x].i);
data[dti.x].i.x = QuadSwapX(data[dti.x].i.x);
data[dti.x].i.xy = QuadSwapX(data[dti.x].i.xy);
data[dti.x].i.xyz = QuadSwapX(data[dti.x].i.xyz);
data[dti.x].f = QuadSwapX(data[dti.x].f);
data[dti.x].f.x = QuadSwapX(data[dti.x].f.x);
data[dti.x].f.xy = QuadSwapX(data[dti.x].f.xy);
data[dti.x].f.xyz = QuadSwapX(data[dti.x].f.xyz);
data[dti.x].d = QuadSwapX(data[dti.x].d);
data[dti.x].d.x = QuadSwapX(data[dti.x].d.x);
data[dti.x].d.xy = QuadSwapX(data[dti.x].d.xy);
data[dti.x].d.xyz = QuadSwapX(data[dti.x].d.xyz);
data[dti.x].u = QuadSwapY(data[dti.x].u);
data[dti.x].u.x = QuadSwapY(data[dti.x].u.x);
data[dti.x].u.xy = QuadSwapY(data[dti.x].u.xy);
data[dti.x].u.xyz = QuadSwapY(data[dti.x].u.xyz);
data[dti.x].i = QuadSwapY(data[dti.x].i);
data[dti.x].i.x = QuadSwapY(data[dti.x].i.x);
data[dti.x].i.xy = QuadSwapY(data[dti.x].i.xy);
data[dti.x].i.xyz = QuadSwapY(data[dti.x].i.xyz);
data[dti.x].f = QuadSwapY(data[dti.x].f);
data[dti.x].f.x = QuadSwapY(data[dti.x].f.x);
data[dti.x].f.xy = QuadSwapY(data[dti.x].f.xy);
data[dti.x].f.xyz = QuadSwapY(data[dti.x].f.xyz);
data[dti.x].d = QuadSwapY(data[dti.x].d);
data[dti.x].d.x = QuadSwapY(data[dti.x].d.x);
data[dti.x].d.xy = QuadSwapY(data[dti.x].d.xy);
data[dti.x].d.xyz = QuadSwapY(data[dti.x].d.xyz);
data[dti.x].u = QuadReadAcrossX(data[dti.x].u);
data[dti.x].u.x = QuadReadAcrossX(data[dti.x].u.x);
data[dti.x].u.xy = QuadReadAcrossX(data[dti.x].u.xy);
data[dti.x].u.xyz = QuadReadAcrossX(data[dti.x].u.xyz);
data[dti.x].i = QuadReadAcrossX(data[dti.x].i);
data[dti.x].i.x = QuadReadAcrossX(data[dti.x].i.x);
data[dti.x].i.xy = QuadReadAcrossX(data[dti.x].i.xy);
data[dti.x].i.xyz = QuadReadAcrossX(data[dti.x].i.xyz);
data[dti.x].f = QuadReadAcrossX(data[dti.x].f);
data[dti.x].f.x = QuadReadAcrossX(data[dti.x].f.x);
data[dti.x].f.xy = QuadReadAcrossX(data[dti.x].f.xy);
data[dti.x].f.xyz = QuadReadAcrossX(data[dti.x].f.xyz);
data[dti.x].d = QuadReadAcrossX(data[dti.x].d);
data[dti.x].d.x = QuadReadAcrossX(data[dti.x].d.x);
data[dti.x].d.xy = QuadReadAcrossX(data[dti.x].d.xy);
data[dti.x].d.xyz = QuadReadAcrossX(data[dti.x].d.xyz);
data[dti.x].u = QuadReadAcrossY(data[dti.x].u);
data[dti.x].u.x = QuadReadAcrossY(data[dti.x].u.x);
data[dti.x].u.xy = QuadReadAcrossY(data[dti.x].u.xy);
data[dti.x].u.xyz = QuadReadAcrossY(data[dti.x].u.xyz);
data[dti.x].i = QuadReadAcrossY(data[dti.x].i);
data[dti.x].i.x = QuadReadAcrossY(data[dti.x].i.x);
data[dti.x].i.xy = QuadReadAcrossY(data[dti.x].i.xy);
data[dti.x].i.xyz = QuadReadAcrossY(data[dti.x].i.xyz);
data[dti.x].f = QuadReadAcrossY(data[dti.x].f);
data[dti.x].f.x = QuadReadAcrossY(data[dti.x].f.x);
data[dti.x].f.xy = QuadReadAcrossY(data[dti.x].f.xy);
data[dti.x].f.xyz = QuadReadAcrossY(data[dti.x].f.xyz);
data[dti.x].d = QuadReadAcrossY(data[dti.x].d);
data[dti.x].d.x = QuadReadAcrossY(data[dti.x].d.x);
data[dti.x].d.xy = QuadReadAcrossY(data[dti.x].d.xy);
data[dti.x].d.xyz = QuadReadAcrossY(data[dti.x].d.xyz);
data[dti.x].u = QuadReadAcrossDiagonal(data[dti.x].u);
data[dti.x].u.x = QuadReadAcrossDiagonal(data[dti.x].u.x);
data[dti.x].u.xy = QuadReadAcrossDiagonal(data[dti.x].u.xy);
data[dti.x].u.xyz = QuadReadAcrossDiagonal(data[dti.x].u.xyz);
data[dti.x].i = QuadReadAcrossDiagonal(data[dti.x].i);
data[dti.x].i.x = QuadReadAcrossDiagonal(data[dti.x].i.x);
data[dti.x].i.xy = QuadReadAcrossDiagonal(data[dti.x].i.xy);
data[dti.x].i.xyz = QuadReadAcrossDiagonal(data[dti.x].i.xyz);
data[dti.x].f = QuadReadAcrossDiagonal(data[dti.x].f);
data[dti.x].f.x = QuadReadAcrossDiagonal(data[dti.x].f.x);
data[dti.x].f.xy = QuadReadAcrossDiagonal(data[dti.x].f.xy);
data[dti.x].f.xyz = QuadReadAcrossDiagonal(data[dti.x].f.xyz);
data[dti.x].d = QuadReadAcrossDiagonal(data[dti.x].d);
data[dti.x].d.x = QuadReadAcrossDiagonal(data[dti.x].d.x);
data[dti.x].d.xy = QuadReadAcrossDiagonal(data[dti.x].d.xy);
data[dti.x].d.xyz = QuadReadAcrossDiagonal(data[dti.x].d.xyz);
}
......@@ -3,5 +3,5 @@ RWStructuredBuffer<uint> data;
// Compute entry point: every invocation writes to `data` at its own
// WaveGetLaneIndex(); the value stored is WaveGetLaneCount() when the
// election intrinsic returns true and 0 otherwise.
[numthreads(32, 16, 1)]
void CSMain()
{
// NOTE(review): the two statements below differ only in the intrinsic name
// (WaveOnce vs. WaveIsFirstLane). This looks like the removed and the added
// side of a single diff line rendered together - confirm against the real
// shader file before treating both as live code.
data[WaveGetLaneIndex()] = (WaveOnce()) ? WaveGetLaneCount() : 0;
data[WaveGetLaneIndex()] = (WaveIsFirstLane()) ? WaveGetLaneCount() : 0;
}
float4 PixelShaderFunction() : COLOR0
{
if (WaveIsHelperLane())
if (WaveIsFirstLane())
{
return float4(1, 2, 3, 4);
}
......
......@@ -11,113 +11,115 @@ RWStructuredBuffer<Types> data;
// Compute entry point: applies each wave reduction intrinsic in place to
// element dti.x of `data`. Every intrinsic is exercised once per member and
// per swizzle width (whole member, .x, .xy, .xyz).
//
// NOTE(review): presumably u / i / f / d are the uint / int / float / double
// vector members of `Types`; the struct declaration is outside this hunk -
// confirm.
// NOTE(review): the WaveAll* statements and the WaveActive* statements look
// like the removed and the added side of the diff rendered together (the
// built-in table relates both name families to the same EOpSubgroup*
// operators) - confirm against the real shader file.
[numthreads(32, 16, 1)]
void CSMain(uint3 dti : SV_DispatchThreadID)
{
// WaveAllSum: u, i, f, d members.
data[dti.x].u = WaveAllSum(data[dti.x].u);
data[dti.x].u.x = WaveAllSum(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllSum(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllSum(data[dti.x].u.xyz);
data[dti.x].i = WaveAllSum(data[dti.x].i);
data[dti.x].i.x = WaveAllSum(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllSum(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllSum(data[dti.x].i.xyz);
data[dti.x].f = WaveAllSum(data[dti.x].f);
data[dti.x].f.x = WaveAllSum(data[dti.x].f.x);
data[dti.x].f.xy = WaveAllSum(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveAllSum(data[dti.x].f.xyz);
data[dti.x].d = WaveAllSum(data[dti.x].d);
data[dti.x].d.x = WaveAllSum(data[dti.x].d.x);
data[dti.x].d.xy = WaveAllSum(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveAllSum(data[dti.x].d.xyz);
// WaveAllProduct: u, i, f, d members.
data[dti.x].u = WaveAllProduct(data[dti.x].u);
data[dti.x].u.x = WaveAllProduct(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllProduct(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllProduct(data[dti.x].u.xyz);
data[dti.x].i = WaveAllProduct(data[dti.x].i);
data[dti.x].i.x = WaveAllProduct(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllProduct(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllProduct(data[dti.x].i.xyz);
data[dti.x].f = WaveAllProduct(data[dti.x].f);
data[dti.x].f.x = WaveAllProduct(data[dti.x].f.x);
data[dti.x].f.xy = WaveAllProduct(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveAllProduct(data[dti.x].f.xyz);
data[dti.x].d = WaveAllProduct(data[dti.x].d);
data[dti.x].d.x = WaveAllProduct(data[dti.x].d.x);
data[dti.x].d.xy = WaveAllProduct(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveAllProduct(data[dti.x].d.xyz);
// WaveAllMin: u, i, f, d members.
data[dti.x].u = WaveAllMin(data[dti.x].u);
data[dti.x].u.x = WaveAllMin(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllMin(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllMin(data[dti.x].u.xyz);
data[dti.x].i = WaveAllMin(data[dti.x].i);
data[dti.x].i.x = WaveAllMin(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllMin(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllMin(data[dti.x].i.xyz);
data[dti.x].f = WaveAllMin(data[dti.x].f);
data[dti.x].f.x = WaveAllMin(data[dti.x].f.x);
data[dti.x].f.xy = WaveAllMin(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveAllMin(data[dti.x].f.xyz);
data[dti.x].d = WaveAllMin(data[dti.x].d);
data[dti.x].d.x = WaveAllMin(data[dti.x].d.x);
data[dti.x].d.xy = WaveAllMin(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveAllMin(data[dti.x].d.xyz);
// WaveAllMax: u, i, f, d members.
data[dti.x].u = WaveAllMax(data[dti.x].u);
data[dti.x].u.x = WaveAllMax(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllMax(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllMax(data[dti.x].u.xyz);
data[dti.x].i = WaveAllMax(data[dti.x].i);
data[dti.x].i.x = WaveAllMax(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllMax(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllMax(data[dti.x].i.xyz);
data[dti.x].f = WaveAllMax(data[dti.x].f);
data[dti.x].f.x = WaveAllMax(data[dti.x].f.x);
data[dti.x].f.xy = WaveAllMax(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveAllMax(data[dti.x].f.xyz);
data[dti.x].d = WaveAllMax(data[dti.x].d);
data[dti.x].d.x = WaveAllMax(data[dti.x].d.x);
data[dti.x].d.xy = WaveAllMax(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveAllMax(data[dti.x].d.xyz);
// WaveAllBitAnd: u and i members only.
data[dti.x].u = WaveAllBitAnd(data[dti.x].u);
data[dti.x].u.x = WaveAllBitAnd(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllBitAnd(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllBitAnd(data[dti.x].u.xyz);
data[dti.x].i = WaveAllBitAnd(data[dti.x].i);
data[dti.x].i.x = WaveAllBitAnd(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllBitAnd(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllBitAnd(data[dti.x].i.xyz);
// WaveAllBitOr: u and i members only.
data[dti.x].u = WaveAllBitOr(data[dti.x].u);
data[dti.x].u.x = WaveAllBitOr(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllBitOr(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllBitOr(data[dti.x].u.xyz);
data[dti.x].i = WaveAllBitOr(data[dti.x].i);
data[dti.x].i.x = WaveAllBitOr(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllBitOr(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllBitOr(data[dti.x].i.xyz);
// WaveAllBitXor: u and i members only.
data[dti.x].u = WaveAllBitXor(data[dti.x].u);
data[dti.x].u.x = WaveAllBitXor(data[dti.x].u.x);
data[dti.x].u.xy = WaveAllBitXor(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveAllBitXor(data[dti.x].u.xyz);
data[dti.x].i = WaveAllBitXor(data[dti.x].i);
data[dti.x].i.x = WaveAllBitXor(data[dti.x].i.x);
data[dti.x].i.xy = WaveAllBitXor(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveAllBitXor(data[dti.x].i.xyz);
// WaveActiveSum: u, i, f, d members.
data[dti.x].u = WaveActiveSum(data[dti.x].u);
data[dti.x].u.x = WaveActiveSum(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveSum(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveSum(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveSum(data[dti.x].i);
data[dti.x].i.x = WaveActiveSum(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveSum(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveSum(data[dti.x].i.xyz);
data[dti.x].f = WaveActiveSum(data[dti.x].f);
data[dti.x].f.x = WaveActiveSum(data[dti.x].f.x);
data[dti.x].f.xy = WaveActiveSum(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveActiveSum(data[dti.x].f.xyz);
data[dti.x].d = WaveActiveSum(data[dti.x].d);
data[dti.x].d.x = WaveActiveSum(data[dti.x].d.x);
data[dti.x].d.xy = WaveActiveSum(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveActiveSum(data[dti.x].d.xyz);
// WaveActiveProduct: u, i, f, d members.
data[dti.x].u = WaveActiveProduct(data[dti.x].u);
data[dti.x].u.x = WaveActiveProduct(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveProduct(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveProduct(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveProduct(data[dti.x].i);
data[dti.x].i.x = WaveActiveProduct(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveProduct(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveProduct(data[dti.x].i.xyz);
data[dti.x].f = WaveActiveProduct(data[dti.x].f);
data[dti.x].f.x = WaveActiveProduct(data[dti.x].f.x);
data[dti.x].f.xy = WaveActiveProduct(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveActiveProduct(data[dti.x].f.xyz);
data[dti.x].d = WaveActiveProduct(data[dti.x].d);
data[dti.x].d.x = WaveActiveProduct(data[dti.x].d.x);
data[dti.x].d.xy = WaveActiveProduct(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveActiveProduct(data[dti.x].d.xyz);
// WaveActiveMin: u, i, f, d members.
data[dti.x].u = WaveActiveMin(data[dti.x].u);
data[dti.x].u.x = WaveActiveMin(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveMin(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveMin(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveMin(data[dti.x].i);
data[dti.x].i.x = WaveActiveMin(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveMin(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveMin(data[dti.x].i.xyz);
data[dti.x].f = WaveActiveMin(data[dti.x].f);
data[dti.x].f.x = WaveActiveMin(data[dti.x].f.x);
data[dti.x].f.xy = WaveActiveMin(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveActiveMin(data[dti.x].f.xyz);
data[dti.x].d = WaveActiveMin(data[dti.x].d);
data[dti.x].d.x = WaveActiveMin(data[dti.x].d.x);
data[dti.x].d.xy = WaveActiveMin(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveActiveMin(data[dti.x].d.xyz);
// WaveActiveMax: u, i, f, d members.
data[dti.x].u = WaveActiveMax(data[dti.x].u);
data[dti.x].u.x = WaveActiveMax(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveMax(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveMax(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveMax(data[dti.x].i);
data[dti.x].i.x = WaveActiveMax(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveMax(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveMax(data[dti.x].i.xyz);
data[dti.x].f = WaveActiveMax(data[dti.x].f);
data[dti.x].f.x = WaveActiveMax(data[dti.x].f.x);
data[dti.x].f.xy = WaveActiveMax(data[dti.x].f.xy);
data[dti.x].f.xyz = WaveActiveMax(data[dti.x].f.xyz);
data[dti.x].d = WaveActiveMax(data[dti.x].d);
data[dti.x].d.x = WaveActiveMax(data[dti.x].d.x);
data[dti.x].d.xy = WaveActiveMax(data[dti.x].d.xy);
data[dti.x].d.xyz = WaveActiveMax(data[dti.x].d.xyz);
// WaveActiveBitAnd: u and i members only.
data[dti.x].u = WaveActiveBitAnd(data[dti.x].u);
data[dti.x].u.x = WaveActiveBitAnd(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveBitAnd(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveBitAnd(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveBitAnd(data[dti.x].i);
data[dti.x].i.x = WaveActiveBitAnd(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveBitAnd(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveBitAnd(data[dti.x].i.xyz);
// WaveActiveBitOr: u and i members only.
data[dti.x].u = WaveActiveBitOr(data[dti.x].u);
data[dti.x].u.x = WaveActiveBitOr(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveBitOr(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveBitOr(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveBitOr(data[dti.x].i);
data[dti.x].i.x = WaveActiveBitOr(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveBitOr(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveBitOr(data[dti.x].i.xyz);
// WaveActiveBitXor: u and i members only.
data[dti.x].u = WaveActiveBitXor(data[dti.x].u);
data[dti.x].u.x = WaveActiveBitXor(data[dti.x].u.x);
data[dti.x].u.xy = WaveActiveBitXor(data[dti.x].u.xy);
data[dti.x].u.xyz = WaveActiveBitXor(data[dti.x].u.xyz);
data[dti.x].i = WaveActiveBitXor(data[dti.x].i);
data[dti.x].i.x = WaveActiveBitXor(data[dti.x].i.x);
data[dti.x].i.xy = WaveActiveBitXor(data[dti.x].i.xy);
data[dti.x].i.xyz = WaveActiveBitXor(data[dti.x].i.xyz);
// WaveActiveCountBits takes a boolean expression; the result is stored in u.x.
data[dti.x].u.x = WaveActiveCountBits(data[dti.x].u.x == 0);
}
......@@ -3,7 +3,8 @@ RWStructuredBuffer<uint64_t> data;
// Compute entry point: feeds the result of each wave vote intrinsic into a
// ballot intrinsic and stores the ballot in `data`, indexed by one component
// of the dispatch thread ID.
//
// NOTE(review): the three WaveBallot(...) statements and the four
// WaveActiveBallot(...) statements look like the removed and the added side
// of the diff rendered together - confirm against the real shader file.
[numthreads(32, 16, 1)]
void CSMain(uint3 dti : SV_DispatchThreadID)
{
data[dti.x] = WaveBallot(WaveAnyTrue(dti.x == 0));
data[dti.y] = WaveBallot(WaveAllTrue(dti.y == 0));
data[dti.z] = WaveBallot(WaveAllEqual(dti.z == 0));
data[dti.x] = WaveActiveBallot(WaveActiveAnyTrue(dti.x == 0));
data[dti.y] = WaveActiveBallot(WaveActiveAllTrue(dti.y == 0));
// The ...Bool variant is given a comparison result, while WaveActiveAllEqual
// is given the raw dti.z value.
data[dti.z] = WaveActiveBallot(WaveActiveAllEqualBool(dti.z == 0));
data[dti.z] = WaveActiveBallot(WaveActiveAllEqual(dti.z));
}
......@@ -927,10 +927,8 @@ enum TOperator {
// SM6 wave ops
EOpWaveGetLaneCount, // Will decompose to gl_SubgroupSize.
EOpWaveGetLaneIndex, // Will decompose to gl_SubgroupInvocationID.
EOpWaveIsHelperLane, // Will decompose to gl_HelperInvocation.
EOpWaveBallot, // Will decompose to subgroupBallot.
EOpWaveGetOrderedIndex, // Will decompose to an equation containing gl_SubgroupID.
EOpGlobalOrderedCountIncrement, // Will produce a nice error.
EOpWaveActiveCountBits, // Will decompose to subgroupBallotBitCount(subgroupBallot()).
EOpWavePrefixCountBits, // Will decompose to subgroupBallotInclusiveBitCount(subgroupBallot()).
};
class TIntermTraverser;
......
......@@ -367,17 +367,13 @@ INSTANTIATE_TEST_CASE_P(
{"hlsl.type.identifier.frag", "main"},
{"hlsl.typeGraphCopy.vert", "main"},
{"hlsl.typedef.frag", "PixelShaderFunction"},
{"hlsl.wavequery.comp", "CSMain"},
{"hlsl.wavequery.frag", "PixelShaderFunction"},
{"hlsl.wavevote.comp", "CSMain"},
{"hlsl.wavebroadcast.comp", "CSMain"},
{"hlsl.wavereduction.comp", "CSMain"},
{"hlsl.waveprefix.comp", "CSMain"},
{"hlsl.wavequad.comp", "CSMain"},
{"hlsl.waveordered.comp", "CSMain"},
{"hlsl.waveordered2.comp", "CSMain"},
{"hlsl.waveordered.frag", "PixelShaderFunction"},
{"hlsl.waveordered2.frag", "PixelShaderFunction"},
{"hlsl.wavequery.comp", "CSMain"},
{"hlsl.wavequery.frag", "PixelShaderFunction"},
{"hlsl.wavereduction.comp", "CSMain"},
{"hlsl.wavevote.comp", "CSMain"},
{"hlsl.whileLoop.frag", "PixelShaderFunction"},
{"hlsl.void.frag", "PixelShaderFunction"}
}),
......
......@@ -5090,19 +5090,9 @@ void HlslParseContext::decomposeIntrinsic(const TSourceLoc& loc, TIntermTyped*&
node = lookupBuiltinVariable("@gl_SubgroupInvocationID", EbvSubgroupInvocation2, type);
break;
}
case EOpWaveIsHelperLane:
case EOpWaveActiveCountBits:
{
// Mapped to gl_HelperInvocation builtin (We prepend @ to the symbol
// so that it inhabits the symbol table, but has a user-invalid name
// in case some source HLSL defined the symbol also).
TType type(EbtBool, EvqVaryingIn);
node = lookupBuiltinVariable("@gl_HelperInvocation", EbvHelperInvocation, type);
break;
}
case EOpWaveBallot:
{
// Mapped to subgroupBallot() builtin (NOTE: if an IHV has
// a subgroup size > 64 these wave ops will not work for them!)
// Mapped to subgroupBallotBitCount(subgroupBallot()) builtin
// uvec4 type.
TType uvec4Type(EbtUint, EvqTemporary, 4);
......@@ -5111,63 +5101,34 @@ void HlslParseContext::decomposeIntrinsic(const TSourceLoc& loc, TIntermTyped*&
TIntermTyped* res = intermediate.addBuiltInFunctionCall(loc,
EOpSubgroupBallot, true, arguments, uvec4Type);
// And extract a uvec2 for the two highest components.
TIntermTyped* xy = handleDotDereference(loc, res, "xy");
// uint type.
TType uintType(EbtUint, EvqTemporary);
// uint64_t type.
TType uint64Type(EbtUint64, EvqTemporary);
// And bitcast the result for a uint64_t
node = intermediate.addBuiltInFunctionCall(loc,
EOpPackUint2x32, true, xy, uint64Type);
EOpSubgroupBallotBitCount, true, res, uintType);
break;
}
case EOpWaveGetOrderedIndex:
case EOpWavePrefixCountBits:
{
if (language == EShLangFragment) {
// NOTE: For HLSL SM6.0 this should work for PS too, but the current GLSL extensions don't allow this.
error(loc, "WaveGetOrderedIndex() unsupported in a pixel/fragment shader", "WaveGetOrderedIndex", "");
break;
}
TType uintType(EbtUint, EvqVaryingIn);
TIntermTyped* subgroupID = lookupBuiltinVariable("@gl_SubgroupID", EbvSubgroupID, uintType);
TIntermTyped* numSubgroups = lookupBuiltinVariable("@gl_NumSubgroups", EbvNumSubgroups, uintType);
TType uvec3Type(EbtUint, EvqVaryingIn, 3);
TIntermTyped* numWorkGroups = lookupBuiltinVariable("@gl_NumWorkGroups", EbvNumWorkGroups, uvec3Type);
TIntermTyped* workGroupID = lookupBuiltinVariable("@gl_WorkGroupID", EbvWorkGroupId, uvec3Type);
// Mapped to subgroupBallotInclusiveBitCount(subgroupBallot())
// builtin
//x & y components of gl_NumWorkGroups
TIntermTyped* numWorkGroupsX = handleDotDereference(loc, numWorkGroups, "x");
TIntermTyped* numWorkGroupsY = handleDotDereference(loc, numWorkGroups, "y");
// uvec4 type.
TType uvec4Type(EbtUint, EvqTemporary, 4);
// x & y components of globalSize
TIntermTyped* globalSizeX = handleBinaryMath(loc, "mul", EOpMul, numSubgroups, numWorkGroupsX);
TIntermTyped* globalSizeY = numWorkGroupsY;
// Get the uvec4 return from subgroupBallot().
TIntermTyped* res = intermediate.addBuiltInFunctionCall(loc,
EOpSubgroupBallot, true, arguments, uvec4Type);
// x, y & z components of gl_WorkGroupID
TIntermTyped* workGroupX = handleDotDereference(loc, workGroupID, "x");
TIntermTyped* workGroupY = handleDotDereference(loc, workGroupID, "y");
TIntermTyped* workGroupZ = handleDotDereference(loc, workGroupID, "z");
// uint type.
TType uintType(EbtUint, EvqTemporary);
// We're going to build up the following variables to get a uniquely ordered ID:
// (globalSize.y * gl_WorkGroupID.z + gl_WorkGroupID.y) * globalSize.x + gl_WorkGroupID.x + gl_SubgroupID
node = handleBinaryMath(loc, "mul", EOpMul, globalSizeY, workGroupZ);
node = handleBinaryMath(loc, "add", EOpAdd, node, workGroupY);
node = handleBinaryMath(loc, "mul", EOpMul, node, globalSizeX);
node = handleBinaryMath(loc, "add", EOpAdd, node, workGroupX);
node = handleBinaryMath(loc, "add", EOpAdd, node, subgroupID);
node = intermediate.addBuiltInFunctionCall(loc,
EOpSubgroupBallotInclusiveBitCount, true, res, uintType);
break;
}
case EOpGlobalOrderedCountIncrement:
{
// NOTE: For HLSL SM6.0 this should work, but the current GLSL extensions don't allow this.
error(loc, "GlobalOrderedCountIncrement() unsupported", "GlobalOrderedCountIncrement", "");
break;
}
default:
break; // most pass through unchanged
......
......@@ -905,30 +905,33 @@ void TBuiltInParseablesHlsl::initialize(int /*version*/, EProfile /*profile*/, c
{ "Consume", nullptr, nullptr, "-", "-", EShLangAll, true },
// SM 6.0
{ "WaveOnce", "S", "B", "-", "-", EShLangPSCS, false},
{ "WaveIsFirstLane", "S", "B", "-", "-", EShLangPSCS, false},
{ "WaveGetLaneCount", "S", "U", "-", "-", EShLangPSCS, false},
{ "WaveGetLaneIndex", "S", "U", "-", "-", EShLangPSCS, false},
{ "WaveIsHelperLane", "S", "B", "-", "-", EShLangPS, false},
{ "WaveAnyTrue", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveAllTrue", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveAllEqual", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveBallot", "S", "M", "S", "B", EShLangPSCS, false},
{ "WaveActiveAnyTrue", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveActiveAllTrue", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveActiveBallot", "V4", "U", "S", "B", EShLangPSCS, false},
{ "WaveReadLaneAt", nullptr, nullptr, "SV,S", "DFUI,U", EShLangPSCS, false},
{ "WaveReadFirstLane", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllSum", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllProduct", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllMin", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllMax", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllBitAnd", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllBitOr", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveAllBitXor", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveAllEqual", "S", "B", "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveAllEqualBool", "S", "B", "S", "B", EShLangPSCS, false},
{ "WaveActiveCountBits", "S", "U", "S", "B", EShLangPSCS, false},
{ "WaveActiveSum", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveProduct", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveBitAnd", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveBitOr", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveBitXor", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveMin", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveActiveMax", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WavePrefixSum", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WavePrefixProduct", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WavePrefixCountBits", "S", "U", "S", "B", EShLangPSCS, false},
{ "QuadReadAcrossX", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "QuadReadAcrossY", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "QuadReadAcrossDiagonal", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "QuadReadLaneAt", nullptr, nullptr, "SV,S", "DFUI,U", EShLangPSCS, false},
{ "QuadSwapX", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "QuadSwapY", nullptr, nullptr, "SV", "DFUI", EShLangPSCS, false},
{ "WaveGetOrderedIndex", "S", "U", "-", "-", EShLangPSCS, false},
{ "GlobalOrderedCountIncrement", "S", "U", "S", "U", EShLangPSCS, false},
// Methods for subpass input objects
{ "SubpassLoad", "V4", nullptr, "[", "FIU", EShLangPS, true },
......@@ -1273,30 +1276,31 @@ void TBuiltInParseablesHlsl::identifyBuiltIns(int /*version*/, EProfile /*profil
symbolTable.relateToOperator(BUILTIN_PREFIX "RestartStrip", EOpMethodRestartStrip);
// Wave ops
symbolTable.relateToOperator("WaveOnce", EOpSubgroupElect);
symbolTable.relateToOperator("WaveIsFirstLane", EOpSubgroupElect);
symbolTable.relateToOperator("WaveGetLaneCount", EOpWaveGetLaneCount);
symbolTable.relateToOperator("WaveGetLaneIndex", EOpWaveGetLaneIndex);
symbolTable.relateToOperator("WaveIsHelperLane", EOpWaveIsHelperLane);
symbolTable.relateToOperator("WaveAnyTrue", EOpSubgroupAny);
symbolTable.relateToOperator("WaveAllTrue", EOpSubgroupAll);
symbolTable.relateToOperator("WaveAllEqual", EOpSubgroupAllEqual);
symbolTable.relateToOperator("WaveBallot", EOpWaveBallot);
symbolTable.relateToOperator("WaveReadLaneAt", EOpSubgroupShuffle);
symbolTable.relateToOperator("WaveActiveAnyTrue", EOpSubgroupAny);
symbolTable.relateToOperator("WaveActiveAllTrue", EOpSubgroupAll);
symbolTable.relateToOperator("WaveActiveBallot", EOpSubgroupBallot);
symbolTable.relateToOperator("WaveReadFirstLane", EOpSubgroupBroadcastFirst);
symbolTable.relateToOperator("WaveAllSum", EOpSubgroupAdd);
symbolTable.relateToOperator("WaveAllProduct", EOpSubgroupMul);
symbolTable.relateToOperator("WaveAllMin", EOpSubgroupMin);
symbolTable.relateToOperator("WaveAllMax", EOpSubgroupMax);
symbolTable.relateToOperator("WaveAllBitAnd", EOpSubgroupAnd);
symbolTable.relateToOperator("WaveAllBitOr", EOpSubgroupOr);
symbolTable.relateToOperator("WaveAllBitXor", EOpSubgroupXor);
symbolTable.relateToOperator("WaveReadLaneAt", EOpSubgroupShuffle);
symbolTable.relateToOperator("WaveActiveAllEqual", EOpSubgroupAllEqual);
symbolTable.relateToOperator("WaveActiveAllEqualBool", EOpSubgroupAllEqual);
symbolTable.relateToOperator("WaveActiveCountBits", EOpWaveActiveCountBits);
symbolTable.relateToOperator("WaveActiveSum", EOpSubgroupAdd);
symbolTable.relateToOperator("WaveActiveProduct", EOpSubgroupMul);
symbolTable.relateToOperator("WaveActiveBitAnd", EOpSubgroupAnd);
symbolTable.relateToOperator("WaveActiveBitOr", EOpSubgroupOr);
symbolTable.relateToOperator("WaveActiveBitXor", EOpSubgroupXor);
symbolTable.relateToOperator("WaveActiveMin", EOpSubgroupMin);
symbolTable.relateToOperator("WaveActiveMax", EOpSubgroupMax);
symbolTable.relateToOperator("WavePrefixSum", EOpSubgroupInclusiveAdd);
symbolTable.relateToOperator("WavePrefixProduct", EOpSubgroupInclusiveMul);
symbolTable.relateToOperator("WavePrefixCountBits", EOpWavePrefixCountBits);
symbolTable.relateToOperator("QuadReadAcrossX", EOpSubgroupQuadSwapHorizontal);
symbolTable.relateToOperator("QuadReadAcrossY", EOpSubgroupQuadSwapVertical);
symbolTable.relateToOperator("QuadReadAcrossDiagonal", EOpSubgroupQuadSwapDiagonal);
symbolTable.relateToOperator("QuadReadLaneAt", EOpSubgroupQuadBroadcast);
symbolTable.relateToOperator("QuadSwapX", EOpSubgroupQuadSwapHorizontal);
symbolTable.relateToOperator("QuadSwapY", EOpSubgroupQuadSwapVertical);
symbolTable.relateToOperator("WaveGetOrderedIndex", EOpWaveGetOrderedIndex);
symbolTable.relateToOperator("GlobalOrderedCountIncrement", EOpGlobalOrderedCountIncrement);
// Subpass input methods
symbolTable.relateToOperator(BUILTIN_PREFIX "SubpassLoad", EOpSubpassLoad);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment