From: Nicolai Hähnle <nicolai.haeh...@amd.com> The existing coherency test isn't a good match for the AMD GCN execution model. --- .../execution/coherency-extra.shader_test | 90 ++++++++++++++++++++++ 1 file changed, 90 insertions(+) create mode 100644 tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test
diff --git a/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test new file mode 100644 index 0000000..f718cd2 --- /dev/null +++ b/tests/spec/arb_shader_image_load_store/execution/coherency-extra.shader_test @@ -0,0 +1,90 @@ +# Additional coherency test that can demonstrate failures in an incorrect +# coherency implementation for AMD GCN, unlike arb_shader_image_load_store-coherency. +# +# The real problem with coherency in AMD GCN is separate, non-coherent L1 +# caches, i.e. when a shader execution writes to an image in a CU that uses +# one L1 cache, and a different shader execution reads from the image +# in a CU with a different L1 cache. +# +# This test uses atomic accesses to a control texture to select the very first +# fragment shader thread as a writer thread which keeps changing a data +# texture in a tight loop. All other threads become reader threads which +# report success if they see two different values of the same texture. +# +# This test can produce a false negative (false failure) in two cases: +# 1) The timeout value ITERS is too low, +# 2) There is no (or insufficient) parallelism in the implementation, and +# therefore the writer thread must finish before most of the reader threads +# get a chance to run. +# + +[require] +GL >= 3.3 +GLSL >= 3.30 +GL_ARB_shader_image_load_store +SIZE 256 256 + +[vertex shader passthrough] + +[fragment shader] +#version 330 +#extension GL_ARB_shader_image_load_store: enable + +// Change this to 0 to get a control test that should fail on hardware +// without coherent L1 caches. +// +// Need volatile instead of just coherent to prevent overly smart compilers +// from moving the imageLoad/imageStore out of the loop. 
+#if 1 +volatile +#endif +layout(r32i) uniform iimage2D tex; +volatile layout(r32i) uniform iimage2D ctrl; +out vec4 outcolor; + +// Add a timeout so that an incorrect coherency implementation doesn't hang +// the GPU. If this timeout is too low, you can get false negative results +// because the writer thread quits before all reader threads have +// executed. +#define ITERS 100000 + +void main() // The invocation that draws id 0 becomes the writer of tex; every other invocation polls tex as a reader. +{ + int id = imageAtomicAdd(ctrl, ivec2(0, 0), 1); // Returns the counter value from before the add, so ids are unique (assumes ctrl starts at 0 -- confirm). + int orig = imageLoad(tex, ivec2(0, 0)).x; // Baseline value that a reader compares its later loads against. + bool done = false; + + outcolor = vec4(0.0, 0.0, 0.0, 1.0); + + for (int iter = 0; iter < ITERS && !done; ++iter) { + if (id == 0) { // Writer: keep changing tex until enough check-ins have been counted. + imageStore(tex, ivec2(0, 0), ivec4(iter)); + if (imageLoad(ctrl, ivec2(0, 1)).x >= 256 * 256) // ctrl(0, 1) is the check-in counter; 256 * 256 presumably equals one invocation per pixel of the SIZE 256 256 window. + done = true; + } else { // Reader: finished as soon as tex differs from the baseline. + int current = imageLoad(tex, ivec2(0, 0)).x; + if (current != orig) + done = true; + } + + if (done || (id == 0 && iter == 0)) // Check in: the writer on its first iteration, a reader once it has seen tex change. + imageAtomicAdd(ctrl, ivec2(0, 1), 1); + } + + if (done) // Green if the loop ended through done, red if ITERS ran out first; the probe below expects green. + outcolor.y = 1.0; + else + outcolor.x = 1.0; +} + +[test] +texture integer 0 (1, 2) (0, 0) GL_R32I +image texture 0 GL_R32I +texture integer 1 (1, 1) (0, 0) GL_R32I +image texture 1 GL_R32I + +uniform int ctrl 0 +uniform int tex 1 +draw rect -1 -1 2 2 + +probe all rgba 0.0 1.0 0.0 1.0 -- 2.5.0 _______________________________________________ Piglit mailing list Piglit@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/piglit