Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On 02/22/2016 02:10 PM, Matt Turner wrote: > On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanickwrote: >> On 02/19/2016 02:57 AM, Iago Toral wrote: >>> I don't know much about this, but using shader_samples_identical should >>> only give a benefit if we actually get identical samples, otherwise it >>> means more work, right? >> >> It is possible to do more work. Assuming the backend compiler does >> everything right, that amount of extra work should be trivial. The >> initial texelFetch will get the MCS data and the sample zero data. The >> textureSamplesIdenticalEXT also wants the MCS data. The backend CSE >> pass should eliminate the second fetch of MCS data. Then the added >> work is just comparing the MCS data with 0 and the flow control. >> >> Below is the BDW SIMD8 code for SAMPLES=4. I see at least 8 >> instructions that could be removed (all the moves to g124-g127). >> Without GL_EXT_shader_samples_identical, none of those extra >> instructions are there. I'll play around with the GLSL to see if I can >> get the backend to generate better code. Otherwise, we'll have to see >> about fixing the backend. Here's the actual shader. I extracted this using Neil's microbenchmark. This one actually includes ld_mcs. For some reason it's not able to CSE across basic blocks. Also... that stupid empty then-branch. :( I tried a hack to not emit it, but it must be getting emptied later. I'll take a stab a updating the dead CF pass to remove empty then-branches. Comments in the NIR translation code indicate that pass already handles empty else-branches. SIMD8 shader: 55 instructions. 0 loops. 226 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 880 to 576 bytes (35%) START B0 pln(8) g34<1>F g4<0,1,0>F g2<8,8,1>F { align1 1Q compacted }; pln(8) g35<1>F g4.4<0,1,0>Fg2<8,8,1>F { align1 1Q compacted }; mov(8) g10<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g12<1>D g34<8,8,1>F { align1 1Q compacted }; mov(8) g13<1>D g35<8,8,1>F { align1 1Q compacted }; send(8) g2<1>UW g12<8,8,1>F sampler ld_mcs SIMD8 Surface = 1 Sampler = 0 mlen 2 rlen 4 { align1 1Q }; mov(8) g11<1>F g2<8,8,1>F { align1 1Q compacted }; send(8) g6<1>UW g10<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; mov(8) g124<1>Fg6<8,8,1>F { align1 1Q compacted }; mov(8) g125<1>Fg7<8,8,1>F { align1 1Q compacted }; mov(8) g126<1>Fg8<8,8,1>F { align1 1Q compacted }; mov(8) g127<1>Fg9<8,8,1>F { align1 1Q compacted }; cmp.z.f0(8) null<1>UD g2<8,8,1>UD 0xUD{ align1 1Q compacted }; (+f0) if(8) JIP: 32 UIP: 424{ align1 1Q }; END B0 ->B1 ->B2 START B1 <-B0 else(8) JIP: 408UIP: 408{ align1 1Q }; END B1 ->B3 START B2 <-B0 mov(8) g14<1>F g12<8,8,1>F { align1 1Q compacted }; mov(8) g15<1>F g13<8,8,1>F { align1 1Q compacted }; mov(8) g2<1>F 1.4013e-45F { align1 1Q }; mov(8) g4<1>F g12<8,8,1>F { align1 1Q compacted }; mov(8) g5<1>F g13<8,8,1>F { align1 1Q compacted }; mov(8) g19<1>F 2.8026e-45F { align1 1Q }; mov(8) g21<1>F g12<8,8,1>F { align1 1Q compacted }; mov(8) g22<1>F g13<8,8,1>F { align1 1Q compacted }; mov(8) g23<1>F 4.2039e-45F { align1 1Q }; mov(8) g25<1>F g12<8,8,1>F { align1 1Q compacted }; mov(8) g26<1>F g13<8,8,1>F { align1 1Q compacted }; send(8) g15<1>UWg14<8,8,1>F sampler ld_mcs SIMD8 Surface = 1 Sampler = 0 mlen 2 rlen 4 { align1 1Q }; mov(8) g3<1>F g15<8,8,1>F { align1 1Q compacted }; mov(8) g20<1>F g15<8,8,1>F { align1 1Q compacted }; mov(8) g24<1>F g15<8,8,1>F { align1 1Q compacted }; send(8) g10<1>UWg2<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; send(8) g14<1>UWg19<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; send(8) g18<1>UWg23<8,8,1>F
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On 02/22/2016 02:10 PM, Matt Turner wrote: > On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanickwrote: >> On 02/19/2016 02:57 AM, Iago Toral wrote: >>> I don't know much about this, but using shader_samples_identical should >>> only give a benefit if we actually get identical samples, otherwise it >>> means more work, right? >> >> It is possible to do more work. Assuming the backend compiler does >> everything right, that amount of extra work should be trivial. The >> initial texelFetch will get the MCS data and the sample zero data. The >> textureSamplesIdenticalEXT also wants the MCS data. The backend CSE >> pass should eliminate the second fetch of MCS data. Then the added >> work is just comparing the MCS data with 0 and the flow control. >> >> Below is the BDW SIMD8 code for SAMPLES=4. I see at least 8 >> instructions that could be removed (all the moves to g124-g127). >> Without GL_EXT_shader_samples_identical, none of those extra >> instructions are there. I'll play around with the GLSL to see if I can >> get the backend to generate better code. Otherwise, we'll have to see >> about fixing the backend. >> >> SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. >> Promoted 0 constants. Compacted 816 to 544 bytes (33%) >>START B0 >> pln(8) g34<1>F g4<0,1,0>F g2<8,8,1>F { align1 1Q >> compacted }; >> pln(8) g35<1>F g4.4<0,1,0>Fg2<8,8,1>F { align1 1Q >> compacted }; >> mov(8) g6<1>F [0F, 0F, 0F, 0F]VF { align1 1Q >> compacted }; >> mov(8) g7<1>F [0F, 0F, 0F, 0F]VF { align1 1Q >> compacted }; >> mov(8) g8<1>D g34<8,8,1>F { align1 1Q >> compacted }; >> mov(8) g9<1>D g35<8,8,1>F { align1 1Q >> compacted }; >> send(8) g2<1>UW g6<8,8,1>F >> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 >> mlen 4 rlen 4 { align1 1Q }; >> mov(8) g124<1>Fg2<8,8,1>F { align1 1Q >> compacted }; >> mov(8) g125<1>Fg3<8,8,1>F { align1 1Q >> compacted }; >> mov(8) g126<1>Fg4<8,8,1>F { align1 1Q >> compacted }; >> mov(8) g127<1>Fg5<8,8,1>F { align1 1Q >> compacted }; >> mov.nz.f0(8)null<1>UD 0xUD{ align1 1Q >> }; >> (+f0) if(8) JIP: 32 UIP: 392{ align1 1Q >> }; > > This is a dead branch unless I'm missing something. The mov.nz 0x0 > sets the flag to false. > > I thought for a moment that maybe we were playing games and setting > only enabled-channels of the flag to false, but there's nothing else > that initializes the flag, so that can't be it. > > Without the seemingly-dead control flow the movs to g124-g127 before > the if would be removed. > > Have I misunderstood something? Here's what I used for a shader_runner test. #version 130 #extension GL_ARB_texture_multisample: require #extension GL_EXT_shader_samples_identical: enable #define gvec4 vec4 uniform sampler2DMS texSampler; in vec2 texCoords; out gvec4 out_color[1]; #define SAMPLES 4 #define S s4 /* The divide will happen at the end for floats. */ vec4 merge(vec4 a, vec4 b) { return a + b; } /* Reduce from N samples to N/2 samples. */ #define REDUCE(dst, src) \ do { \ if (src.length() <= SAMPLES) { \ for (i = 0; i < dst.length(); i++) \ dst[i] = merge(src[i*2], src[i*2+1]); \ } \ } while (false) void emit2(gvec4 s) { for (int i = 0; i < out_color.length(); i++) out_color[i] = s; } /* Scale the final result. */ void emit(vec4 s) { emit2(gvec4(s / float(SAMPLES))); } void main() { gvec4 s4[4], s2[2], s1[1]; ivec2 tc = ivec2(texCoords); int i; S[0] = texelFetch(texSampler, tc, 0); if (textureSamplesIdenticalEXT(texSampler, tc)) { emit2(S[0]); return; } for (i = 1; i < SAMPLES; i++)\n" S[i] = texelFetch(texSampler, tc, i);\n" REDUCE(s2, s4); REDUCE(s1, s2); emit(s1[0]); } I think it's pulling the SEND out of the end of the then-branch and the else-branch. That leaves the then-branch empty, and it doesn't get removed. The empty basic block is gone, but the instruction count is the same, if I change this to: #version 130 #extension GL_ARB_texture_multisample: require #extension GL_EXT_shader_samples_identical: enable #define gvec4 vec4 uniform sampler2DMS texSampler; in vec2 texCoords; out gvec4 out_color[1]; #define SAMPLES 4 #define S s4 /* The divide will happen at the end for floats. */ vec4 merge(vec4 a, vec4 b) { return a + b; } /* Reduce from N samples to N/2 samples. */ #define REDUCE(dst, src) \ do {
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanickwrote: > On 02/19/2016 02:57 AM, Iago Toral wrote: >> I don't know much about this, but using shader_samples_identical should >> only give a benefit if we actually get identical samples, otherwise it >> means more work, right? > > It is possible to do more work. Assuming the backend compiler does > everything right, that amount of extra work should be trivial. The > initial texelFetch will get the MCS data and the sample zero data. The > textureSamplesIdenticalEXT also wants the MCS data. The backend CSE > pass should eliminate the second fetch of MCS data. Then the added > work is just comparing the MCS data with 0 and the flow control. > > Below is the BDW SIMD8 code for SAMPLES=4. I see at least 8 > instructions that could be removed (all the moves to g124-g127). > Without GL_EXT_shader_samples_identical, none of those extra > instructions are there. I'll play around with the GLSL to see if I can > get the backend to generate better code. Otherwise, we'll have to see > about fixing the backend. > > SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. > Promoted 0 constants. Compacted 816 to 544 bytes (33%) >START B0 > pln(8) g34<1>F g4<0,1,0>F g2<8,8,1>F { align1 1Q > compacted }; > pln(8) g35<1>F g4.4<0,1,0>Fg2<8,8,1>F { align1 1Q > compacted }; > mov(8) g6<1>F [0F, 0F, 0F, 0F]VF { align1 1Q > compacted }; > mov(8) g7<1>F [0F, 0F, 0F, 0F]VF { align1 1Q > compacted }; > mov(8) g8<1>D g34<8,8,1>F { align1 1Q > compacted }; > mov(8) g9<1>D g35<8,8,1>F { align1 1Q > compacted }; > send(8) g2<1>UW g6<8,8,1>F > sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen > 4 rlen 4 { align1 1Q }; > mov(8) g124<1>Fg2<8,8,1>F { align1 1Q > compacted }; > mov(8) g125<1>Fg3<8,8,1>F { align1 1Q > compacted }; > mov(8) g126<1>Fg4<8,8,1>F { align1 1Q > compacted }; > mov(8) g127<1>Fg5<8,8,1>F { align1 1Q > compacted }; > mov.nz.f0(8)null<1>UD 0xUD{ align1 1Q }; > (+f0) if(8) JIP: 32 UIP: 392{ align1 1Q }; This is a dead branch unless I'm missing something. The mov.nz 0x0 sets the flag to false. I thought for a moment that maybe we were playing games and setting only enabled-channels of the flag to false, but there's nothing else that initializes the flag, so that can't be it. Without the seemingly-dead control flow the movs to g124-g127 before the if would be removed. Have I misunderstood something? >END B0 ->B1 ->B2 >START B1 <-B0 > else(8) JIP: 376UIP: 376{ align1 1Q }; And then of course we have no code in the "then" case anyway... :) >END B1 ->B3 >START B2 <-B0 > mov(8) g10<1>F 1.4013e-45F { align1 1Q }; > mov(8) g11<1>F [0F, 0F, 0F, 0F]VF { align1 1Q > compacted }; > mov(8) g12<1>F g8<8,8,1>F { align1 1Q > compacted }; > mov(8) g13<1>F g9<8,8,1>F { align1 1Q > compacted }; > mov(8) g14<1>F 2.8026e-45F { align1 1Q }; > mov(8) g15<1>F [0F, 0F, 0F, 0F]VF { align1 1Q > compacted }; > mov(8) g16<1>F g8<8,8,1>F { align1 1Q > compacted }; > mov(8) g17<1>F g9<8,8,1>F { align1 1Q > compacted }; > mov(8) g18<1>F 4.2039e-45F { align1 1Q }; > mov(8) g19<1>F [0F, 0F, 0F, 0F]VF { align1 1Q > compacted }; > mov(8) g20<1>F g8<8,8,1>F { align1 1Q > compacted }; > mov(8) g21<1>F g9<8,8,1>F { align1 1Q > compacted }; > send(8) g6<1>UW g10<8,8,1>F > sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen > 4 rlen 4 { align1 1Q }; > send(8) g10<1>UWg14<8,8,1>F > sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen > 4 rlen 4 { align1 1Q }; > send(8) g14<1>UWg18<8,8,1>F > sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen > 4 rlen 4 { align1 1Q }; > add(8) g18<1>F g2<8,8,1>F g6<8,8,1>F { align1 1Q > compacted }; > add(8) g22<1>F g3<8,8,1>F g7<8,8,1>F { align1 1Q > compacted }; > add(8) g26<1>F g4<8,8,1>F g8<8,8,1>F { align1 1Q > compacted }; > add(8) g30<1>F g5<8,8,1>F
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On 02/19/2016 02:57 AM, Iago Toral wrote: > I don't know much about this, but using shader_samples_identical should > only give a benefit if we actually get identical samples, otherwise it > means more work, right? It is possible to do more work. Assuming the backend compiler does everything right, that amount of extra work should be trivial. The initial texelFetch will get the MCS data and the sample zero data. The textureSamplesIdenticalEXT also wants the MCS data. The backend CSE pass should eliminate the second fetch of MCS data. Then the added work is just comparing the MCS data with 0 and the flow control. Below is the BDW SIMD8 code for SAMPLES=4. I see at least 8 instructions that could be removed (all the moves to g124-g127). Without GL_EXT_shader_samples_identical, none of those extra instructions are there. I'll play around with the GLSL to see if I can get the backend to generate better code. Otherwise, we'll have to see about fixing the backend. SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 816 to 544 bytes (33%) START B0 pln(8) g34<1>F g4<0,1,0>F g2<8,8,1>F { align1 1Q compacted }; pln(8) g35<1>F g4.4<0,1,0>Fg2<8,8,1>F { align1 1Q compacted }; mov(8) g6<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g7<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g8<1>D g34<8,8,1>F { align1 1Q compacted }; mov(8) g9<1>D g35<8,8,1>F { align1 1Q compacted }; send(8) g2<1>UW g6<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; mov(8) g124<1>Fg2<8,8,1>F { align1 1Q compacted }; mov(8) g125<1>Fg3<8,8,1>F { align1 1Q compacted }; mov(8) g126<1>Fg4<8,8,1>F { align1 1Q compacted }; mov(8) g127<1>Fg5<8,8,1>F { align1 1Q compacted }; mov.nz.f0(8)null<1>UD 0xUD{ align1 1Q }; (+f0) if(8) JIP: 32 UIP: 392{ align1 1Q }; END B0 ->B1 ->B2 START B1 <-B0 else(8) JIP: 376UIP: 376{ align1 1Q }; END B1 ->B3 START B2 <-B0 mov(8) g10<1>F 1.4013e-45F { align1 1Q }; mov(8) g11<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g12<1>F g8<8,8,1>F { align1 1Q compacted }; mov(8) g13<1>F g9<8,8,1>F { align1 1Q compacted }; mov(8) g14<1>F 2.8026e-45F { align1 1Q }; mov(8) g15<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g16<1>F g8<8,8,1>F { align1 1Q compacted }; mov(8) g17<1>F g9<8,8,1>F { align1 1Q compacted }; mov(8) g18<1>F 4.2039e-45F { align1 1Q }; mov(8) g19<1>F [0F, 0F, 0F, 0F]VF { align1 1Q compacted }; mov(8) g20<1>F g8<8,8,1>F { align1 1Q compacted }; mov(8) g21<1>F g9<8,8,1>F { align1 1Q compacted }; send(8) g6<1>UW g10<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; send(8) g10<1>UWg14<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; send(8) g14<1>UWg18<8,8,1>F sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 rlen 4 { align1 1Q }; add(8) g18<1>F g2<8,8,1>F g6<8,8,1>F { align1 1Q compacted }; add(8) g22<1>F g3<8,8,1>F g7<8,8,1>F { align1 1Q compacted }; add(8) g26<1>F g4<8,8,1>F g8<8,8,1>F { align1 1Q compacted }; add(8) g30<1>F g5<8,8,1>F g9<8,8,1>F { align1 1Q compacted }; add(8) g19<1>F g10<8,8,1>F g14<8,8,1>F { align1 1Q compacted }; add(8) g23<1>F g11<8,8,1>F g15<8,8,1>F { align1 1Q compacted }; add(8) g27<1>F g12<8,8,1>F g16<8,8,1>F { align1 1Q compacted }; add(8) g31<1>F g13<8,8,1>F g17<8,8,1>F { align1 1Q compacted }; add(8) g20<1>F g18<8,8,1>F g19<8,8,1>F { align1 1Q compacted }; add(8) g24<1>F g22<8,8,1>F g23<8,8,1>F { align1 1Q compacted }; add(8) g28<1>F g26<8,8,1>F g27<8,8,1>F { align1 1Q compacted }; add(8) g32<1>F g30<8,8,1>F
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
Am 19.02.2016 um 11:57 schrieb Iago Toral: > On Thu, 2016-02-18 at 16:13 -0800, Ian Romanick wrote: >> On 02/18/2016 01:47 PM, Ian Romanick wrote: >>> On 02/18/2016 08:44 AM, Neil Roberts wrote: I made a pathological test case (attached) which repeatedly renders into an MSAA FBO and then blits it to the screen and measures the framerate. It checks it with a range of different sample counts. The rendering is done either by rendering two triangles to fill the framebuffer or by calling glClear. The percentage increase in framerate after applying the patch is like this: With triangles to fill buffer: 16 62,27% 8 48,59% 4 27,72% 2 5,34% 0 0,58% With glClear: 16 -5,20% 8 -7,08% 4 -2,45% 2 -20,76% 0 3,71% >>> >>> I find this result interesting. In the samples=0 case, the before and >>> after shaders should be identical. I dug into this a bit, and I think >>> the problem is the previous patch. Using do { ... } while (false) in >>> the macro makes the compiler generate some really, really bad code. >>> Just deleting those two lines makes a huge difference. >>> >>> With do { ... } while (false): >>> >>> SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills. >>> Promoted 0 constants. Compacted 3648 to 2192 bytes (40%) >>> >>> Without: >>> >>> SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills. >>> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) >>> SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills. >>> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) >>> >>> This is for samples=16 and sampler2DMS. The cycle counts are bogus >>> because the 4 loops have their cycle counts statically multiplied by >>> 10... even though the loop will only execute once. >>> >>> So... I'll try some more experiments. I also wonder about changing the >>> SAMPLES > 1 to SAMPLES > 2. >> >> This is weird, and I'm not sure what to make of it. I ran your test at >> a bunch of commits on my branch. Getting rid of gl_FragColor was the >> big win. The GL_EXT_shader_samples_identical patch is still the big >> loss. I'm assuming that causes enough register pressure over the >> previous commit to lose SIMD16. The following patch just removes the >> "do {" and "} while (false)". That gets back some of the losses, >> but not all of it. > > I don't know much about this, but using shader_samples_identical should > only give a benefit if we actually get identical samples, otherwise it > means more work, right? I noticed that the test renders a quad with > random colors for each vertex that will be interpolated across the > region, so could it be that we are hitting few cases of identical > samples? Shouldn't the results be expected to be better if we rendered a > flat color instead? Unless you run that shader drawing random colors at per-sample frequency (using per-sample interpolation), you still get identical samples (for each pixel). Except at the edges of the quad. (I didn't actually look at the test...) Roland > >> The raw per-commit data is below. >> >> 5c7f974 Android: disable unused-parameter warning >> With triangles to fill buffer: >> >> 8,419.603882 >> 4,633.834045 >> 2,595.628113 >> 0,795.798157 >> >> With glClear: >> >> 8,450.937958 >> 4,594.000610 >> 2,641.354553 >> 0,767.400818 >> >> 89b8c0a meta/blit: Replace the gl_FragColor write >> With triangles to fill buffer: >> >> 8,597.943054 >> 4,803.083862 >> 2,942.862549 >> 0,1129.432983 >> >> With glClear: >> >> 8,643.500671 >> 4,980.392151 >> 2,982.704407 >> 0,1116.569946 >> >> 7025d43 meta/blit: Don't dynamically select the MSAA "merge" function >> With triangles to fill buffer: >> >> 8,599.412598 >> 4,954.745056 >> 2,933.184021 >> 0,1129.305420 >> >> With glClear: >> >> 8,582.106079 >> 4,965.810303 >> 2,975.419434 >> 0,1169.180420 >> >> b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits >> With triangles to fill buffer: >> >> 8,591.786011 >> 4,932.835815 >> 2,910.498047 >> 0,1072.501099 >> >> With glClear: >> >> 8,623.441406 >> 4,952.562378 >> 2,941.442261 >> 0,1081.314819 >> >> 197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve >> blit >> With triangles to fill buffer: >> >> 8,559.252869 >> 4,795.355103 >> 2,781.494202 >> 0,941.619568 >> >> With glClear: >> >> 8,577.400513 >> 4,806.321533 >> 2,811.030029 >> 0,957.579224 >> >> ec01613 fix loop nonsense derp >> With triangles to fill buffer: >> >> 8,559.221558 >> 4,788.767944 >> 2,786.287170 >> 0,1078.632324 >> >> With glClear: >> >> 8,575.771545 >> 4,956.205750 >> 2,813.206482 >> 0,.111084 >> >>> I will probably also make a patch to fix the damage done by the do { ... >>> } while (false). That's just comedy. >>> It seems like a pretty convincing win for the triangle case but the clear case
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On Fri, 2016-02-19 at 11:47 +, Neil Roberts wrote: > Iago Toralwrites: > > > I don't know much about this, but using shader_samples_identical > > should only give a benefit if we actually get identical samples, > > otherwise it means more work, right? I noticed that the test renders a > > quad with random colors for each vertex that will be interpolated > > across the region, so could it be that we are hitting few cases of > > identical samples? Shouldn't the results be expected to be better if > > we rendered a flat color instead? > > It should all be identical regardless of the color chosen because it is > doing per-pixel shading rather than per-sample which means the fragment > shader is only run once per pixel and the same value is written to all > of the samples. The only case where you get differing samples normally > is at the edges of a polygon where the single color is only written to a > few of the samples. > > In this case it is even more identical because the random color is set > on a uniform rather than a vertex so it is actually using the same color > for the whole framebuffer. However, it is rendered as two triangles > which might mean that along the diagonal of the framebuffer where the > two triangles meet there will be a polygon edge which means the hardware > might not recognise that these all the same color, but that should only > be a few pixels and probably won't affect the results very much. You're right, I read through the test code too fast :) Iago ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
Iago Toralwrites: > I don't know much about this, but using shader_samples_identical > should only give a benefit if we actually get identical samples, > otherwise it means more work, right? I noticed that the test renders a > quad with random colors for each vertex that will be interpolated > across the region, so could it be that we are hitting few cases of > identical samples? Shouldn't the results be expected to be better if > we rendered a flat color instead? It should all be identical regardless of the color chosen because it is doing per-pixel shading rather than per-sample which means the fragment shader is only run once per pixel and the same value is written to all of the samples. The only case where you get differing samples normally is at the edges of a polygon where the single color is only written to a few of the samples. In this case it is even more identical because the random color is set on a uniform rather than a vertex so it is actually using the same color for the whole framebuffer. However, it is rendered as two triangles which might mean that along the diagonal of the framebuffer where the two triangles meet there will be a polygon edge which means the hardware might not recognise that these all the same color, but that should only be a few pixels and probably won't affect the results very much. Regards, - Neil ___ mesa-dev mailing list mesa-dev@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/mesa-dev
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On Thu, 2016-02-18 at 16:13 -0800, Ian Romanick wrote: > On 02/18/2016 01:47 PM, Ian Romanick wrote: > > On 02/18/2016 08:44 AM, Neil Roberts wrote: > >> I made a pathological test case (attached) which repeatedly renders into > >> an MSAA FBO and then blits it to the screen and measures the framerate. > >> It checks it with a range of different sample counts. The rendering is > >> done either by rendering two triangles to fill the framebuffer or by > >> calling glClear. > >> > >> The percentage increase in framerate after applying the patch is like > >> this: > >> > >> With triangles to fill buffer: > >> > >> 16 62,27% > >> 8 48,59% > >> 4 27,72% > >> 2 5,34% > >> 0 0,58% > >> > >> With glClear: > >> > >> 16 -5,20% > >> 8 -7,08% > >> 4 -2,45% > >> 2 -20,76% > >> 0 3,71% > > > > I find this result interesting. In the samples=0 case, the before and > > after shaders should be identical. I dug into this a bit, and I think > > the problem is the previous patch. Using do { ... } while (false) in > > the macro makes the compiler generate some really, really bad code. > > Just deleting those two lines makes a huge difference. > > > > With do { ... } while (false): > > > > SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills. > > Promoted 0 constants. Compacted 3648 to 2192 bytes (40%) > > > > Without: > > > > SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills. > > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) > > SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills. > > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) > > > > This is for samples=16 and sampler2DMS. The cycle counts are bogus > > because the 4 loops have their cycle counts statically multiplied by > > 10... even though the loop will only execute once. > > > > So... I'll try some more experiments. I also wonder about changing the > > SAMPLES > 1 to SAMPLES > 2. > > This is weird, and I'm not sure what to make of it. I ran your test at > a bunch of commits on my branch. Getting rid of gl_FragColor was the > big win. The GL_EXT_shader_samples_identical patch is still the big > loss. I'm assuming that causes enough register pressure over the > previous commit to lose SIMD16. The following patch just removes the > "do {" and "} while (false)". That gets back some of the losses, > but not all of it. I don't know much about this, but using shader_samples_identical should only give a benefit if we actually get identical samples, otherwise it means more work, right? I noticed that the test renders a quad with random colors for each vertex that will be interpolated across the region, so could it be that we are hitting few cases of identical samples? Shouldn't the results be expected to be better if we rendered a flat color instead? > The raw per-commit data is below. > > 5c7f974 Android: disable unused-parameter warning > With triangles to fill buffer: > > 8,419.603882 > 4,633.834045 > 2,595.628113 > 0,795.798157 > > With glClear: > > 8,450.937958 > 4,594.000610 > 2,641.354553 > 0,767.400818 > > 89b8c0a meta/blit: Replace the gl_FragColor write > With triangles to fill buffer: > > 8,597.943054 > 4,803.083862 > 2,942.862549 > 0,1129.432983 > > With glClear: > > 8,643.500671 > 4,980.392151 > 2,982.704407 > 0,1116.569946 > > 7025d43 meta/blit: Don't dynamically select the MSAA "merge" function > With triangles to fill buffer: > > 8,599.412598 > 4,954.745056 > 2,933.184021 > 0,1129.305420 > > With glClear: > > 8,582.106079 > 4,965.810303 > 2,975.419434 > 0,1169.180420 > > b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits > With triangles to fill buffer: > > 8,591.786011 > 4,932.835815 > 2,910.498047 > 0,1072.501099 > > With glClear: > > 8,623.441406 > 4,952.562378 > 2,941.442261 > 0,1081.314819 > > 197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit > With triangles to fill buffer: > > 8,559.252869 > 4,795.355103 > 2,781.494202 > 0,941.619568 > > With glClear: > > 8,577.400513 > 4,806.321533 > 2,811.030029 > 0,957.579224 > > ec01613 fix loop nonsense derp > With triangles to fill buffer: > > 8,559.221558 > 4,788.767944 > 2,786.287170 > 0,1078.632324 > > With glClear: > > 8,575.771545 > 4,956.205750 > 2,813.206482 > 0,.111084 > > > I will probably also make a patch to fix the damage done by the do { ... > > } while (false). That's just comedy. > > > >> It seems like a pretty convincing win for the triangle case but the > >> clear case makes it slightly worse. Presumably this is because we don't > >> do anything to detect the value stored in the MCS buffer when doing a > >> fast clear so the fast path isn't taken and the shader being more > >> complicated makes it slower. > >> > >> Not sure if we want to try and do anything about that because > >> potentially the cleared pixels aren't very common in a
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On 02/18/2016 01:47 PM, Ian Romanick wrote: > On 02/18/2016 08:44 AM, Neil Roberts wrote: >> I made a pathological test case (attached) which repeatedly renders into >> an MSAA FBO and then blits it to the screen and measures the framerate. >> It checks it with a range of different sample counts. The rendering is >> done either by rendering two triangles to fill the framebuffer or by >> calling glClear. >> >> The percentage increase in framerate after applying the patch is like >> this: >> >> With triangles to fill buffer: >> >> 16 62,27% >> 8 48,59% >> 4 27,72% >> 2 5,34% >> 0 0,58% >> >> With glClear: >> >> 16 -5,20% >> 8 -7,08% >> 4 -2,45% >> 2 -20,76% >> 0 3,71% > > I find this result interesting. In the samples=0 case, the before and > after shaders should be identical. I dug into this a bit, and I think > the problem is the previous patch. Using do { ... } while (false) in > the macro makes the compiler generate some really, really bad code. > Just deleting those two lines makes a huge difference. > > With do { ... } while (false): > > SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills. > Promoted 0 constants. Compacted 3648 to 2192 bytes (40%) > > Without: > > SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills. > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) > SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills. > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) > > This is for samples=16 and sampler2DMS. The cycle counts are bogus > because the 4 loops have their cycle counts statically multiplied by > 10... even though the loop will only execute once. > > So... I'll try some more experiments. I also wonder about changing the > SAMPLES > 1 to SAMPLES > 2. This is weird, and I'm not sure what to make of it. I ran your test at a bunch of commits on my branch. Getting rid of gl_FragColor was the big win. The GL_EXT_shader_samples_identical patch is still the big loss. I'm assuming that causes enough register pressure over the previous commit to lose SIMD16. The following patch just removes the "do {" and "} while (false)". That gets back some of the losses, but not all of it. The raw per-commit data is below. 5c7f974 Android: disable unused-parameter warning With triangles to fill buffer: 8,419.603882 4,633.834045 2,595.628113 0,795.798157 With glClear: 8,450.937958 4,594.000610 2,641.354553 0,767.400818 89b8c0a meta/blit: Replace the gl_FragColor write With triangles to fill buffer: 8,597.943054 4,803.083862 2,942.862549 0,1129.432983 With glClear: 8,643.500671 4,980.392151 2,982.704407 0,1116.569946 7025d43 meta/blit: Don't dynamically select the MSAA "merge" function With triangles to fill buffer: 8,599.412598 4,954.745056 2,933.184021 0,1129.305420 With glClear: 8,582.106079 4,965.810303 2,975.419434 0,1169.180420 b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits With triangles to fill buffer: 8,591.786011 4,932.835815 2,910.498047 0,1072.501099 With glClear: 8,623.441406 4,952.562378 2,941.442261 0,1081.314819 197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit With triangles to fill buffer: 8,559.252869 4,795.355103 2,781.494202 0,941.619568 With glClear: 8,577.400513 4,806.321533 2,811.030029 0,957.579224 ec01613 fix loop nonsense derp With triangles to fill buffer: 8,559.221558 4,788.767944 2,786.287170 0,1078.632324 With glClear: 8,575.771545 4,956.205750 2,813.206482 0,.111084 > I will probably also make a patch to fix the damage done by the do { ... > } while (false). That's just comedy. > >> It seems like a pretty convincing win for the triangle case but the >> clear case makes it slightly worse. Presumably this is because we don't >> do anything to detect the value stored in the MCS buffer when doing a >> fast clear so the fast path isn't taken and the shader being more >> complicated makes it slower. >> >> Not sure if we want to try and do anything about that because >> potentially the cleared pixels aren't very common in a framebuffer from >> a real use case so it might not really matter. >> >> Currently we don't use SIMD16 for 16x MSAA because we can't allocate the >> registers well enough to make it worthwhile. This patch makes that >> problem a bit more interesting because even if we end up spilling a lot >> it might still be worth doing SIMD16 because the cases where the spilled >> instructions are hit would be much less common. >> >> - Neil >> >> Ian Romanickwrites: >> >>> From: Ian Romanick >>> >>> Somewhat surprisingly, this didn't have any affect on performance in the >>> benchmarks that Martin tried for me. >>> >>> Signed-off-by: Ian Romanick >>> --- >>> src/mesa/drivers/common/meta_blit.c | 10 +- >>> 1 file changed, 9
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
On 02/18/2016 08:44 AM, Neil Roberts wrote: > I made a pathological test case (attached) which repeatedly renders into > an MSAA FBO and then blits it to the screen and measures the framerate. > It checks it with a range of different sample counts. The rendering is > done either by rendering two triangles to fill the framebuffer or by > calling glClear. > > The percentage increase in framerate after applying the patch is like > this: > > With triangles to fill buffer: > > 16 62,27% > 8 48,59% > 4 27,72% > 2 5,34% > 0 0,58% > > With glClear: > > 16 -5,20% > 8 -7,08% > 4 -2,45% > 2 -20,76% > 0 3,71% I find this result interesting. In the samples=0 case, the before and after shaders should be identical. I dug into this a bit, and I think the problem is the previous patch. Using do { ... } while (false) in the macro makes the compiler generate some really, really bad code. Just deleting those two lines makes a huge difference. With do { ... } while (false): SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 3648 to 2192 bytes (40%) Without: SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills. Promoted 0 constants. Compacted 2544 to 1600 bytes (37%) This is for samples=16 and sampler2DMS. The cycle counts are bogus because the 4 loops have their cycle counts statically multiplied by 10... even though the loop will only execute once. So... I'll try some more experiments. I also wonder about changing the SAMPLES > 1 to SAMPLES > 2. I will probably also make a patch to fix the damage done by the do { ... } while (false). That's just comedy. > It seems like a pretty convincing win for the triangle case but the > clear case makes it slightly worse. Presumably this is because we don't > do anything to detect the value stored in the MCS buffer when doing a > fast clear so the fast path isn't taken and the shader being more > complicated makes it slower. > > Not sure if we want to try and do anything about that because > potentially the cleared pixels aren't very common in a framebuffer from > a real use case so it might not really matter. > > Currently we don't use SIMD16 for 16x MSAA because we can't allocate the > registers well enough to make it worthwhile. This patch makes that > problem a bit more interesting because even if we end up spilling a lot > it might still be worth doing SIMD16 because the cases where the spilled > instructions are hit would be much less common. > > - Neil > > > > > Ian Romanickwrites: > >> From: Ian Romanick >> >> Somewhat surprisingly, this didn't have any affect on performance in the >> benchmarks that Martin tried for me. >> >> Signed-off-by: Ian Romanick >> --- >> src/mesa/drivers/common/meta_blit.c | 10 +- >> 1 file changed, 9 insertions(+), 1 deletion(-) >> >> diff --git a/src/mesa/drivers/common/meta_blit.c >> b/src/mesa/drivers/common/meta_blit.c >> index 28aabd3..c0ec51f 100644 >> --- a/src/mesa/drivers/common/meta_blit.c >> +++ b/src/mesa/drivers/common/meta_blit.c >> @@ -530,6 +530,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, >> fs_source = ralloc_asprintf(mem_ctx, >> "#version 130\n" >> "#extension >> GL_ARB_texture_multisample: require\n" >> + "#extension >> GL_EXT_shader_samples_identical: enable\n" >> "#define gvec4 %svec4\n" >> "uniform %ssampler2DMS%s texSampler;\n" >> "in %s texCoords;\n" >> @@ -569,7 +570,14 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx, >> " i%s tc = i%s(texCoords);\n" >> " int i;\n" >> "\n" >> - " for (i = 0; i < SAMPLES; i++)\n" >> + " S[0] = texelFetch(texSampler, tc, >> 0);\n" >> + "#if >> defined(GL_EXT_shader_samples_identical) && SAMPLES > 1\n" >> + " if >> (textureSamplesIdenticalEXT(texSampler, tc)) {\n" >> + " emit2(S[0]);\n" >> + " return;\n" >> + " }\n" >> + "#endif\n" >> + " for (i = 1; i < SAMPLES; i++)\n" >> " S[i] = texelFetch(texSampler, >> tc, i);\n" >>
Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
I made a pathological test case (attached) which repeatedly renders into an MSAA FBO and then blits it to the screen and measures the framerate. It checks it with a range of different sample counts. The rendering is done either by rendering two triangles to fill the framebuffer or by calling glClear. The percentage increase in framerate after applying the patch is like this: With triangles to fill buffer: 16 62,27% 8 48,59% 4 27,72% 2 5,34% 0 0,58% With glClear: 16 -5,20% 8 -7,08% 4 -2,45% 2 -20,76% 0 3,71% It seems like a pretty convincing win for the triangle case but the clear case makes it slightly worse. Presumably this is because we don't do anything to detect the value stored in the MCS buffer when doing a fast clear so the fast path isn't taken and the shader being more complicated makes it slower. Not sure if we want to try and do anything about that because potentially the cleared pixels aren't very common in a framebuffer from a real use case so it might not really matter. Currently we don't use SIMD16 for 16x MSAA because we can't allocate the registers well enough to make it worthwhile. This patch makes that problem a bit more interesting because even if we end up spilling a lot it might still be worth doing SIMD16 because the cases where the spilled instructions are hit would be much less common. - Neil #include #include #include #include #include #define N_FRAMES_TO_SKIP 1000 #define N_FRAMES_TO_DRAW 1 enum draw_mode { DRAW_MODE_TRIANGLES, DRAW_MODE_CLEAR }; struct data { SDL_Window *window; SDL_GLContext gl_context; }; static SDL_GLContext create_gl_context(SDL_Window *window) { /* First try creating a core context because if we get one it * can be more efficient. */ SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3); SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1); SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK, SDL_GL_CONTEXT_PROFILE_CORE); return SDL_GL_CreateContext(window); } static const char vertex_shader_source[] = "#version 150\n" "in vec2 piglit_vertex;\n" "\n" "void\n" "main()\n" "{\n" "gl_Position = vec4(piglit_vertex, 0.0, 1.0);\n" "}\n"; static const char fragment_shader_source[] = "#version 150\n" "uniform vec3 color;\n" "\n" "void\n" "main()\n" "{\n" "gl_FragColor = vec4(color, 1.0);\n" "}\n"; static GLuint create_shader(const char *name, GLenum type, const char *source, int source_length) { GLuint shader; GLint length, compile_status; GLsizei actual_length; GLchar *info_log; shader = glCreateShader(type); glShaderSource(shader, 1, /* n_strings */ , _length); glCompileShader(shader); glGetShaderiv(shader, GL_INFO_LOG_LENGTH, ); if (length > 0) { info_log = malloc(length); glGetShaderInfoLog(shader, length, _length, info_log); if (*info_log) { fprintf(stderr, "Info log for %s:\n%s\n", name, info_log); } free(info_log); } glGetShaderiv(shader, GL_COMPILE_STATUS, _status); if (!compile_status) { fprintf(stderr, "%s compilation failed", name); glDeleteShader(shader); return 0; } return shader; } static bool link_program(GLuint program) { GLint length, link_status; GLsizei actual_length; GLchar *info_log; glLinkProgram(program); glGetProgramiv(program, GL_INFO_LOG_LENGTH, ); if (length > 0) { info_log = malloc(length); glGetProgramInfoLog(program, length, _length, info_log); if (*info_log) fprintf(stderr, "Link info log:\n%s\n", info_log); free(info_log); } glGetProgramiv(program, GL_LINK_STATUS, _status); if (!link_status) { fprintf(stderr, "program link failed"); return false; } return true; } static bool run_test(struct data *data, int n_samples, enum draw_mode draw_mode, float *result) { float verts[] = { -1.0f, -1.0f, 1.0f, -1.0f, -1.0f, 1.0f, 1.0f, 1.0f }; GLuint