Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-22 Thread Ian Romanick
On 02/22/2016 02:10 PM, Matt Turner wrote:
> On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanick  wrote:
>> On 02/19/2016 02:57 AM, Iago Toral wrote:
>>> I don't know much about this, but using shader_samples_identical should
>>> only give a benefit if we actually get identical samples, otherwise it
>>> means more work, right?
>>
>> It is possible to do more work.  Assuming the backend compiler does
>> everything right, that amount of extra work should be trivial.  The
>> initial texelFetch will get the MCS data and the sample zero data.  The
>> textureSamplesIdenticalEXT also wants the MCS data.  The backend CSE
>> pass should eliminate the second fetch of MCS data.  Then the added
>> work is just comparing the MCS data with 0 and the flow control.
>>
>> Below is the BDW SIMD8 code for SAMPLES=4.  I see at least 8
>> instructions that could be removed (all the moves to g124-g127).
>> Without GL_EXT_shader_samples_identical, none of those extra
>> instructions are there.  I'll play around with the GLSL to see if I can
>> get the backend to generate better code.  Otherwise, we'll have to see
>> about fixing the backend.

Here's the actual shader.  I extracted this using Neil's
microbenchmark.  This one actually includes ld_mcs.  For some reason
it's not able to CSE across basic blocks.

Also... that stupid empty then-branch. :(  I tried a hack to not emit
it, but it must be getting emptied later.  I'll take a stab a updating
the dead CF pass to remove empty then-branches.  Comments in the NIR
translation code indicate that pass already handles empty else-branches.

SIMD8 shader: 55 instructions. 0 loops. 226 cycles. 0:0 spills:fills. Promoted 
0 constants. Compacted 880 to 576 bytes (35%)
   START B0
pln(8)  g34<1>F g4<0,1,0>F  g2<8,8,1>F  { align1 1Q 
compacted };
pln(8)  g35<1>F g4.4<0,1,0>Fg2<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g10<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g12<1>D g34<8,8,1>F { align1 1Q 
compacted };
mov(8)  g13<1>D g35<8,8,1>F { align1 1Q 
compacted };
send(8) g2<1>UW g12<8,8,1>F
sampler ld_mcs SIMD8 Surface = 1 Sampler = 0 mlen 2 
rlen 4 { align1 1Q };
mov(8)  g11<1>F g2<8,8,1>F  { align1 1Q 
compacted };
send(8) g6<1>UW g10<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
mov(8)  g124<1>Fg6<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g125<1>Fg7<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g126<1>Fg8<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g127<1>Fg9<8,8,1>F  { align1 1Q 
compacted };
cmp.z.f0(8) null<1>UD   g2<8,8,1>UD 0xUD{ align1 1Q 
compacted };
(+f0) if(8) JIP: 32 UIP: 424{ align1 1Q };
   END B0 ->B1 ->B2
   START B1 <-B0
else(8) JIP: 408UIP: 408{ align1 1Q };
   END B1 ->B3
   START B2 <-B0
mov(8)  g14<1>F g12<8,8,1>F { align1 1Q 
compacted };
mov(8)  g15<1>F g13<8,8,1>F { align1 1Q 
compacted };
mov(8)  g2<1>F  1.4013e-45F { align1 1Q };
mov(8)  g4<1>F  g12<8,8,1>F { align1 1Q 
compacted };
mov(8)  g5<1>F  g13<8,8,1>F { align1 1Q 
compacted };
mov(8)  g19<1>F 2.8026e-45F { align1 1Q };
mov(8)  g21<1>F g12<8,8,1>F { align1 1Q 
compacted };
mov(8)  g22<1>F g13<8,8,1>F { align1 1Q 
compacted };
mov(8)  g23<1>F 4.2039e-45F { align1 1Q };
mov(8)  g25<1>F g12<8,8,1>F { align1 1Q 
compacted };
mov(8)  g26<1>F g13<8,8,1>F { align1 1Q 
compacted };
send(8) g15<1>UWg14<8,8,1>F
sampler ld_mcs SIMD8 Surface = 1 Sampler = 0 mlen 2 
rlen 4 { align1 1Q };
mov(8)  g3<1>F  g15<8,8,1>F { align1 1Q 
compacted };
mov(8)  g20<1>F g15<8,8,1>F { align1 1Q 
compacted };
mov(8)  g24<1>F g15<8,8,1>F { align1 1Q 
compacted };
send(8) g10<1>UWg2<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
send(8) g14<1>UWg19<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
send(8) g18<1>UWg23<8,8,1>F

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-22 Thread Ian Romanick
On 02/22/2016 02:10 PM, Matt Turner wrote:
> On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanick  wrote:
>> On 02/19/2016 02:57 AM, Iago Toral wrote:
>>> I don't know much about this, but using shader_samples_identical should
>>> only give a benefit if we actually get identical samples, otherwise it
>>> means more work, right?
>>
>> It is possible to do more work.  Assuming the backend compiler does
>> everything right, that amount of extra work should be trivial.  The
>> initial texelFetch will get the MCS data and the sample zero data.  The
>> textureSamplesIdenticalEXT also wants the MCS data.  The backend CSE
>> pass should eliminate the second fetch of MCS data.  Then the added
>> work is just comparing the MCS data with 0 and the flow control.
>>
>> Below is the BDW SIMD8 code for SAMPLES=4.  I see at least 8
>> instructions that could be removed (all the moves to g124-g127).
>> Without GL_EXT_shader_samples_identical, none of those extra
>> instructions are there.  I'll play around with the GLSL to see if I can
>> get the backend to generate better code.  Otherwise, we'll have to see
>> about fixing the backend.
>>
>> SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. 
>> Promoted 0 constants. Compacted 816 to 544 bytes (33%)
>>START B0
>> pln(8)  g34<1>F g4<0,1,0>F  g2<8,8,1>F  { align1 1Q 
>> compacted };
>> pln(8)  g35<1>F g4.4<0,1,0>Fg2<8,8,1>F  { align1 1Q 
>> compacted };
>> mov(8)  g6<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
>> compacted };
>> mov(8)  g7<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
>> compacted };
>> mov(8)  g8<1>D  g34<8,8,1>F { align1 1Q 
>> compacted };
>> mov(8)  g9<1>D  g35<8,8,1>F { align1 1Q 
>> compacted };
>> send(8) g2<1>UW g6<8,8,1>F
>> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 
>> mlen 4 rlen 4 { align1 1Q };
>> mov(8)  g124<1>Fg2<8,8,1>F  { align1 1Q 
>> compacted };
>> mov(8)  g125<1>Fg3<8,8,1>F  { align1 1Q 
>> compacted };
>> mov(8)  g126<1>Fg4<8,8,1>F  { align1 1Q 
>> compacted };
>> mov(8)  g127<1>Fg5<8,8,1>F  { align1 1Q 
>> compacted };
>> mov.nz.f0(8)null<1>UD   0xUD{ align1 1Q 
>> };
>> (+f0) if(8) JIP: 32 UIP: 392{ align1 1Q 
>> };
> 
> This is a dead branch unless I'm missing something. The mov.nz 0x0
> sets the flag to false.
> 
> I thought for a moment that maybe we were playing games and setting
> only enabled-channels of the flag to false, but there's nothing else
> that initializes the flag, so that can't be it.
> 
> Without the seemingly-dead control flow the movs to g124-g127 before
> the if would be removed.
> 
> Have I misunderstood something?

Here's what I used for a shader_runner test.

#version 130
#extension GL_ARB_texture_multisample: require
#extension GL_EXT_shader_samples_identical: enable
#define gvec4 vec4
uniform sampler2DMS texSampler;
in vec2 texCoords;
out gvec4 out_color[1];
#define SAMPLES 4
#define S s4

/* The divide will happen at the end for floats. */
vec4 merge(vec4 a, vec4 b) { return a + b; }

/* Reduce from N samples to N/2 samples. */
#define REDUCE(dst, src)  \
   do {   \
  if (src.length() <= SAMPLES) {  \
 for (i = 0; i < dst.length(); i++)   \
dst[i] = merge(src[i*2], src[i*2+1]); \
  }   \
   } while (false)

void emit2(gvec4 s) { for (int i = 0; i < out_color.length(); i++)
out_color[i] = s; }
/* Scale the final result. */
void emit(vec4 s) { emit2(gvec4(s / float(SAMPLES))); }

void main()
{
   gvec4 s4[4], s2[2], s1[1];
   ivec2 tc = ivec2(texCoords);
   int i;

   S[0] = texelFetch(texSampler, tc, 0);
   if (textureSamplesIdenticalEXT(texSampler, tc)) {
  emit2(S[0]);
  return;
   }

   for (i = 1; i < SAMPLES; i++)\n"
  S[i] = texelFetch(texSampler, tc, i);\n"

   REDUCE(s2, s4);
   REDUCE(s1, s2);
   emit(s1[0]);
}

I think it's pulling the SEND out of the end of the then-branch and the
else-branch.  That leaves the then-branch empty, and it doesn't get removed.

The empty basic block is gone, but the instruction count is the same, if
I change this to:

#version 130
#extension GL_ARB_texture_multisample: require
#extension GL_EXT_shader_samples_identical: enable
#define gvec4 vec4
uniform sampler2DMS texSampler;
in vec2 texCoords;
out gvec4 out_color[1];
#define SAMPLES 4
#define S s4

/* The divide will happen at the end for floats. */
vec4 merge(vec4 a, vec4 b) { return a + b; }

/* Reduce from N samples to N/2 samples. */
#define REDUCE(dst, src)  \
   do {  

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-22 Thread Matt Turner
On Mon, Feb 22, 2016 at 1:05 PM, Ian Romanick  wrote:
> On 02/19/2016 02:57 AM, Iago Toral wrote:
>> I don't know much about this, but using shader_samples_identical should
>> only give a benefit if we actually get identical samples, otherwise it
>> means more work, right?
>
> It is possible to do more work.  Assuming the backend compiler does
> everything right, that amount of extra work should be trivial.  The
> initial texelFetch will get the MCS data and the sample zero data.  The
> textureSamplesIdenticalEXT also wants the MCS data.  The backend CSE
> pass should eliminate the second fetch of MCS data.  Then the added
> work is just comparing the MCS data with 0 and the flow control.
>
> Below is the BDW SIMD8 code for SAMPLES=4.  I see at least 8
> instructions that could be removed (all the moves to g124-g127).
> Without GL_EXT_shader_samples_identical, none of those extra
> instructions are there.  I'll play around with the GLSL to see if I can
> get the backend to generate better code.  Otherwise, we'll have to see
> about fixing the backend.
>
> SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. 
> Promoted 0 constants. Compacted 816 to 544 bytes (33%)
>START B0
> pln(8)  g34<1>F g4<0,1,0>F  g2<8,8,1>F  { align1 1Q 
> compacted };
> pln(8)  g35<1>F g4.4<0,1,0>Fg2<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g6<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
> compacted };
> mov(8)  g7<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
> compacted };
> mov(8)  g8<1>D  g34<8,8,1>F { align1 1Q 
> compacted };
> mov(8)  g9<1>D  g35<8,8,1>F { align1 1Q 
> compacted };
> send(8) g2<1>UW g6<8,8,1>F
> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 
> 4 rlen 4 { align1 1Q };
> mov(8)  g124<1>Fg2<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g125<1>Fg3<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g126<1>Fg4<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g127<1>Fg5<8,8,1>F  { align1 1Q 
> compacted };
> mov.nz.f0(8)null<1>UD   0xUD{ align1 1Q };
> (+f0) if(8) JIP: 32 UIP: 392{ align1 1Q };

This is a dead branch unless I'm missing something. The mov.nz 0x0
sets the flag to false.

I thought for a moment that maybe we were playing games and setting
only enabled-channels of the flag to false, but there's nothing else
that initializes the flag, so that can't be it.

Without the seemingly-dead control flow the movs to g124-g127 before
the if would be removed.

Have I misunderstood something?

>END B0 ->B1 ->B2
>START B1 <-B0
> else(8) JIP: 376UIP: 376{ align1 1Q };

And then of course we have no code in the "then" case anyway... :)

>END B1 ->B3
>START B2 <-B0
> mov(8)  g10<1>F 1.4013e-45F { align1 1Q };
> mov(8)  g11<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
> compacted };
> mov(8)  g12<1>F g8<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g13<1>F g9<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g14<1>F 2.8026e-45F { align1 1Q };
> mov(8)  g15<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
> compacted };
> mov(8)  g16<1>F g8<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g17<1>F g9<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g18<1>F 4.2039e-45F { align1 1Q };
> mov(8)  g19<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
> compacted };
> mov(8)  g20<1>F g8<8,8,1>F  { align1 1Q 
> compacted };
> mov(8)  g21<1>F g9<8,8,1>F  { align1 1Q 
> compacted };
> send(8) g6<1>UW g10<8,8,1>F
> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 
> 4 rlen 4 { align1 1Q };
> send(8) g10<1>UWg14<8,8,1>F
> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 
> 4 rlen 4 { align1 1Q };
> send(8) g14<1>UWg18<8,8,1>F
> sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 
> 4 rlen 4 { align1 1Q };
> add(8)  g18<1>F g2<8,8,1>F  g6<8,8,1>F  { align1 1Q 
> compacted };
> add(8)  g22<1>F g3<8,8,1>F  g7<8,8,1>F  { align1 1Q 
> compacted };
> add(8)  g26<1>F g4<8,8,1>F  g8<8,8,1>F  { align1 1Q 
> compacted };
> add(8)  g30<1>F g5<8,8,1>F 

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-22 Thread Ian Romanick
On 02/19/2016 02:57 AM, Iago Toral wrote:
> I don't know much about this, but using shader_samples_identical should
> only give a benefit if we actually get identical samples, otherwise it
> means more work, right?

It is possible to do more work.  Assuming the backend compiler does
everything right, that amount of extra work should be trivial.  The
initial texelFetch will get the MCS data and the sample zero data.  The
textureSamplesIdenticalEXT also wants the MCS data.  The backend CSE
pass should eliminate the second fetch of MCS data.  Then the added
work is just comparing the MCS data with 0 and the flow control.

Below is the BDW SIMD8 code for SAMPLES=4.  I see at least 8
instructions that could be removed (all the moves to g124-g127).
Without GL_EXT_shader_samples_identical, none of those extra
instructions are there.  I'll play around with the GLSL to see if I can
get the backend to generate better code.  Otherwise, we'll have to see
about fixing the backend.

SIMD8 shader: 51 instructions. 0 loops. 170 cycles. 0:0 spills:fills. Promoted 
0 constants. Compacted 816 to 544 bytes (33%)
   START B0
pln(8)  g34<1>F g4<0,1,0>F  g2<8,8,1>F  { align1 1Q 
compacted };
pln(8)  g35<1>F g4.4<0,1,0>Fg2<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g6<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g7<1>F  [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g8<1>D  g34<8,8,1>F { align1 1Q 
compacted };
mov(8)  g9<1>D  g35<8,8,1>F { align1 1Q 
compacted };
send(8) g2<1>UW g6<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
mov(8)  g124<1>Fg2<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g125<1>Fg3<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g126<1>Fg4<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g127<1>Fg5<8,8,1>F  { align1 1Q 
compacted };
mov.nz.f0(8)null<1>UD   0xUD{ align1 1Q };
(+f0) if(8) JIP: 32 UIP: 392{ align1 1Q };
   END B0 ->B1 ->B2
   START B1 <-B0
else(8) JIP: 376UIP: 376{ align1 1Q };
   END B1 ->B3
   START B2 <-B0
mov(8)  g10<1>F 1.4013e-45F { align1 1Q };
mov(8)  g11<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g12<1>F g8<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g13<1>F g9<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g14<1>F 2.8026e-45F { align1 1Q };
mov(8)  g15<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g16<1>F g8<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g17<1>F g9<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g18<1>F 4.2039e-45F { align1 1Q };
mov(8)  g19<1>F [0F, 0F, 0F, 0F]VF  { align1 1Q 
compacted };
mov(8)  g20<1>F g8<8,8,1>F  { align1 1Q 
compacted };
mov(8)  g21<1>F g9<8,8,1>F  { align1 1Q 
compacted };
send(8) g6<1>UW g10<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
send(8) g10<1>UWg14<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
send(8) g14<1>UWg18<8,8,1>F
sampler ld2dms SIMD8 Surface = 1 Sampler = 0 mlen 4 
rlen 4 { align1 1Q };
add(8)  g18<1>F g2<8,8,1>F  g6<8,8,1>F  { align1 1Q 
compacted };
add(8)  g22<1>F g3<8,8,1>F  g7<8,8,1>F  { align1 1Q 
compacted };
add(8)  g26<1>F g4<8,8,1>F  g8<8,8,1>F  { align1 1Q 
compacted };
add(8)  g30<1>F g5<8,8,1>F  g9<8,8,1>F  { align1 1Q 
compacted };
add(8)  g19<1>F g10<8,8,1>F g14<8,8,1>F { align1 1Q 
compacted };
add(8)  g23<1>F g11<8,8,1>F g15<8,8,1>F { align1 1Q 
compacted };
add(8)  g27<1>F g12<8,8,1>F g16<8,8,1>F { align1 1Q 
compacted };
add(8)  g31<1>F g13<8,8,1>F g17<8,8,1>F { align1 1Q 
compacted };
add(8)  g20<1>F g18<8,8,1>F g19<8,8,1>F { align1 1Q 
compacted };
add(8)  g24<1>F g22<8,8,1>F g23<8,8,1>F { align1 1Q 
compacted };
add(8)  g28<1>F g26<8,8,1>F g27<8,8,1>F { align1 1Q 
compacted };
add(8)  g32<1>F g30<8,8,1>F

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-19 Thread Roland Scheidegger
Am 19.02.2016 um 11:57 schrieb Iago Toral:
> On Thu, 2016-02-18 at 16:13 -0800, Ian Romanick wrote:
>> On 02/18/2016 01:47 PM, Ian Romanick wrote:
>>> On 02/18/2016 08:44 AM, Neil Roberts wrote:
 I made a pathological test case (attached) which repeatedly renders into
 an MSAA FBO and then blits it to the screen and measures the framerate.
 It checks it with a range of different sample counts. The rendering is
 done either by rendering two triangles to fill the framebuffer or by
 calling glClear.

 The percentage increase in framerate after applying the patch is like
 this:

 With triangles to fill buffer:

 16  62,27%
 8   48,59%
 4   27,72%
 2   5,34%
 0   0,58%
 
 With glClear:   
 
 16  -5,20%
 8   -7,08%
 4   -2,45%
 2   -20,76%
 0   3,71%
>>>
>>> I find this result interesting.  In the samples=0 case, the before and
>>> after shaders should be identical.  I dug into this a bit, and I think
>>> the problem is the previous patch.  Using do { ... } while (false) in
>>> the macro makes the compiler generate some really, really bad code.
>>> Just deleting those two lines makes a huge difference.
>>>
>>> With do { ... } while (false):
>>>
>>> SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills.
>>> Promoted 0 constants. Compacted 3648 to 2192 bytes (40%)
>>>
>>> Without:
>>>
>>> SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills.
>>> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
>>> SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills.
>>> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
>>>
>>> This is for samples=16 and sampler2DMS.  The cycle counts are bogus
>>> because the 4 loops have their cycle counts statically multiplied by
>>> 10... even though the loop will only execute once.
>>>
>>> So... I'll try some more experiments.  I also wonder about changing the
>>> SAMPLES > 1 to SAMPLES > 2.
>>
>> This is weird, and I'm not sure what to make of it.  I ran your test at
>> a bunch of commits on my branch.  Getting rid of gl_FragColor was the
>> big win.  The GL_EXT_shader_samples_identical patch is still the big
>> loss.  I'm assuming that causes enough register pressure over the
>> previous commit to lose SIMD16.  The following patch just removes the
>> "do {" and "} while (false)".  That gets back some of the losses,
>> but not all of it.
> 
> I don't know much about this, but using shader_samples_identical should
> only give a benefit if we actually get identical samples, otherwise it
> means more work, right? I noticed that the test renders a quad with
> random colors for each vertex that will be interpolated across the
> region, so could it be that we are hitting few cases of identical
> samples? Shouldn't the results be expected to be better if we rendered a
> flat color instead?

Unless you run that shader drawing random colors at per-sample frequency
(using per-sample interpolation), you still get identical samples (for
each pixel). Except at the edges of the quad.
(I didn't actually look at the test...)

Roland

> 
>> The raw per-commit data is below.
>>
>> 5c7f974 Android: disable unused-parameter warning
>> With triangles to fill buffer:
>>
>> 8,419.603882
>> 4,633.834045
>> 2,595.628113
>> 0,795.798157
>>
>> With glClear:
>>
>> 8,450.937958
>> 4,594.000610
>> 2,641.354553
>> 0,767.400818
>>
>> 89b8c0a meta/blit: Replace the gl_FragColor write
>> With triangles to fill buffer:
>>
>> 8,597.943054
>> 4,803.083862
>> 2,942.862549
>> 0,1129.432983
>>
>> With glClear:
>>
>> 8,643.500671
>> 4,980.392151
>> 2,982.704407
>> 0,1116.569946
>>
>> 7025d43 meta/blit: Don't dynamically select the MSAA "merge" function
>> With triangles to fill buffer:
>>
>> 8,599.412598
>> 4,954.745056
>> 2,933.184021
>> 0,1129.305420
>>
>> With glClear:
>>
>> 8,582.106079
>> 4,965.810303
>> 2,975.419434
>> 0,1169.180420
>>
>> b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits
>> With triangles to fill buffer:
>>
>> 8,591.786011
>> 4,932.835815
>> 2,910.498047
>> 0,1072.501099
>>
>> With glClear:
>>
>> 8,623.441406
>> 4,952.562378
>> 2,941.442261
>> 0,1081.314819
>>
>> 197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve 
>> blit
>> With triangles to fill buffer:
>>
>> 8,559.252869
>> 4,795.355103
>> 2,781.494202
>> 0,941.619568
>>
>> With glClear:
>>
>> 8,577.400513
>> 4,806.321533
>> 2,811.030029
>> 0,957.579224
>>
>> ec01613 fix loop nonsense derp
>> With triangles to fill buffer:
>>
>> 8,559.221558
>> 4,788.767944
>> 2,786.287170
>> 0,1078.632324
>>
>> With glClear:
>>
>> 8,575.771545
>> 4,956.205750
>> 2,813.206482
>> 0,.111084
>>
>>> I will probably also make a patch to fix the damage done by the do { ...
>>> } while (false).  That's just comedy.
>>>
 It seems like a pretty convincing win for the triangle case but the
 clear case 

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-19 Thread Iago Toral
On Fri, 2016-02-19 at 11:47 +, Neil Roberts wrote:
> Iago Toral  writes:
> 
> > I don't know much about this, but using shader_samples_identical
> > should only give a benefit if we actually get identical samples,
> > otherwise it means more work, right? I noticed that the test renders a
> > quad with random colors for each vertex that will be interpolated
> > across the region, so could it be that we are hitting few cases of
> > identical samples? Shouldn't the results be expected to be better if
> > we rendered a flat color instead?
> 
> It should all be identical regardless of the color chosen because it is
> doing per-pixel shading rather than per-sample which means the fragment
> shader is only run once per pixel and the same value is written to all
> of the samples. The only case where you get differing samples normally
> is at the edges of a polygon where the single color is only written to a
> few of the samples.
> 
> In this case it is even more identical because the random color is set
> on a uniform rather than a vertex so it is actually using the same color
> for the whole framebuffer. However, it is rendered as two triangles
> which might mean that along the diagonal of the framebuffer where the
> two triangles meet there will be a polygon edge which means the hardware
> might not recognise that these all the same color, but that should only
> be a few pixels and probably won't affect the results very much.

You're right, I read through the test code too fast :)

Iago

___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-19 Thread Neil Roberts
Iago Toral  writes:

> I don't know much about this, but using shader_samples_identical
> should only give a benefit if we actually get identical samples,
> otherwise it means more work, right? I noticed that the test renders a
> quad with random colors for each vertex that will be interpolated
> across the region, so could it be that we are hitting few cases of
> identical samples? Shouldn't the results be expected to be better if
> we rendered a flat color instead?

It should all be identical regardless of the color chosen because it is
doing per-pixel shading rather than per-sample which means the fragment
shader is only run once per pixel and the same value is written to all
of the samples. The only case where you get differing samples normally
is at the edges of a polygon where the single color is only written to a
few of the samples.

In this case it is even more identical because the random color is set
on a uniform rather than a vertex so it is actually using the same color
for the whole framebuffer. However, it is rendered as two triangles
which might mean that along the diagonal of the framebuffer where the
two triangles meet there will be a polygon edge which means the hardware
might not recognise that these all the same color, but that should only
be a few pixels and probably won't affect the results very much.

Regards,
- Neil
___
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev


Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-19 Thread Iago Toral
On Thu, 2016-02-18 at 16:13 -0800, Ian Romanick wrote:
> On 02/18/2016 01:47 PM, Ian Romanick wrote:
> > On 02/18/2016 08:44 AM, Neil Roberts wrote:
> >> I made a pathological test case (attached) which repeatedly renders into
> >> an MSAA FBO and then blits it to the screen and measures the framerate.
> >> It checks it with a range of different sample counts. The rendering is
> >> done either by rendering two triangles to fill the framebuffer or by
> >> calling glClear.
> >>
> >> The percentage increase in framerate after applying the patch is like
> >> this:
> >>
> >> With triangles to fill buffer:
> >>
> >> 16  62,27%
> >> 8   48,59%
> >> 4   27,72%
> >> 2   5,34%
> >> 0   0,58%
> >> 
> >> With glClear:   
> >> 
> >> 16  -5,20%
> >> 8   -7,08%
> >> 4   -2,45%
> >> 2   -20,76%
> >> 0   3,71%
> > 
> > I find this result interesting.  In the samples=0 case, the before and
> > after shaders should be identical.  I dug into this a bit, and I think
> > the problem is the previous patch.  Using do { ... } while (false) in
> > the macro makes the compiler generate some really, really bad code.
> > Just deleting those two lines makes a huge difference.
> > 
> > With do { ... } while (false):
> > 
> > SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills.
> > Promoted 0 constants. Compacted 3648 to 2192 bytes (40%)
> > 
> > Without:
> > 
> > SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills.
> > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
> > SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills.
> > Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
> > 
> > This is for samples=16 and sampler2DMS.  The cycle counts are bogus
> > because the 4 loops have their cycle counts statically multiplied by
> > 10... even though the loop will only execute once.
> > 
> > So... I'll try some more experiments.  I also wonder about changing the
> > SAMPLES > 1 to SAMPLES > 2.
> 
> This is weird, and I'm not sure what to make of it.  I ran your test at
> a bunch of commits on my branch.  Getting rid of gl_FragColor was the
> big win.  The GL_EXT_shader_samples_identical patch is still the big
> loss.  I'm assuming that causes enough register pressure over the
> previous commit to lose SIMD16.  The following patch just removes the
> "do {" and "} while (false)".  That gets back some of the losses,
> but not all of it.

I don't know much about this, but using shader_samples_identical should
only give a benefit if we actually get identical samples, otherwise it
means more work, right? I noticed that the test renders a quad with
random colors for each vertex that will be interpolated across the
region, so could it be that we are hitting few cases of identical
samples? Shouldn't the results be expected to be better if we rendered a
flat color instead?

> The raw per-commit data is below.
> 
> 5c7f974 Android: disable unused-parameter warning
> With triangles to fill buffer:
> 
> 8,419.603882
> 4,633.834045
> 2,595.628113
> 0,795.798157
> 
> With glClear:
> 
> 8,450.937958
> 4,594.000610
> 2,641.354553
> 0,767.400818
> 
> 89b8c0a meta/blit: Replace the gl_FragColor write
> With triangles to fill buffer:
> 
> 8,597.943054
> 4,803.083862
> 2,942.862549
> 0,1129.432983
> 
> With glClear:
> 
> 8,643.500671
> 4,980.392151
> 2,982.704407
> 0,1116.569946
> 
> 7025d43 meta/blit: Don't dynamically select the MSAA "merge" function
> With triangles to fill buffer:
> 
> 8,599.412598
> 4,954.745056
> 2,933.184021
> 0,1129.305420
> 
> With glClear:
> 
> 8,582.106079
> 4,965.810303
> 2,975.419434
> 0,1169.180420
> 
> b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits
> With triangles to fill buffer:
> 
> 8,591.786011
> 4,932.835815
> 2,910.498047
> 0,1072.501099
> 
> With glClear:
> 
> 8,623.441406
> 4,952.562378
> 2,941.442261
> 0,1081.314819
> 
> 197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
> With triangles to fill buffer:
> 
> 8,559.252869
> 4,795.355103
> 2,781.494202
> 0,941.619568
> 
> With glClear:
> 
> 8,577.400513
> 4,806.321533
> 2,811.030029
> 0,957.579224
> 
> ec01613 fix loop nonsense derp
> With triangles to fill buffer:
> 
> 8,559.221558
> 4,788.767944
> 2,786.287170
> 0,1078.632324
> 
> With glClear:
> 
> 8,575.771545
> 4,956.205750
> 2,813.206482
> 0,.111084
> 
> > I will probably also make a patch to fix the damage done by the do { ...
> > } while (false).  That's just comedy.
> > 
> >> It seems like a pretty convincing win for the triangle case but the
> >> clear case makes it slightly worse. Presumably this is because we don't
> >> do anything to detect the value stored in the MCS buffer when doing a
> >> fast clear so the fast path isn't taken and the shader being more
> >> complicated makes it slower.
> >>
> >> Not sure if we want to try and do anything about that because
> >> potentially the cleared pixels aren't very common in a 

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-18 Thread Ian Romanick
On 02/18/2016 01:47 PM, Ian Romanick wrote:
> On 02/18/2016 08:44 AM, Neil Roberts wrote:
>> I made a pathological test case (attached) which repeatedly renders into
>> an MSAA FBO and then blits it to the screen and measures the framerate.
>> It checks it with a range of different sample counts. The rendering is
>> done either by rendering two triangles to fill the framebuffer or by
>> calling glClear.
>>
>> The percentage increase in framerate after applying the patch is like
>> this:
>>
>> With triangles to fill buffer:
>>  
>> 16  62,27%
>> 8   48,59%
>> 4   27,72%
>> 2   5,34%
>> 0   0,58%
>> 
>> With glClear:   
>> 
>> 16  -5,20%
>> 8   -7,08%
>> 4   -2,45%
>> 2   -20,76%
>> 0   3,71%
> 
> I find this result interesting.  In the samples=0 case, the before and
> after shaders should be identical.  I dug into this a bit, and I think
> the problem is the previous patch.  Using do { ... } while (false) in
> the macro makes the compiler generate some really, really bad code.
> Just deleting those two lines makes a huge difference.
> 
> With do { ... } while (false):
> 
> SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills.
> Promoted 0 constants. Compacted 3648 to 2192 bytes (40%)
> 
> Without:
> 
> SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills.
> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
> SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills.
> Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
> 
> This is for samples=16 and sampler2DMS.  The cycle counts are bogus
> because the 4 loops have their cycle counts statically multiplied by
> 10... even though the loop will only execute once.
> 
> So... I'll try some more experiments.  I also wonder about changing the
> SAMPLES > 1 to SAMPLES > 2.

This is weird, and I'm not sure what to make of it.  I ran your test at
a bunch of commits on my branch.  Getting rid of gl_FragColor was the
big win.  The GL_EXT_shader_samples_identical patch is still the big
loss.  I'm assuming that causes enough register pressure over the
previous commit to lose SIMD16.  The following patch just removes the
"do {" and "} while (false)".  That gets back some of the losses,
but not all of it.

The raw per-commit data is below.

5c7f974 Android: disable unused-parameter warning
With triangles to fill buffer:

8,419.603882
4,633.834045
2,595.628113
0,795.798157

With glClear:

8,450.937958
4,594.000610
2,641.354553
0,767.400818

89b8c0a meta/blit: Replace the gl_FragColor write
With triangles to fill buffer:

8,597.943054
4,803.083862
2,942.862549
0,1129.432983

With glClear:

8,643.500671
4,980.392151
2,982.704407
0,1116.569946

7025d43 meta/blit: Don't dynamically select the MSAA "merge" function
With triangles to fill buffer:

8,599.412598
4,954.745056
2,933.184021
0,1129.305420

With glClear:

8,582.106079
4,965.810303
2,975.419434
0,1169.180420

b895a4d meta/blit: Use a single, master shader template for MSAA-SS blits
With triangles to fill buffer:

8,591.786011
4,932.835815
2,910.498047
0,1072.501099

With glClear:

8,623.441406
4,952.562378
2,941.442261
0,1081.314819

197e4be meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit
With triangles to fill buffer:

8,559.252869
4,795.355103
2,781.494202
0,941.619568

With glClear:

8,577.400513
4,806.321533
2,811.030029
0,957.579224

ec01613 fix loop nonsense derp
With triangles to fill buffer:

8,559.221558
4,788.767944
2,786.287170
0,1078.632324

With glClear:

8,575.771545
4,956.205750
2,813.206482
0,.111084

> I will probably also make a patch to fix the damage done by the do { ...
> } while (false).  That's just comedy.
> 
>> It seems like a pretty convincing win for the triangle case but the
>> clear case makes it slightly worse. Presumably this is because we don't
>> do anything to detect the value stored in the MCS buffer when doing a
>> fast clear so the fast path isn't taken and the shader being more
>> complicated makes it slower.
>>
>> Not sure if we want to try and do anything about that because
>> potentially the cleared pixels aren't very common in a framebuffer from
>> a real use case so it might not really matter.
>>
>> Currently we don't use SIMD16 for 16x MSAA because we can't allocate the
>> registers well enough to make it worthwhile. This patch makes that
>> problem a bit more interesting because even if we end up spilling a lot
>> it might still be worth doing SIMD16 because the cases where the spilled
>> instructions are hit would be much less common.
>>
>> - Neil
>>
>> Ian Romanick  writes:
>>
>>> From: Ian Romanick 
>>>
>>> Somewhat surprisingly, this didn't have any affect on performance in the
>>> benchmarks that Martin tried for me.
>>>
>>> Signed-off-by: Ian Romanick 
>>> ---
>>>  src/mesa/drivers/common/meta_blit.c | 10 +-
>>>  1 file changed, 9 

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-18 Thread Ian Romanick
On 02/18/2016 08:44 AM, Neil Roberts wrote:
> I made a pathological test case (attached) which repeatedly renders into
> an MSAA FBO and then blits it to the screen and measures the framerate.
> It checks it with a range of different sample counts. The rendering is
> done either by rendering two triangles to fill the framebuffer or by
> calling glClear.
> 
> The percentage increase in framerate after applying the patch is like
> this:
> 
> With triangles to fill buffer:
>   
> 16  62,27%
> 8   48,59%
> 4   27,72%
> 2   5,34%
> 0   0,58%
> 
> With glClear:   
> 
> 16  -5,20%
> 8   -7,08%
> 4   -2,45%
> 2   -20,76%
> 0   3,71%

I find this result interesting.  In the samples=0 case, the before and
after shaders should be identical.  I dug into this a bit, and I think
the problem is the previous patch.  Using do { ... } while (false) in
the macro makes the compiler generate some really, really bad code.
Just deleting those two lines makes a huge difference.

With do { ... } while (false):

SIMD8 shader: 228 instructions. 4 loops. 1554 cycles. 0:0 spills:fills.
Promoted 0 constants. Compacted 3648 to 2192 bytes (40%)

Without:

SIMD8 shader: 159 instructions. 0 loops. 380 cycles. 0:0 spills:fills.
Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)
SIMD16 shader: 159 instructions. 0 loops. 898 cycles. 0:0 spills:fills.
Promoted 0 constants. Compacted 2544 to 1600 bytes (37%)

This is for samples=16 and sampler2DMS.  The cycle counts are bogus
because the 4 loops have their cycle counts statically multiplied by
10... even though the loop will only execute once.

So... I'll try some more experiments.  I also wonder about changing the
SAMPLES > 1 to SAMPLES > 2.

I will probably also make a patch to fix the damage done by the do { ...
} while (false).  That's just comedy.

> It seems like a pretty convincing win for the triangle case but the
> clear case makes it slightly worse. Presumably this is because we don't
> do anything to detect the value stored in the MCS buffer when doing a
> fast clear so the fast path isn't taken and the shader being more
> complicated makes it slower.
> 
> Not sure if we want to try and do anything about that because
> potentially the cleared pixels aren't very common in a framebuffer from
> a real use case so it might not really matter.
> 
> Currently we don't use SIMD16 for 16x MSAA because we can't allocate the
> registers well enough to make it worthwhile. This patch makes that
> problem a bit more interesting because even if we end up spilling a lot
> it might still be worth doing SIMD16 because the cases where the spilled
> instructions are hit would be much less common.
> 
> - Neil
> 
> 
> 
> 
> Ian Romanick  writes:
> 
>> From: Ian Romanick 
>>
>> Somewhat surprisingly, this didn't have any affect on performance in the
>> benchmarks that Martin tried for me.
>>
>> Signed-off-by: Ian Romanick 
>> ---
>>  src/mesa/drivers/common/meta_blit.c | 10 +-
>>  1 file changed, 9 insertions(+), 1 deletion(-)
>>
>> diff --git a/src/mesa/drivers/common/meta_blit.c 
>> b/src/mesa/drivers/common/meta_blit.c
>> index 28aabd3..c0ec51f 100644
>> --- a/src/mesa/drivers/common/meta_blit.c
>> +++ b/src/mesa/drivers/common/meta_blit.c
>> @@ -530,6 +530,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
>>   fs_source = ralloc_asprintf(mem_ctx,
>>   "#version 130\n"
>>   "#extension 
>> GL_ARB_texture_multisample: require\n"
>> + "#extension 
>> GL_EXT_shader_samples_identical: enable\n"
>>   "#define gvec4 %svec4\n"
>>   "uniform %ssampler2DMS%s texSampler;\n"
>>   "in %s texCoords;\n"
>> @@ -569,7 +570,14 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
>>   "   i%s tc = i%s(texCoords);\n"
>>   "   int i;\n"
>>   "\n"
>> - "   for (i = 0; i < SAMPLES; i++)\n"
>> + "   S[0] = texelFetch(texSampler, tc, 
>> 0);\n"
>> + "#if 
>> defined(GL_EXT_shader_samples_identical) && SAMPLES > 1\n"
>> + "   if 
>> (textureSamplesIdenticalEXT(texSampler, tc)) {\n"
>> + "  emit2(S[0]);\n"
>> + "  return;\n"
>> + "   }\n"
>> + "#endif\n"
>> + "   for (i = 1; i < SAMPLES; i++)\n"
>>   "  S[i] = texelFetch(texSampler, 
>> tc, i);\n"
>>   

Re: [Mesa-dev] [PATCH 22/22] meta/blit: Use GL_EXT_shader_samples_identical in MSAA-SS resolve blit

2016-02-18 Thread Neil Roberts
I made a pathological test case (attached) which repeatedly renders into
an MSAA FBO and then blits it to the screen and measures the framerate.
It checks it with a range of different sample counts. The rendering is
done either by rendering two triangles to fill the framebuffer or by
calling glClear.

The percentage increase in framerate after applying the patch is like
this:

With triangles to fill buffer:

16  62,27%
8   48,59%
4   27,72%
2   5,34%
0   0,58%

With glClear:   

16  -5,20%
8   -7,08%
4   -2,45%
2   -20,76%
0   3,71%

It seems like a pretty convincing win for the triangle case but the
clear case makes it slightly worse. Presumably this is because we don't
do anything to detect the value stored in the MCS buffer when doing a
fast clear so the fast path isn't taken and the shader being more
complicated makes it slower.

Not sure if we want to try and do anything about that because
potentially the cleared pixels aren't very common in a framebuffer from
a real use case so it might not really matter.

Currently we don't use SIMD16 for 16x MSAA because we can't allocate the
registers well enough to make it worthwhile. This patch makes that
problem a bit more interesting because even if we end up spilling a lot
it might still be worth doing SIMD16 because the cases where the spilled
instructions are hit would be much less common.

- Neil

#include 
#include 
#include 
#include 
#include 

#define N_FRAMES_TO_SKIP 1000
#define N_FRAMES_TO_DRAW 1

enum draw_mode {
DRAW_MODE_TRIANGLES,
DRAW_MODE_CLEAR
};

struct data {
SDL_Window *window;
SDL_GLContext gl_context;
};

static SDL_GLContext
create_gl_context(SDL_Window *window)
{
/* First try creating a core context because if we get one it
 * can be more efficient.
 */
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MAJOR_VERSION, 3);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_MINOR_VERSION, 1);
SDL_GL_SetAttribute(SDL_GL_CONTEXT_PROFILE_MASK,
SDL_GL_CONTEXT_PROFILE_CORE);

return SDL_GL_CreateContext(window);
}

static const char
vertex_shader_source[] =
"#version 150\n"
"in vec2 piglit_vertex;\n"
"\n"
"void\n"
"main()\n"
"{\n"
"gl_Position = vec4(piglit_vertex, 0.0, 1.0);\n"
"}\n";

static const char
fragment_shader_source[] =
"#version 150\n"
"uniform vec3 color;\n"
"\n"
"void\n"
"main()\n"
"{\n"
"gl_FragColor = vec4(color, 1.0);\n"
"}\n";

static GLuint
create_shader(const char *name,
  GLenum type,
  const char *source,
  int source_length)
{
GLuint shader;
GLint length, compile_status;
GLsizei actual_length;
GLchar *info_log;

shader = glCreateShader(type);

glShaderSource(shader,
   1, /* n_strings */
   ,
   _length);

glCompileShader(shader);

glGetShaderiv(shader, GL_INFO_LOG_LENGTH, );

if (length > 0) {
info_log = malloc(length);
glGetShaderInfoLog(shader, length,
   _length,
   info_log);
if (*info_log) {
fprintf(stderr,
"Info log for %s:\n%s\n",
name,
info_log);
}
free(info_log);
}

glGetShaderiv(shader, GL_COMPILE_STATUS, _status);

if (!compile_status) {
fprintf(stderr, "%s compilation failed", name);
glDeleteShader(shader);
return 0;
}

return shader;
}

static bool
link_program(GLuint program)
{
GLint length, link_status;
GLsizei actual_length;
GLchar *info_log;

glLinkProgram(program);

glGetProgramiv(program, GL_INFO_LOG_LENGTH, );

if (length > 0) {
info_log = malloc(length);
glGetProgramInfoLog(program, length,
_length,
info_log);
if (*info_log)
fprintf(stderr, "Link info log:\n%s\n", info_log);
free(info_log);
}

glGetProgramiv(program, GL_LINK_STATUS, _status);

if (!link_status) {
fprintf(stderr, "program link failed");
return false;
}

return true;
}

static bool
run_test(struct data *data,
 int n_samples,
 enum draw_mode draw_mode,
 float *result)
{
float verts[] = {
-1.0f, -1.0f,
1.0f, -1.0f,
-1.0f, 1.0f,
1.0f, 1.0f
};
GLuint