RE: A case that PRE optimization hurts performance

Jiangning Liu Thu, 15 Sep 2011 19:01:22 -0700

Hi Richard,

I slightly changed the case to be like below,


int f(char *t) {
    int s=0;

    while (*t && s != 1) {
        switch (s) {
        case 0:   /* path 1 */
            s = 2;
            break;
        case 2:   /* path 2 */
            s = 3; /* changed */
            break;
        default:  /* path 3 */
            if (*t == '-') 
                s = 2;
            break;
        }
        t++;
    }

    return s;
}

"-O2" is still worse than "-O2 -fno-tree-pre". 

"-O2 -fno-tree-pre" result is 

f:
        pushl   %ebp
        xorl    %eax, %eax
        movl    %esp, %ebp
        movl    8(%ebp), %edx
        movzbl  (%edx), %ecx
        jmp     .L14
        .p2align 4,,7
        .p2align 3
.L5:
        movl    $2, %eax
.L7:
        addl    $1, %edx
        cmpl    $1, %eax
        movzbl  (%edx), %ecx
        je      .L3
.L14:
        testb   %cl, %cl
        je      .L3
        testl   %eax, %eax
        je      .L5
        cmpl    $2, %eax
        .p2align 4,,5
        je      .L17
        cmpb    $45, %cl
        .p2align 4,,5
        je      .L5
        addl    $1, %edx
        cmpl    $1, %eax
        movzbl  (%edx), %ecx
        jne     .L14
        .p2align 4,,7
        .p2align 3
.L3:
        popl    %ebp
        .p2align 4,,2
        ret
        .p2align 4,,7
        .p2align 3
.L17:
        movb    $3, %al
        .p2align 4,,3
        jmp     .L7

While "-O2" result is 

f:
        pushl   %ebp
        xorl    %eax, %eax
        movl    %esp, %ebp
        movl    8(%ebp), %edx
        pushl   %ebx
        movzbl  (%edx), %ecx
        jmp     .L14
        .p2align 4,,7
        .p2align 3
.L5:
        movl    $1, %ebx
        movl    $2, %eax
.L7:
        addl    $1, %edx
        testb   %bl, %bl
        movzbl  (%edx), %ecx
        je      .L3
.L14:
        testb   %cl, %cl
        je      .L3
        testl   %eax, %eax
        je      .L5
        cmpl    $2, %eax
        .p2align 4,,5
        je      .L16
        cmpb    $45, %cl
        .p2align 4,,5
        je      .L5
        cmpl    $1, %eax
        setne   %bl
        addl    $1, %edx
        testb   %bl, %bl
        movzbl  (%edx), %ecx
        jne     .L14
        .p2align 4,,7
        .p2align 3
.L3:
        popl    %ebx
        popl    %ebp
        ret
        .p2align 4,,7
        .p2align 3
.L16:
        movl    $1, %ebx
        movb    $3, %al
        jmp     .L7

You may notice that register ebx is introduced, and some more instructions
around ebx are generated as well. i.e.

        setne   %bl
        testb   %bl, %bl

I agree with you that in theory PRE does the right thing to minimize the
computation cost on gimple level. However, the problem is the cost of
converting comparison result to a bool value is not considered, so it
actually makes binary code worse. For this case, as I summarized below, to
complete the same functionality "With PRE" is worse than "Without PRE" for
all three paths,

* Without PRE,

Path1:
        movl    $2, %eax
        cmpl    $1, %eax
        je      .L3

Path2:
        movb    $3, %al
        cmpl    $1, %eax
        je      .L3

Path3:
        cmpl    $1, %eax
        jne     .L14

* With PRE,

Path1:
        movl    $1, %ebx
        movl    $2, %eax
        testb   %bl, %bl
        je      .L3

Path2:
        movl    $1, %ebx
        movb    $3, %al
        testb   %bl, %bl
        je      .L3

Path3:
        cmpl    $1, %eax
        setne   %bl
        testb   %bl, %bl
        jne     .L14

Do you have any more thoughts?

Thanks,
-Jiangning

> -----Original Message-----
> From: Richard Guenther [mailto:[email protected]]
> Sent: Tuesday, August 02, 2011 5:23 PM
> To: Jiangning Liu
> Cc: [email protected]
> Subject: Re: A case that PRE optimization hurts performance
> 
> On Tue, Aug 2, 2011 at 4:37 AM, Jiangning Liu <[email protected]>
> wrote:
> > Hi,
> >
> > For the following simple test case, PRE optimization hoists
> computation
> > (s!=1) into the default branch of the switch statement, and finally
> causes
> > very poor code generation. This problem occurs in both X86 and ARM,
> and I
> > believe it is also a problem for other targets.
> >
> > int f(char *t) {
> >    int s=0;
> >
> >    while (*t && s != 1) {
> >        switch (s) {
> >        case 0:
> >            s = 2;
> >            break;
> >        case 2:
> >            s = 1;
> >            break;
> >        default:
> >            if (*t == '-')
> >                s = 1;
> >            break;
> >        }
> >        t++;
> >    }
> >
> >    return s;
> > }
> >
> > Taking X86 as an example, with option "-O2" you may find 52
> instructions
> > generated like below,
> >
> > 00000000 <f>:
> >   0:   55                      push   %ebp
> >   1:   31 c0                   xor    %eax,%eax
> >   3:   89 e5                   mov    %esp,%ebp
> >   5:   57                      push   %edi
> >   6:   56                      push   %esi
> >   7:   53                      push   %ebx
> >   8:   8b 55 08                mov    0x8(%ebp),%edx
> >   b:   0f b6 0a                movzbl (%edx),%ecx
> >   e:   84 c9                   test   %cl,%cl
> >  10:   74 50                   je     62 <f+0x62>
> >  12:   83 c2 01                add    $0x1,%edx
> >  15:   85 c0                   test   %eax,%eax
> >  17:   75 23                   jne    3c <f+0x3c>
> >  19:   8d b4 26 00 00 00 00    lea    0x0(%esi,%eiz,1),%esi
> >  20:   0f b6 0a                movzbl (%edx),%ecx
> >  23:   84 c9                   test   %cl,%cl
> >  25:   0f 95 c0                setne  %al
> >  28:   89 c7                   mov    %eax,%edi
> >  2a:   b8 02 00 00 00          mov    $0x2,%eax
> >  2f:   89 fb                   mov    %edi,%ebx
> >  31:   83 c2 01                add    $0x1,%edx
> >  34:   84 db                   test   %bl,%bl
> >  36:   74 2a                   je     62 <f+0x62>
> >  38:   85 c0                   test   %eax,%eax
> >  3a:   74 e4                   je     20 <f+0x20>
> >  3c:   83 f8 02                cmp    $0x2,%eax
> >  3f:   74 1f                   je     60 <f+0x60>
> >  41:   80 f9 2d                cmp    $0x2d,%cl
> >  44:   74 22                   je     68 <f+0x68>
> >  46:   0f b6 0a                movzbl (%edx),%ecx
> >  49:   83 f8 01                cmp    $0x1,%eax
> >  4c:   0f 95 c3                setne  %bl
> >  4f:   89 df                   mov    %ebx,%edi
> >  51:   84 c9                   test   %cl,%cl
> >  53:   0f 95 c3                setne  %bl
> >  56:   89 de                   mov    %ebx,%esi
> >  58:   21 f7                   and    %esi,%edi
> >  5a:   eb d3                   jmp    2f <f+0x2f>
> >  5c:   8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
> >  60:   b0 01                   mov    $0x1,%al
> >  62:   5b                      pop    %ebx
> >  63:   5e                      pop    %esi
> >  64:   5f                      pop    %edi
> >  65:   5d                      pop    %ebp
> >  66:   c3                      ret
> >  67:   90                      nop
> >  68:   b8 01 00 00 00          mov    $0x1,%eax
> >  6d:   5b                      pop    %ebx
> >  6e:   5e                      pop    %esi
> >  6f:   5f                      pop    %edi
> >  70:   5d                      pop    %ebp
> >  71:   c3                      ret
> >
> > But with command line option "-O2 -fno-tree-pre", there are only 12
> > instructions generated, and the code would be very clean like below,
> >
> > 00000000 <f>:
> >   0:   55                      push   %ebp
> >   1:   31 c0                   xor    %eax,%eax
> >   3:   89 e5                   mov    %esp,%ebp
> >   5:   8b 55 08                mov    0x8(%ebp),%edx
> >   8:   80 3a 00                cmpb   $0x0,(%edx)
> >   b:   74 0e                   je     1b <f+0x1b>
> >   d:   80 7a 01 00             cmpb   $0x0,0x1(%edx)
> >  11:   b0 02                   mov    $0x2,%al
> >  13:   ba 01 00 00 00          mov    $0x1,%edx
> >  18:   0f 45 c2                cmovne %edx,%eax
> >  1b:   5d                      pop    %ebp
> >  1c:   c3                      ret
> >
> > Do you have any idea about this?
> 
> If you look at what PRE does it is clear that it's a profitable
> transformation
> as it reduces the number of instructions computed on the paths where
> s is set to a constant.  If you ask for size optimization you won't get
> this transformation.
> 
> Now - on the tree level we don't see the if-conversion opportunity
> (we don't have a conditional move instruction), but still we could
> improve phiopt to handle switch statements - not sure what we
> could do with the case in question though which is
> 
> <bb 3>:
>   switch (s_3) <default: <L3>, case 0: <L11>, case 2: <L10>>
> 
> <L11>:
>   goto <bb 7> (<L10>);
> 
> <L3>:
>   if (D.2703_6 == 45)
>     goto <bb 6>;
>   else
>     goto <bb 7> (<L10>);
> 
> <bb 6>:
> 
>   # s_2 = PHI <2(4), 1(3), 1(6), s_3(5)>
> <L10>:
> 
> as the condition in at L3 complicates things.
> 
> The code is also really really special (and written in an awful
> way ...).
> 
> Richard.
> 
> > Thanks,
> > -Jiangning
> >
> >
> >
> >

RE: A case that PRE optimization hurts performance

Reply via email to