A case that PRE optimization hurts performance

Jiangning Liu Mon, 01 Aug 2011 19:37:34 -0700

Hi,

For the following simple test case, PRE optimization hoists computation
(s!=1) into the default branch of the switch statement, and finally causes
very poor code generation. This problem occurs in both X86 and ARM, and I
believe it is also a problem for other targets.


int f(char *t) {
    int s=0;

    while (*t && s != 1) {
        switch (s) {
        case 0:
            s = 2;
            break;
        case 2:
            s = 1;
            break;
        default:
            if (*t == '-') 
                s = 1;
            break;
        }
        t++;
    }

    return s;
}

Taking X86 as an example, with option "-O2" you may find 52 instructions
generated like below,

00000000 <f>:
   0:   55                      push   %ebp
   1:   31 c0                   xor    %eax,%eax
   3:   89 e5                   mov    %esp,%ebp
   5:   57                      push   %edi
   6:   56                      push   %esi
   7:   53                      push   %ebx
   8:   8b 55 08                mov    0x8(%ebp),%edx
   b:   0f b6 0a                movzbl (%edx),%ecx
   e:   84 c9                   test   %cl,%cl
  10:   74 50                   je     62 <f+0x62>
  12:   83 c2 01                add    $0x1,%edx
  15:   85 c0                   test   %eax,%eax
  17:   75 23                   jne    3c <f+0x3c>
  19:   8d b4 26 00 00 00 00    lea    0x0(%esi,%eiz,1),%esi
  20:   0f b6 0a                movzbl (%edx),%ecx
  23:   84 c9                   test   %cl,%cl
  25:   0f 95 c0                setne  %al
  28:   89 c7                   mov    %eax,%edi
  2a:   b8 02 00 00 00          mov    $0x2,%eax
  2f:   89 fb                   mov    %edi,%ebx
  31:   83 c2 01                add    $0x1,%edx
  34:   84 db                   test   %bl,%bl
  36:   74 2a                   je     62 <f+0x62>
  38:   85 c0                   test   %eax,%eax
  3a:   74 e4                   je     20 <f+0x20>
  3c:   83 f8 02                cmp    $0x2,%eax
  3f:   74 1f                   je     60 <f+0x60>
  41:   80 f9 2d                cmp    $0x2d,%cl
  44:   74 22                   je     68 <f+0x68>
  46:   0f b6 0a                movzbl (%edx),%ecx
  49:   83 f8 01                cmp    $0x1,%eax
  4c:   0f 95 c3                setne  %bl
  4f:   89 df                   mov    %ebx,%edi
  51:   84 c9                   test   %cl,%cl
  53:   0f 95 c3                setne  %bl
  56:   89 de                   mov    %ebx,%esi
  58:   21 f7                   and    %esi,%edi
  5a:   eb d3                   jmp    2f <f+0x2f>
  5c:   8d 74 26 00             lea    0x0(%esi,%eiz,1),%esi
  60:   b0 01                   mov    $0x1,%al
  62:   5b                      pop    %ebx
  63:   5e                      pop    %esi
  64:   5f                      pop    %edi
  65:   5d                      pop    %ebp
  66:   c3                      ret    
  67:   90                      nop
  68:   b8 01 00 00 00          mov    $0x1,%eax
  6d:   5b                      pop    %ebx
  6e:   5e                      pop    %esi
  6f:   5f                      pop    %edi
  70:   5d                      pop    %ebp
  71:   c3                      ret    

But with command line option "-O2 -fno-tree-pre", there are only 12
instructions generated, and the code would be very clean like below,

00000000 <f>:
   0:   55                      push   %ebp
   1:   31 c0                   xor    %eax,%eax
   3:   89 e5                   mov    %esp,%ebp
   5:   8b 55 08                mov    0x8(%ebp),%edx
   8:   80 3a 00                cmpb   $0x0,(%edx)
   b:   74 0e                   je     1b <f+0x1b>
   d:   80 7a 01 00             cmpb   $0x0,0x1(%edx)
  11:   b0 02                   mov    $0x2,%al
  13:   ba 01 00 00 00          mov    $0x1,%edx
  18:   0f 45 c2                cmovne %edx,%eax
  1b:   5d                      pop    %ebp
  1c:   c3                      ret    

Do you have any idea about this?

Thanks,
-Jiangning

A case that PRE optimization hurts performance

Reply via email to