https://gcc.gnu.org/bugzilla/show_bug.cgi?id=50481

--- Comment #29 from Jakub Jelinek <jakub at gcc dot gnu.org> ---
(In reply to Alexander Kleinsorge from comment #13)
> for single bytes (uint8), there could be a faster way (x86 + x64).
> there are only logical ops and shifts, nothing else.
> 
> static inline uint8 byte_rev(uint8 v) {
>     const uint64 BREV64 = ~0x084c2a6e195d3b7fLLu; // verify this number (LUT-like)
>     uint8 a = (BREV64) >> ((v % 16u) * 4u); // from low
>     uint8 b = (BREV64) >> ((v / 16u) * 4u); // from high
>     return (a * 16u) | (b % 16u);
> }

Why do you think this is faster?
It is certainly larger (on both x86_64 and ia32), and on x86_64 it has the same
number of instructions:
   0:   48 ba 80 c4 a2 e6 91    movabs $0xf7b3d591e6a2c480,%rdx
   7:   d5 b3 f7 
   a:   89 f9                   mov    %edi,%ecx
   c:   40 c0 ef 04             shr    $0x4,%dil
  10:   83 e1 0f                and    $0xf,%ecx
  13:   48 89 d0                mov    %rdx,%rax
  16:   c1 e1 02                shl    $0x2,%ecx
  19:   48 d3 e8                shr    %cl,%rax
  1c:   40 0f b6 cf             movzbl %dil,%ecx
  20:   c1 e1 02                shl    $0x2,%ecx
  23:   c1 e0 04                shl    $0x4,%eax
  26:   48 d3 ea                shr    %cl,%rdx
  29:   83 e2 0f                and    $0xf,%edx
  2c:   09 d0                   or     %edx,%eax
  2e:   c3                      ret
vs.
   0:   40 c0 c7 04             rol    $0x4,%dil
   4:   89 fa                   mov    %edi,%edx
   6:   83 e7 33                and    $0x33,%edi
   9:   c0 ea 02                shr    $0x2,%dl
   c:   c1 e7 02                shl    $0x2,%edi
   f:   83 e2 33                and    $0x33,%edx
  12:   09 fa                   or     %edi,%edx
  14:   89 d0                   mov    %edx,%eax
  16:   83 e2 55                and    $0x55,%edx
  19:   d0 e8                   shr    $1,%al
  1b:   01 d2                   add    %edx,%edx
  1d:   83 e0 55                and    $0x55,%eax
  20:   09 d0                   or     %edx,%eax
  22:   c3                      ret
Both are 14 insns, but the version from comment #13 is larger (47 vs. 35 bytes).
For ia32, it is 105 vs. 39 bytes and 34 vs. 15 insns.

Reply via email to