On Fri, Apr 16, 2021 at 10:11 PM Eric Dumazet <eduma...@google.com> wrote:
>
> On Fri, Apr 16, 2021 at 9:44 PM Al Viro <v...@zeniv.linux.org.uk> wrote:
> >
> > On Fri, Apr 16, 2021 at 12:24:13PM -0700, Eric Dumazet wrote:
> > > From: Eric Dumazet <eduma...@google.com>
> > >
> > > We have to loop only to copy u64 values.
> > > After this first loop, we copy at most one u32, one u16 and one byte.
> >
> > Does it actually yield a better code?
> >
>
> Yes, my patch gives better code on an actual kernel use case
>
> (net-next tree, look at put_cmsg())
>
> 5ca: 48 89 0f              mov    %rcx,(%rdi)
>  5cd: 89 77 08              mov    %esi,0x8(%rdi)
>  5d0: 89 57 0c              mov    %edx,0xc(%rdi)
>  5d3: 48 83 c7 10          add    $0x10,%rdi
>  5d7: 48 83 c1 f0          add    $0xfffffffffffffff0,%rcx
>  5db: 48 83 f9 07          cmp    $0x7,%rcx
>  5df: 76 40                jbe    621 <put_cmsg+0x111>
>  5e1: 66 66 66 66 66 66 2e data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
>  5e8: 0f 1f 84 00 00 00 00
>  5ef: 00
>  5f0: 49 8b 10              mov    (%r8),%rdx
>  5f3: 48 89 17              mov    %rdx,(%rdi)
>  5f6: 48 83 c7 08          add    $0x8,%rdi
>  5fa: 49 83 c0 08          add    $0x8,%r8
>  5fe: 48 83 c1 f8          add    $0xfffffffffffffff8,%rcx
>  602: 48 83 f9 07          cmp    $0x7,%rcx
>  606: 77 e8                ja     5f0 <put_cmsg+0xe0>
>  608: eb 17                jmp    621 <put_cmsg+0x111>
>  60a: 66 0f 1f 44 00 00    nopw   0x0(%rax,%rax,1)
>  610: 41 8b 10              mov    (%r8),%edx
>  613: 89 17                mov    %edx,(%rdi)
>  615: 48 83 c7 04          add    $0x4,%rdi
>  619: 49 83 c0 04          add    $0x4,%r8
>  61d: 48 83 c1 fc          add    $0xfffffffffffffffc,%rcx
>  621: 48 83 f9 03          cmp    $0x3,%rcx
>  625: 77 e9                ja     610 <put_cmsg+0x100>
>  627: eb 1a                jmp    643 <put_cmsg+0x133>
>  629: 0f 1f 80 00 00 00 00 nopl   0x0(%rax)
>  630: 41 0f b7 10          movzwl (%r8),%edx
>  634: 66 89 17              mov    %dx,(%rdi)
>  637: 48 83 c7 02          add    $0x2,%rdi
>  63b: 49 83 c0 02          add    $0x2,%r8
>  63f: 48 83 c1 fe          add    $0xfffffffffffffffe,%rcx
>  643: 48 83 f9 01          cmp    $0x1,%rcx
>  647: 77 e7                ja     630 <put_cmsg+0x120>
>  649: eb 15                jmp    660 <put_cmsg+0x150>
>  64b: 0f 1f 44 00 00        nopl   0x0(%rax,%rax,1)
>  650: 41 0f b6 08          movzbl (%r8),%ecx
>  654: 88 0f                mov    %cl,(%rdi)
>  656: 48 83 c7 01          add    $0x1,%rdi
>  65a: 49 83 c0 01          add    $0x1,%r8
>  65e: 31 c9                xor    %ecx,%ecx
>  660: 48 85 c9              test   %rcx,%rcx
>  663: 75 eb                jne    650 <put_cmsg+0x140>

After the change, the code is now what we would expect (no jmp around):
 5db: 48 83 f9 08          cmp    $0x8,%rcx
 5df: 72 27                jb     608 <put_cmsg+0xf8>
 5e1: 66 66 66 66 66 66 2e data16 data16 data16 data16 data16 nopw %cs:0x0(%rax,%rax,1)
 5e8: 0f 1f 84 00 00 00 00
 5ef: 00
 5f0: 49 8b 10              mov    (%r8),%rdx
 5f3: 48 89 17              mov    %rdx,(%rdi)
 5f6: 48 83 c7 08          add    $0x8,%rdi
 5fa: 49 83 c0 08          add    $0x8,%r8
 5fe: 48 83 c1 f8          add    $0xfffffffffffffff8,%rcx
 602: 48 83 f9 08          cmp    $0x8,%rcx
 606: 73 e8                jae    5f0 <put_cmsg+0xe0>
 608: 48 83 f9 04          cmp    $0x4,%rcx
 60c: 72 11                jb     61f <put_cmsg+0x10f>
 60e: 41 8b 10              mov    (%r8),%edx
 611: 89 17                mov    %edx,(%rdi)
 613: 48 83 c7 04          add    $0x4,%rdi
 617: 49 83 c0 04          add    $0x4,%r8
 61b: 48 83 c1 fc          add    $0xfffffffffffffffc,%rcx
 61f: 48 83 f9 02          cmp    $0x2,%rcx
 623: 72 13                jb     638 <put_cmsg+0x128>
 625: 41 0f b7 10          movzwl (%r8),%edx
 629: 66 89 17              mov    %dx,(%rdi)
 62c: 48 83 c7 02          add    $0x2,%rdi
 630: 49 83 c0 02          add    $0x2,%r8
 634: 48 83 c1 fe          add    $0xfffffffffffffffe,%rcx
 638: 48 85 c9              test   %rcx,%rcx
 63b: 74 05                je     642 <put_cmsg+0x132>
 63d: 41 8a 08              mov    (%r8),%cl
 640: 88 0f                mov    %cl,(%rdi)

As I said, it's minor; I am sure you can come up with something much better!

Thanks !

>

>
> > FWIW, this
> > void bar(unsigned);
> > void foo(unsigned n)
> > {
> >         while (n >= 8) {
> >                 bar(n);
> >                 n -= 8;
> >         }
> >         while (n >= 4) {
> >                 bar(n);
> >                 n -= 4;
> >         }
> >         while (n >= 2) {
> >                 bar(n);
> >                 n -= 2;
> >         }
> >         while (n >= 1) {
> >                 bar(n);
> >                 n -= 1;
> >         }
> > }
> >
> > will compile (with -O2) to
> >         pushq   %rbp
> >         pushq   %rbx
> >         movl    %edi, %ebx
> >         subq    $8, %rsp
> >         cmpl    $7, %edi
> >         jbe     .L2
> >         movl    %edi, %ebp
> > .L3:
> >         movl    %ebp, %edi
> >         subl    $8, %ebp
> >         call    bar@PLT
> >         cmpl    $7, %ebp
> >         ja      .L3
> >         andl    $7, %ebx
> > .L2:
> >         cmpl    $3, %ebx
> >         jbe     .L4
> >         movl    %ebx, %edi
> >         andl    $3, %ebx
> >         call    bar@PLT
> > .L4:
> >         cmpl    $1, %ebx
> >         jbe     .L5
> >         movl    %ebx, %edi
> >         andl    $1, %ebx
> >         call    bar@PLT
> > .L5:
> >         testl   %ebx, %ebx
> >         je      .L1
> >         addq    $8, %rsp
> >         movl    $1, %edi
> >         popq    %rbx
> >         popq    %rbp
> >         jmp     bar@PLT
> > .L1:
> >         addq    $8, %rsp
> >         popq    %rbx
> >         popq    %rbp
> >         ret
> >
> > i.e. loop + if + if + if...

Reply via email to