On Fri, Apr 16, 2021 at 12:24:13PM -0700, Eric Dumazet wrote:
> From: Eric Dumazet <eduma...@google.com>
> 
> We have to loop only to copy u64 values.
> After this first loop, we copy at most one u32, one u16 and one byte.

Does it actually yield a better code?

FWIW, this
void bar(unsigned);
void foo(unsigned n)
{
        while (n >= 8) {
                bar(n);
                n -= 8;
        }
        while (n >= 4) {
                bar(n);
                n -= 4;
        }
        while (n >= 2) {
                bar(n);
                n -= 2;
        }
        while (n >= 1) {
                bar(n);
                n -= 1;
        }
}

will compile (with -O2) to
        pushq   %rbp
        pushq   %rbx
        movl    %edi, %ebx
        subq    $8, %rsp
        cmpl    $7, %edi
        jbe     .L2
        movl    %edi, %ebp
.L3:
        movl    %ebp, %edi
        subl    $8, %ebp
        call    bar@PLT
        cmpl    $7, %ebp
        ja      .L3
        andl    $7, %ebx
.L2:
        cmpl    $3, %ebx
        jbe     .L4
        movl    %ebx, %edi
        andl    $3, %ebx
        call    bar@PLT
.L4:
        cmpl    $1, %ebx
        jbe     .L5
        movl    %ebx, %edi
        andl    $1, %ebx
        call    bar@PLT
.L5:
        testl   %ebx, %ebx
        je      .L1
        addq    $8, %rsp
        movl    $1, %edi
        popq    %rbx
        popq    %rbp
        jmp     bar@PLT
.L1:
        addq    $8, %rsp
        popq    %rbx
        popq    %rbp
        ret

i.e. loop + if + if + if...

Reply via email to