On Thu, 2014-03-20 at 18:56 -0700, Andi Kleen wrote: > Eric Dumazet <eric.duma...@gmail.com> writes: > > > > I saw csum_partial() consuming 1% of cpu cycles in a GRO workload, that > > is insane... > > > Couldn't it just be the cache miss?
Or the fact that we mix 16 bit stores and 32bit loads ? iph->tot_len = newlen; iph->check = 0; iph->check = ip_fast_csum(iph, 5); -> following perf annotation : : ffffffff81538e70 <inet_gro_complete>: 0.49 : ffffffff81538e70: callq ffffffff81578c80 <__entry_text_start> 0.49 : ffffffff81538e75: push %rbp 1.46 : ffffffff81538e76: movzwl 0x60(%rdi),%edx 0.00 : ffffffff81538e7a: movslq %esi,%rax 0.00 : ffffffff81538e7d: add 0xc8(%rdi),%rax 1.46 : ffffffff81538e84: mov %rsp,%rbp 0.00 : ffffffff81538e87: sub %esi,%edx 0.00 : ffffffff81538e89: rol $0x8,%dx 0.00 : ffffffff81538e8d: movzbl 0x9(%rax),%ecx 0.98 : ffffffff81538e91: mov %dx,0x2(%rax) iph->tot_len = newlen; 0.49 : ffffffff81538e95: xor %edx,%edx 0.00 : ffffffff81538e97: mov %dx,0xa(%rax) iph->check = 0; 0.00 : ffffffff81538e9b: mov (%rax),%edx // 32bit load -> stall 86.34 : ffffffff81538e9d: add 0x4(%rax),%edx 0.49 : ffffffff81538ea0: adc 0x8(%rax),%edx 0.49 : ffffffff81538ea3: adc 0xc(%rax),%edx 0.98 : ffffffff81538ea6: adc 0x10(%rax),%edx 0.49 : ffffffff81538ea9: adc $0x0,%edx 0.00 : ffffffff81538eac: mov %edx,%r8d 0.49 : ffffffff81538eaf: xor %dx,%dx 0.00 : ffffffff81538eb2: shl $0x10,%r8d 0.98 : ffffffff81538eb6: add %r8d,%edx 0.98 : ffffffff81538eb9: adc $0xffff,%edx 0.98 : ffffffff81538ebf: not %edx 0.00 : ffffffff81538ec1: shr $0x10,%edx 0.49 : ffffffff81538ec4: mov %dx,0xa(%rax) 0.00 : ffffffff81538ec8: movslq %ecx,%rax 0.00 : ffffffff81538ecb: mov -0x7e734f40(,%rax,8),%rax 0.00 : ffffffff81538ed3: test %rax,%rax 0.00 : ffffffff81538ed6: je ffffffff81538ef0 <inet_gro_complete+0x80> BTW, any idea why ip_fast_csum() on x86 contains a "memory" constraint ? It looks like a barrier() would be more appropriate. Following patch seems to help, but the stall seems to be the fact that we write on iph->check (16bits) before doing the checksum using 32bit loads. Note that we could share same ip_fast_csum() implementation between x86 32/64 bits... 
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h index e6fd8a026c7b..a81cc3a5facc 100644 --- a/arch/x86/include/asm/checksum_64.h +++ b/arch/x86/include/asm/checksum_64.h @@ -46,6 +46,24 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { unsigned int sum; + /* + * Callers might clear iph->check before calling us, make sure + * compiler won't mess things up... + */ + barrier(); + + if (__builtin_constant_p(ihl) && ihl == 5) { + asm(" movl (%[iph]), %[sum]\n" + " addl 4(%[iph]), %[sum]\n" + " adcl 8(%[iph]), %[sum]\n" + " adcl 12(%[iph]), %[sum]\n" + " adcl 16(%[iph]), %[sum]\n" + " adcl $0, %[sum]\n" + : [sum] "=r" (sum) + : [iph] "r" (iph) + ); + return csum_fold(sum); + } asm(" movl (%1), %0\n" " subl $4, %2\n" " jbe 2f\n" @@ -68,7 +86,7 @@ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) : "1" (iph), "2" (ihl) - : "memory"); + ); return (__force __sum16)sum; } diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 8c54870db792..90aa562dfbf5 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1434,8 +1434,9 @@ static int inet_gro_complete(struct sk_buff *skb, int nhoff) int proto = iph->protocol; int err = -ENOSYS; - csum_replace2(&iph->check, iph->tot_len, newlen); iph->tot_len = newlen; + iph->check = 0; + iph->check = ip_fast_csum((u8 *)iph, 5); rcu_read_lock(); ops = rcu_dereference(inet_offloads[proto]); -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/