* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> 100.000000 total
> ................
>   1.469183 tcp_current_mss

                      hits (total: 146918)
                 .........
ffffffff804c5237:      526 <tcp_current_mss>:
ffffffff804c5237:      526      41 54                   push   %r12
ffffffff804c5239:     5929      55                      push   %rbp
ffffffff804c523a:       32      53                      push   %rbx
ffffffff804c523b:      294      48 89 fb                mov    %rdi,%rbx
ffffffff804c523e:      539      48 83 ec 30             sub    $0x30,%rsp
ffffffff804c5242:     2590      85 f6                   test   %esi,%esi
ffffffff804c5244:      444      48 8b 4f 78             mov    0x78(%rdi),%rcx
ffffffff804c5248:      521      8b af 4c 04 00 00       mov    0x44c(%rdi),%ebp
ffffffff804c524e:      791      74 2a                   je     ffffffff804c527a <tcp_current_mss+0x43>
ffffffff804c5250:      433      8b 87 00 01 00 00       mov    0x100(%rdi),%eax
ffffffff804c5256:      236      c1 e0 10                shl    $0x10,%eax
ffffffff804c5259:      191      89 c2                   mov    %eax,%edx
ffffffff804c525b:      487      23 97 fc 00 00 00       and    0xfc(%rdi),%edx
ffffffff804c5261:      362      39 c2                   cmp    %eax,%edx
ffffffff804c5263:      342      75 15                   jne    ffffffff804c527a <tcp_current_mss+0x43>
ffffffff804c5265:      473      45 31 e4                xor    %r12d,%r12d
ffffffff804c5268:      221      8b 87 00 04 00 00       mov    0x400(%rdi),%eax
ffffffff804c526e:      194      3b 87 80 04 00 00       cmp    0x480(%rdi),%eax
ffffffff804c5274:      445      41 0f 94 c4             sete   %r12b
ffffffff804c5278:      261      eb 03                   jmp    ffffffff804c527d <tcp_current_mss+0x46>
ffffffff804c527a:        0      45 31 e4                xor    %r12d,%r12d
ffffffff804c527d:      185      48 85 c9                test   %rcx,%rcx
ffffffff804c5280:      686      74 15                   je     ffffffff804c5297 <tcp_current_mss+0x60>
ffffffff804c5282:     1806      8b 71 7c                mov    0x7c(%rcx),%esi
ffffffff804c5285:        1      3b b3 5c 03 00 00       cmp    0x35c(%rbx),%esi
ffffffff804c528b:       21      74 0a                   je     ffffffff804c5297 <tcp_current_mss+0x60>
ffffffff804c528d:        0      48 89 df                mov    %rbx,%rdi
ffffffff804c5290:        0      e8 8b fb ff ff          callq  ffffffff804c4e20 <tcp_sync_mss>
ffffffff804c5295:        0      89 c5                   mov    %eax,%ebp
ffffffff804c5297:      864      48 8d 4c 24 28          lea    0x28(%rsp),%rcx
ffffffff804c529c:      634      48 8d 54 24 10          lea    0x10(%rsp),%rdx
ffffffff804c52a1:      995      31 f6                   xor    %esi,%esi
ffffffff804c52a3:        0      48 89 df                mov    %rbx,%rdi
ffffffff804c52a6:        2      e8 f2 fe ff ff          callq  ffffffff804c519d <tcp_established_options>
ffffffff804c52ab:      859      8b 8b e8 03 00 00       mov    0x3e8(%rbx),%ecx
ffffffff804c52b1:      936      83 c0 14                add    $0x14,%eax
ffffffff804c52b4:        6      0f b7 d1                movzwl %cx,%edx
ffffffff804c52b7:        0      39 d0                   cmp    %edx,%eax
ffffffff804c52b9:      911      74 04                   je     ffffffff804c52bf <tcp_current_mss+0x88>
ffffffff804c52bb:        0      29 d0                   sub    %edx,%eax
ffffffff804c52bd:        0      29 c5                   sub    %eax,%ebp
ffffffff804c52bf:        0      45 85 e4                test   %r12d,%r12d
ffffffff804c52c2:     6894      89 e8                   mov    %ebp,%eax
ffffffff804c52c4:        0      74 38                   je     ffffffff804c52fe <tcp_current_mss+0xc7>
ffffffff804c52c6:      990      48 8b 83 68 03 00 00    mov    0x368(%rbx),%rax
ffffffff804c52cd:      642      8b b3 04 01 00 00       mov    0x104(%rbx),%esi
ffffffff804c52d3:        3      48 89 df                mov    %rbx,%rdi
ffffffff804c52d6:      240      66 2b 70 30             sub    0x30(%rax),%si
ffffffff804c52da:      588      66 2b b3 7e 03 00 00    sub    0x37e(%rbx),%si
ffffffff804c52e1:        2      66 29 ce                sub    %cx,%si
ffffffff804c52e4:      284      ff ce                   dec    %esi
ffffffff804c52e6:      664      0f b7 f6                movzwl %si,%esi
ffffffff804c52e9:        2      e8 0a fb ff ff          callq  ffffffff804c4df8 <tcp_bound_to_half_wnd>
ffffffff804c52ee:       68      0f b7 d0                movzwl %ax,%edx
ffffffff804c52f1:     1870      89 c1                   mov    %eax,%ecx
ffffffff804c52f3:        0      89 d0                   mov    %edx,%eax
ffffffff804c52f5:        0      31 d2                   xor    %edx,%edx
ffffffff804c52f7:     2135      f7 f5                   div    %ebp
ffffffff804c52f9:   107010      89 c8                   mov    %ecx,%eax
ffffffff804c52fb:     1670      66 29 d0                sub    %dx,%ax
ffffffff804c52fe:        0      66 89 83 ea 03 00 00    mov    %ax,0x3ea(%rbx)
ffffffff804c5305:        4      48 83 c4 30             add    $0x30,%rsp
ffffffff804c5309:      855      89 e8                   mov    %ebp,%eax
ffffffff804c530b:        0      5b                      pop    %rbx
ffffffff804c530c:      797      5d                      pop    %rbp
ffffffff804c530d:        0      41 5c                   pop    %r12
ffffffff804c530f:        0      c3                      retq   

apparently this division causes 1.0% of tbench overhead:

ffffffff804c52f5:        0      31 d2                   xor    %edx,%edx
ffffffff804c52f7:     2135      f7 f5                   div    %ebp
ffffffff804c52f9:   107010      89 c8                   mov    %ecx,%eax

(gdb) list *0xffffffff804c52f7
0xffffffff804c52f7 is in tcp_current_mss (net/ipv4/tcp_output.c:1078).
1073                                      inet_csk(sk)->icsk_af_ops->net_header_len -
1074                                      inet_csk(sk)->icsk_ext_hdr_len -
1075                                      tp->tcp_header_len);
1076    
1077                    xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
1078                    xmit_size_goal -= (xmit_size_goal % mss_now);
1079            }
1080            tp->xmit_size_goal = xmit_size_goal;
1081    
1082            return mss_now;
(gdb) 

it's this division:

        if (doing_tso) {
        [...]
                        xmit_size_goal -= (xmit_size_goal % mss_now);

Has no-one hit this before? Perhaps this is why switching loopback 
networking to TSO had a performance impact for others?

It's still a bit weird ... how can a single division cause this much 
overhead? tcp_bound_to_half_wnd() [which is called straight before 
this sequence] seems low-overhead.

        Ingo
--
To unsubscribe from this list: send the line "unsubscribe kernel-testers" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to