* Ingo Molnar <[EMAIL PROTECTED]> wrote:

> 100.000000 total
> ................
>   3.356152 ip_queue_xmit

                      hits (335615 total)
                 .........
ffffffff804b7045:     1001 <ip_queue_xmit>:
ffffffff804b7045:     1001      41 57                   push   %r15
ffffffff804b7047:    36698      41 56                   push   %r14
ffffffff804b7049:        0      49 89 fe                mov    %rdi,%r14
ffffffff804b704c:        0      41 55                   push   %r13
ffffffff804b704e:      447      41 54                   push   %r12
ffffffff804b7050:        0      55                      push   %rbp
ffffffff804b7051:        4      53                      push   %rbx
ffffffff804b7052:      465      48 83 ec 68             sub    $0x68,%rsp
ffffffff804b7056:        1      89 74 24 08             mov    %esi,0x8(%rsp)
ffffffff804b705a:      486      48 8b 47 28             mov    0x28(%rdi),%rax
ffffffff804b705e:        0      48 8b 6f 10             mov    0x10(%rdi),%rbp
ffffffff804b7062:        7      48 85 c0                test   %rax,%rax
ffffffff804b7065:      480      48 89 44 24 58          mov    %rax,0x58(%rsp)
ffffffff804b706a:        0      4c 8b bd 48 02 00 00    mov    0x248(%rbp),%r15
ffffffff804b7071:        7      0f 85 0d 01 00 00       jne    ffffffff804b7184 
<ip_queue_xmit+0x13f>
ffffffff804b7077:      452      31 f6                   xor    %esi,%esi
ffffffff804b7079:        0      48 89 ef                mov    %rbp,%rdi
ffffffff804b707c:        5      e8 c1 eb fc ff          callq  ffffffff80485c42 
<__sk_dst_check>
ffffffff804b7081:      434      48 85 c0                test   %rax,%rax
ffffffff804b7084:       54      48 89 44 24 58          mov    %rax,0x58(%rsp)
ffffffff804b7089:        0      0f 85 e0 00 00 00       jne    ffffffff804b716f 
<ip_queue_xmit+0x12a>
ffffffff804b708f:        0      4d 85 ff                test   %r15,%r15
ffffffff804b7092:        0      44 8b ad 30 02 00 00    mov    0x230(%rbp),%r13d
ffffffff804b7099:        0      74 0a                   je     ffffffff804b70a5 
<ip_queue_xmit+0x60>
ffffffff804b709b:        0      41 80 7f 05 00          cmpb   $0x0,0x5(%r15)
ffffffff804b70a0:        0      74 03                   je     ffffffff804b70a5 
<ip_queue_xmit+0x60>
ffffffff804b70a2:        0      45 8b 2f                mov    (%r15),%r13d
ffffffff804b70a5:        0      8b 85 3c 02 00 00       mov    0x23c(%rbp),%eax
ffffffff804b70ab:        0      48 8d b5 10 01 00 00    lea    0x110(%rbp),%rsi
ffffffff804b70b2:        0      44 8b 65 04             mov    0x4(%rbp),%r12d
ffffffff804b70b6:        0      bf 0d 00 00 00          mov    $0xd,%edi
ffffffff804b70bb:        0      89 44 24 0c             mov    %eax,0xc(%rsp)
ffffffff804b70bf:        0      8a 9d 54 02 00 00       mov    0x254(%rbp),%bl
ffffffff804b70c5:        0      e8 9a df ff ff          callq  ffffffff804b5064 
<constant_test_bit>
ffffffff804b70ca:        0      31 d2                   xor    %edx,%edx
ffffffff804b70cc:        0      48 8d 7c 24 10          lea    0x10(%rsp),%rdi
ffffffff804b70d1:        0      41 89 c3                mov    %eax,%r11d
ffffffff804b70d4:        0      fc                      cld    
ffffffff804b70d5:        0      89 d0                   mov    %edx,%eax
ffffffff804b70d7:        0      b9 10 00 00 00          mov    $0x10,%ecx
ffffffff804b70dc:        0      44 8a 45 39             mov    0x39(%rbp),%r8b
ffffffff804b70e0:        0      40 8a b5 57 02 00 00    mov    0x257(%rbp),%sil
ffffffff804b70e7:        0      44 8b 8d 50 02 00 00    mov    0x250(%rbp),%r9d
ffffffff804b70ee:        0      83 e3 1e                and    $0x1e,%ebx
ffffffff804b70f1:        0      44 8b 95 38 02 00 00    mov    0x238(%rbp),%r10d
ffffffff804b70f8:        0      44 09 db                or     %r11d,%ebx
ffffffff804b70fb:        0      f3 ab                   rep stos %eax,%es:(%rdi)
ffffffff804b70fd:        0      40 c0 ee 05             shr    $0x5,%sil
ffffffff804b7101:        0      88 5c 24 24             mov    %bl,0x24(%rsp)
ffffffff804b7105:        0      48 8d 5c 24 10          lea    0x10(%rsp),%rbx
ffffffff804b710a:        0      83 e6 01                and    $0x1,%esi
ffffffff804b710d:        0      48 89 ef                mov    %rbp,%rdi
ffffffff804b7110:        0      44 88 44 24 40          mov    %r8b,0x40(%rsp)
ffffffff804b7115:        0      8b 44 24 0c             mov    0xc(%rsp),%eax
ffffffff804b7119:        0      40 88 74 24 41          mov    %sil,0x41(%rsp)
ffffffff804b711e:        0      48 89 de                mov    %rbx,%rsi
ffffffff804b7121:        0      66 44 89 4c 24 44       mov    %r9w,0x44(%rsp)
ffffffff804b7127:        0      66 44 89 54 24 46       mov    %r10w,0x46(%rsp)
ffffffff804b712d:        0      44 89 64 24 10          mov    %r12d,0x10(%rsp)
ffffffff804b7132:        0      44 89 6c 24 1c          mov    %r13d,0x1c(%rsp)
ffffffff804b7137:        0      89 44 24 20             mov    %eax,0x20(%rsp)
ffffffff804b713b:        0      e8 2d 9f e5 ff          callq  ffffffff8031106d 
<security_sk_classify_flow>
ffffffff804b7140:        0      48 8d 74 24 58          lea    0x58(%rsp),%rsi
ffffffff804b7145:        0      45 31 c0                xor    %r8d,%r8d
ffffffff804b7148:        0      48 89 e9                mov    %rbp,%rcx
ffffffff804b714b:        0      48 89 da                mov    %rbx,%rdx
ffffffff804b714e:        0      48 c7 c7 d0 15 ab 80    mov    
$0xffffffff80ab15d0,%rdi
ffffffff804b7155:        0      e8 1a 91 ff ff          callq  ffffffff804b0274 
<ip_route_output_flow>
ffffffff804b715a:        0      85 c0                   test   %eax,%eax
ffffffff804b715c:        0      0f 85 9f 01 00 00       jne    ffffffff804b7301 
<ip_queue_xmit+0x2bc>
ffffffff804b7162:        0      48 8b 74 24 58          mov    0x58(%rsp),%rsi
ffffffff804b7167:        0      48 89 ef                mov    %rbp,%rdi
ffffffff804b716a:        0      e8 a8 eb fc ff          callq  ffffffff80485d17 
<sk_setup_caps>
ffffffff804b716f:      441      48 8b 44 24 58          mov    0x58(%rsp),%rax
ffffffff804b7174:     1388      48 85 c0                test   %rax,%rax
ffffffff804b7177:        0      74 07                   je     ffffffff804b7180 
<ip_queue_xmit+0x13b>
ffffffff804b7179:        0      f0 ff 80 b0 00 00 00    lock incl 0xb0(%rax)
ffffffff804b7180:      556      49 89 46 28             mov    %rax,0x28(%r14)
ffffffff804b7184:     8351      4d 85 ff                test   %r15,%r15
ffffffff804b7187:        0      be 14 00 00 00          mov    $0x14,%esi
ffffffff804b718c:      461      74 26                   je     ffffffff804b71b4 
<ip_queue_xmit+0x16f>
ffffffff804b718e:        0      41 f6 47 08 01          testb  $0x1,0x8(%r15)
ffffffff804b7193:        0      74 17                   je     ffffffff804b71ac 
<ip_queue_xmit+0x167>
ffffffff804b7195:        0      48 8b 54 24 58          mov    0x58(%rsp),%rdx
ffffffff804b719a:        0      8b 82 28 01 00 00       mov    0x128(%rdx),%eax
ffffffff804b71a0:        0      39 82 1c 01 00 00       cmp    %eax,0x11c(%rdx)
ffffffff804b71a6:        0      0f 85 55 01 00 00       jne    ffffffff804b7301 
<ip_queue_xmit+0x2bc>
ffffffff804b71ac:        0      41 0f b6 47 04          movzbl 0x4(%r15),%eax
ffffffff804b71b1:        0      8d 70 14                lea    0x14(%rax),%esi
ffffffff804b71b4:       39      4c 89 f7                mov    %r14,%rdi
ffffffff804b71b7:      493      e8 f8 18 fd ff          callq  ffffffff80488ab4 
<skb_push>
ffffffff804b71bc:        0      4c 89 f7                mov    %r14,%rdi
ffffffff804b71bf:     1701      e8 99 df ff ff          callq  ffffffff804b515d 
<skb_reset_network_header>
ffffffff804b71c4:      481      0f b6 85 54 02 00 00    movzbl 0x254(%rbp),%eax
ffffffff804b71cb:     4202      41 8b 9e bc 00 00 00    mov    0xbc(%r14),%ebx
ffffffff804b71d2:        3      48 89 ef                mov    %rbp,%rdi
ffffffff804b71d5:        0      49 03 9e d0 00 00 00    add    0xd0(%r14),%rbx
ffffffff804b71dc:      466      80 cc 45                or     $0x45,%ah
ffffffff804b71df:        7      66 c1 c0 08             rol    $0x8,%ax
ffffffff804b71e3:        0      66 89 03                mov    %ax,(%rbx)
ffffffff804b71e6:      492      48 8b 74 24 58          mov    0x58(%rsp),%rsi
ffffffff804b71eb:        3      e8 a0 df ff ff          callq  ffffffff804b5190 
<ip_dont_fragment>
ffffffff804b71f0:     1405      85 c0                   test   %eax,%eax
ffffffff804b71f2:     4391      74 0f                   je     ffffffff804b7203 
<ip_queue_xmit+0x1be>
ffffffff804b71f4:        0      83 7c 24 08 00          cmpl   $0x0,0x8(%rsp)
ffffffff804b71f9:      417      75 08                   jne    ffffffff804b7203 
<ip_queue_xmit+0x1be>
ffffffff804b71fb:      503      66 c7 43 06 40 00       movw   $0x40,0x6(%rbx)
ffffffff804b7201:     6743      eb 06                   jmp    ffffffff804b7209 
<ip_queue_xmit+0x1c4>
ffffffff804b7203:        0      66 c7 43 06 00 00       movw   $0x0,0x6(%rbx)
ffffffff804b7209:      118      0f bf 85 40 02 00 00    movswl 0x240(%rbp),%eax
ffffffff804b7210:    10867      48 8b 54 24 58          mov    0x58(%rsp),%rdx
ffffffff804b7215:      340      85 c0                   test   %eax,%eax
ffffffff804b7217:        0      79 06                   jns    ffffffff804b721f 
<ip_queue_xmit+0x1da>
ffffffff804b7219:   107464      8b 82 9c 00 00 00       mov    0x9c(%rdx),%eax
ffffffff804b721f:     4963      88 43 08                mov    %al,0x8(%rbx)
ffffffff804b7222:    26297      8a 45 39                mov    0x39(%rbp),%al
ffffffff804b7225:    76658      4d 85 ff                test   %r15,%r15
ffffffff804b7228:     1712      88 43 09                mov    %al,0x9(%rbx)
ffffffff804b722b:      148      48 8b 44 24 58          mov    0x58(%rsp),%rax
ffffffff804b7230:     2971      8b 80 20 01 00 00       mov    0x120(%rax),%eax
ffffffff804b7236:    14849      89 43 0c                mov    %eax,0xc(%rbx)
ffffffff804b7239:       84      48 8b 44 24 58          mov    0x58(%rsp),%rax
ffffffff804b723e:      360      8b 80 1c 01 00 00       mov    0x11c(%rax),%eax
ffffffff804b7244:      174      89 43 10                mov    %eax,0x10(%rbx)
ffffffff804b7247:       96      74 32                   je     ffffffff804b727b 
<ip_queue_xmit+0x236>
ffffffff804b7249:        0      41 8a 57 04             mov    0x4(%r15),%dl
ffffffff804b724d:        0      84 d2                   test   %dl,%dl
ffffffff804b724f:        0      74 2a                   je     ffffffff804b727b 
<ip_queue_xmit+0x236>
ffffffff804b7251:        0      c0 ea 02                shr    $0x2,%dl
ffffffff804b7254:        0      03 13                   add    (%rbx),%edx
ffffffff804b7256:        0      8a 03                   mov    (%rbx),%al
ffffffff804b7258:        0      45 31 c0                xor    %r8d,%r8d
ffffffff804b725b:        0      4c 89 fe                mov    %r15,%rsi
ffffffff804b725e:        0      4c 89 f7                mov    %r14,%rdi
ffffffff804b7261:        0      83 e0 f0                and    
$0xfffffffffffffff0,%eax
ffffffff804b7264:        0      83 e2 0f                and    $0xf,%edx
ffffffff804b7267:        0      09 d0                   or     %edx,%eax
ffffffff804b7269:        0      88 03                   mov    %al,(%rbx)
ffffffff804b726b:        0      48 8b 4c 24 58          mov    0x58(%rsp),%rcx
ffffffff804b7270:        0      8b 95 30 02 00 00       mov    0x230(%rbp),%edx
ffffffff804b7276:        0      e8 e4 d8 ff ff          callq  ffffffff804b4b5f 
<ip_options_build>
ffffffff804b727b:      541      41 8b 86 c8 00 00 00    mov    0xc8(%r14),%eax
ffffffff804b7282:      570      31 d2                   xor    %edx,%edx
ffffffff804b7284:        0      49 03 86 d0 00 00 00    add    0xd0(%r14),%rax
ffffffff804b728b:       34      8b 40 08                mov    0x8(%rax),%eax
ffffffff804b728e:      496      66 85 c0                test   %ax,%ax
ffffffff804b7291:       11      74 06                   je     ffffffff804b7299 
<ip_queue_xmit+0x254>
ffffffff804b7293:        9      0f b7 c0                movzwl %ax,%eax
ffffffff804b7296:      495      8d 50 ff                lea    -0x1(%rax),%edx
ffffffff804b7299:        2      f6 43 06 40             testb  $0x40,0x6(%rbx)
ffffffff804b729d:        9      48 8b 74 24 58          mov    0x58(%rsp),%rsi
ffffffff804b72a2:      497      74 34                   je     ffffffff804b72d8 
<ip_queue_xmit+0x293>
ffffffff804b72a4:        8      83 bd 30 02 00 00 00    cmpl   $0x0,0x230(%rbp)
ffffffff804b72ab:       10      74 23                   je     ffffffff804b72d0 
<ip_queue_xmit+0x28b>
ffffffff804b72ad:     1044      66 8b 85 52 02 00 00    mov    0x252(%rbp),%ax
ffffffff804b72b4:        7      66 c1 c0 08             rol    $0x8,%ax
ffffffff804b72b8:        8      66 89 43 04             mov    %ax,0x4(%rbx)
ffffffff804b72bc:      432      66 8b 85 52 02 00 00    mov    0x252(%rbp),%ax
ffffffff804b72c3:        9      ff c0                   inc    %eax
ffffffff804b72c5:       14      01 d0                   add    %edx,%eax
ffffffff804b72c7:     1141      66 89 85 52 02 00 00    mov    %ax,0x252(%rbp)
ffffffff804b72ce:        7      eb 10                   jmp    ffffffff804b72e0 
<ip_queue_xmit+0x29b>
ffffffff804b72d0:        0      66 c7 43 04 00 00       movw   $0x0,0x4(%rbx)
ffffffff804b72d6:        0      eb 08                   jmp    ffffffff804b72e0 
<ip_queue_xmit+0x29b>
ffffffff804b72d8:        0      48 89 df                mov    %rbx,%rdi
ffffffff804b72db:        0      e8 b7 9d ff ff          callq  ffffffff804b1097 
<__ip_select_ident>
ffffffff804b72e0:        6      8b 85 54 01 00 00       mov    0x154(%rbp),%eax
ffffffff804b72e6:      458      4c 89 f7                mov    %r14,%rdi
ffffffff804b72e9:        2      41 89 46 78             mov    %eax,0x78(%r14)
ffffffff804b72ed:        4      8b 85 f0 01 00 00       mov    0x1f0(%rbp),%eax
ffffffff804b72f3:      841      41 89 86 b0 00 00 00    mov    %eax,0xb0(%r14)
ffffffff804b72fa:       11      e8 30 f2 ff ff          callq  ffffffff804b652f 
<ip_local_out>
ffffffff804b72ff:        0      eb 44                   jmp    ffffffff804b7345 
<ip_queue_xmit+0x300>
ffffffff804b7301:        0      65 48 8b 04 25 10 00    mov    %gs:0x10,%rax
ffffffff804b7308:        0      00 00 
ffffffff804b730a:        0      8b 80 48 e0 ff ff       mov    
-0x1fb8(%rax),%eax
ffffffff804b7310:        0      4c 89 f7                mov    %r14,%rdi
ffffffff804b7313:        0      30 c0                   xor    %al,%al
ffffffff804b7315:        0      66 83 f8 01             cmp    $0x1,%ax
ffffffff804b7319:        0      48 19 c0                sbb    %rax,%rax
ffffffff804b731c:        0      83 e0 08                and    $0x8,%eax
ffffffff804b731f:        0      48 8b 90 a8 16 ab 80    mov    
-0x7f54e958(%rax),%rdx
ffffffff804b7326:        0      65 8b 04 25 24 00 00    mov    %gs:0x24,%eax
ffffffff804b732d:        0      00 
ffffffff804b732e:        0      89 c0                   mov    %eax,%eax
ffffffff804b7330:        0      48 f7 d2                not    %rdx
ffffffff804b7333:        0      48 8b 04 c2             mov    
(%rdx,%rax,8),%rax
ffffffff804b7337:        0      48 ff 40 68             incq   0x68(%rax)
ffffffff804b733b:        0      e8 b1 18 fd ff          callq  ffffffff80488bf1 
<kfree_skb>
ffffffff804b7340:        0      b8 8f ff ff ff          mov    $0xffffff8f,%eax
ffffffff804b7345:     9196      48 83 c4 68             add    $0x68,%rsp
ffffffff804b7349:      892      5b                      pop    %rbx
ffffffff804b734a:        0      5d                      pop    %rbp
ffffffff804b734b:      488      41 5c                   pop    %r12
ffffffff804b734d:        0      41 5d                   pop    %r13
ffffffff804b734f:        0      41 5e                   pop    %r14
ffffffff804b7351:      513      41 5f                   pop    %r15
ffffffff804b7353:        0      c3                      retq   

about 10% of this function's cost is artificial:

ffffffff804b7045:     1001 <ip_queue_xmit>:
ffffffff804b7045:     1001      41 57                   push   %r15
ffffffff804b7047:    36698      41 56                   push   %r14

there are profiler hits that leaked in via out-of-order execution from 
the callsites. The callsites are hard to map unfortunately, as this 
function is called via function pointers.

the most likely callsite is tcp_transmit_skb().

30% of the overhead of this function comes from:

ffffffff804b7203:        0      66 c7 43 06 00 00       movw   $0x0,0x6(%rbx)
ffffffff804b7209:      118      0f bf 85 40 02 00 00    movswl 0x240(%rbp),%eax
ffffffff804b7210:    10867      48 8b 54 24 58          mov    0x58(%rsp),%rdx
ffffffff804b7215:      340      85 c0                   test   %eax,%eax
ffffffff804b7217:        0      79 06                   jns    ffffffff804b721f 
<ip_queue_xmit+0x1da>
ffffffff804b7219:   107464      8b 82 9c 00 00 00       mov    0x9c(%rdx),%eax
ffffffff804b721f:     4963      88 43 08                mov    %al,0x8(%rbx)

the 16-bit movw looks a bit weird. It comes from line 372:

 0xffffffff804b7203 is in ip_queue_xmit (net/ipv4/ip_output.c:372).
 367            iph = ip_hdr(skb);
 368            *((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 
0xff));
 369            if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
 370                    iph->frag_off = htons(IP_DF);
 371            else
 372                    iph->frag_off = 0;
 373            iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
 374            iph->protocol = sk->sk_protocol;
 375            iph->saddr    = rt->rt_src;
 376            iph->daddr    = rt->rt_dst;

the ip-header fragment flag setting to zero.

16-bit ops are an on-off love/hate affair on x86 CPUs. The trend is 
towards eliminating them as much as possible.

_But_, the real overhead probably comes from:

 ffffffff804b7210:    10867     48 8b 54 24 58          mov    0x58(%rsp),%rdx

which is the next line, the ttl field:

 373             iph->ttl      = ip_select_ttl(inet, &rt->u.dst);

this shows that we are doing a hard cachemiss on the net-localhost 
route dst structure cacheline. We do a plain load instruction from it 
here and get a hefty cachemiss. (because 16 CPUs are banging on that 
single route)

And let make sure we see this in perspective as well: that single 
cachemiss is _1.0 percent_ of the total tbench cost. (!) We could make 
the scheduler 10% slower straight away and it would have less of a 
real-life effect than this single iph->ttl field setting.

        Ingo
--
To unsubscribe from this list: send the line "unsubscribe kernel-testers" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to