Raymond Hettinger added the comment:

FWIW, my approach is to look at the most important code
paths to see if there is any work being done that isn't
essential for the result being computed.

Next, I look at the generated assembly to estimate speed
by counting the memory accesses (and whether they are
likely cache hits or cold, random accesses) and by looking
at the branches (and whether or not they are predictable).
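
As an illustration of that kind of accounting, here is a toy
lookup (not code from setobject.c) annotated with the loads
and branches the compiler has to emit for it:

#include <stddef.h>
#include <stdint.h>

typedef struct {
    long  *data;
    size_t mask;        /* table size - 1 */
} toy_table_t;

/* At -O2 this is roughly three memory loads (the last one dependent
   on the second) plus one conditional branch.  The loads of t->mask
   and t->data usually hit cache because the caller just touched the
   struct; the load of t->data[i] is the random access likely to miss;
   the branch on the sentinel is highly predictable when most probes
   succeed. */
long
toy_lookup(const toy_table_t *t, uint64_t hash)
{
    size_t i = (size_t)hash & t->mask;   /* load t->mask, AND in a register */
    long v = t->data[i];                 /* load t->data, then the random load */
    if (v == 0)                          /* predictable branch on the hot path */
        return -1;
    return v;
}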

The table=so->table assignment was being done for all code
paths but was only needed around the rich compare.  Here
is the before and after for the most important path
(the first lookup).  Note that the change saves one memory
spill and one reload.

BEFORE
------
_set_add_entry:
pushq   %r15
pushq   %r14
movq    %rdx, %r14
pushq   %r13
pushq   %r12
movq    %rdi, %r12
pushq   %rbp
movq    %rsi, %rbp
pushq   %rbx
subq    $56, %rsp
movq    40(%rdi), %rax
addq    $1, (%rsi)
movq    %rax, 16(%rsp)        <-- spill
movq    32(%r12), %rdx        
movq    %rdx, %r15
andq    %r14, %r15
movq    %r15, %rbx
salq    $4, %rbx
addq    16(%rsp), %rbx        <-- reload
movq    (%rbx), %rcx
testq   %rcx, %rcx
je  L430


AFTER
-----
_set_add_entry:
        pushq   %r15
        movq    %rdx, %r15
        pushq   %r14
        pushq   %r13
        pushq   %r12
        movq    %rdi, %r12
        pushq   %rbp
        movq    %rsi, %rbp
        pushq   %rbx
        subq    $56, %rsp
        movq    40(%rdi), %rdx
        addq    $1, (%rsi)        <-- no spill
        movq    %rdx, %r11
L428:
        movq    32(%r12), %rcx
        movq    %rcx, %r13
        andq    %r15, %r13
        movq    %r13, %rbx
        salq    $4, %rbx
        addq    %r11, %rbx         <-- from register
        movq    (%rbx), %r14
        testq   %r14, %r14
        je      L429
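
In C terms, the shape of that change is roughly the
following.  This is a simplified, self-contained model of
the probe loop, not the actual Objects/setobject.c code;
the point is that the table pointer is read only where it
is needed (and snapshotted only around the compare) instead
of being copied into a local at function entry, which is
what forced the spill.

#include <stddef.h>
#include <stdint.h>

typedef struct {
    uint64_t hash;
    void    *key;               /* NULL marks an unused slot */
} entry_t;

typedef struct {
    entry_t *table;
    size_t   mask;              /* number of slots - 1 (a power of two) */
} set_model_t;

/* eq() stands in for PyObject_RichCompareBool: it can run arbitrary
   code and therefore resize or replace the table behind our back. */
typedef int (*eq_fn)(void *a, void *b);

entry_t *
probe(set_model_t *so, void *key, uint64_t hash, eq_fn eq)
{
    size_t i = (size_t)hash & so->mask;

    for (;;) {
        entry_t *entry = &so->table[i];       /* index off so->table directly */
        if (entry->key == NULL)
            return entry;                     /* unused slot: caller inserts */
        if (entry->hash == hash) {
            entry_t *table = so->table;       /* snapshot only around the compare */
            if (eq(entry->key, key))
                return entry;                 /* likely case: keys are equal */
            if (table != so->table) {         /* eq() mutated the set: */
                i = (size_t)hash & so->mask;  /* restart the probe sequence */
                continue;
            }
        }
        i = (i + 1) & so->mask;               /* simplified linear probing */
    }
}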


The code around the rich compare used to do memory
loads that aren't necessary in the most likely case
(since the 64-bit hash values already match, the
comparison is very likely to report a match).

BEFORE
------

call    _PyObject_RichCompareBool
movq    24(%rsp), %rcx
movq    (%rcx), %rdi
leaq    -1(%rdi), %rdx
testq   %rdx, %rdx
movq    %rdx, (%rcx)
je  L489
testl   %eax, %eax            
js  L437                      <--- predictable error branch
movq    40(%r12), %rdx        <--- memory load 
cmpq    16(%rsp), %rdx        <--- memory load
jne L460                 
cmpq    (%rbx), %rcx          <--- memory load  
jne L429                      <--- predictable restart branch
testl   %eax, %eax            <--- predictable found_active branch            
jne L432                      <--- most common exit point
movq    32(%r12), %rdx


AFTER
-----

        call    _PyObject_RichCompareBool
        movq    16(%rsp), %rcx
        movq    (%rcx), %rdi
        leaq    -1(%rdi), %rdx
        testq   %rdx, %rdx
        movq    %rdx, (%rcx)
        je      L485
        cmpl    $0, %eax
        jg      L431                  <-- common exit before the memory loads!
L490:
        jne     L434
        movq    40(%r12), %rdx    <--- memory load 
        cmpq    %rdx, 24(%rsp)    <--- memory load 
        movq    %rdx, %r11
        jne     L428
        cmpq    (%rbx), %rcx      <--- memory load 
        jne     L428
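
At the source level the corresponding ordering looks
roughly like this (again a hedged sketch, not the actual
patch): test the comparison result first, and only on the
unlikely paths reload the table pointer and the entry's
key to decide whether to restart.

#include <stddef.h>
#include <stdint.h>

typedef struct { uint64_t hash; void *key; } entry_t;
typedef struct { entry_t *table; size_t mask; } set_model_t;

enum probe_result { FOUND, KEEP_PROBING, RESTART, CMP_ERROR };

/* Handling of one hash-equal entry after the rich compare.  eq()
   models PyObject_RichCompareBool (1 = equal, 0 = not equal,
   -1 = error). */
enum probe_result
after_compare(set_model_t *so, entry_t *entry, entry_t *table_snapshot,
              void *startkey, void *key, int (*eq)(void *, void *))
{
    int cmp = eq(startkey, key);
    if (cmp > 0)
        return FOUND;          /* likely exit, taken before any reloads */
    if (cmp < 0)
        return CMP_ERROR;      /* the comparison raised an error */
    /* Unlikely: keys unequal.  Only now touch memory again, to see
       whether the compare resized the table or replaced the entry's
       key. */
    if (so->table != table_snapshot || entry->key != startkey)
        return RESTART;        /* the probe sequence must start over */
    return KEEP_PROBING;
}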

----------

_______________________________________
Python tracker <rep...@bugs.python.org>
<http://bugs.python.org/issue24681>
_______________________________________