On 04/03/2013 07:05 AM, Neil Hodgson wrote:
Dave Angel:

That would seem to imply that the speed regression on your data is NOT
caused by the differing size encodings. Perhaps it is the difference in
MSC compiler version, or other changes made between 3.2 and 3.3

    Its not caused by there actually being different size encodings but
that the code is checking encoding size 2-4 times for each character.

    Back in 3.2 the comparison loop looked like:

     while (len1 > 0 && len2 > 0) {
         Py_UNICODE c1, c2;

         c1 = *s1++;
         c2 = *s2++;

         if (c1 != c2)
             return (c1 < c2) ? -1 : 1;

         len1--; len2--;
     }

    For 3.3 this has changed to

     for (i = 0; i < len1 && i < len2; ++i) {
         Py_UCS4 c1, c2;
         c1 = PyUnicode_READ(kind1, data1, i);
         c2 = PyUnicode_READ(kind2, data2, i);

         if (c1 != c2)
             return (c1 < c2) ? -1 : 1;
     }

     with PyUnicode_READ being

#define PyUnicode_READ(kind, data, index) \
     ((Py_UCS4) \
     ((kind) == PyUnicode_1BYTE_KIND ? \
         ((const Py_UCS1 *)(data))[(index)] : \
         ((kind) == PyUnicode_2BYTE_KIND ? \
             ((const Py_UCS2 *)(data))[(index)] : \
             ((const Py_UCS4 *)(data))[(index)] \
         ) \
     ))

     There are either 1 or 2 kind checks in each call to PyUnicode_READ
and 2 calls to PyUnicode_READ inside the loop. A compiler may decide to
move the kind checks out of the loop and specialize the loop but MSVC
2010 appears to not do so.

I don't know how good MSC's template logic is, but it seems this would be a good case for an explicit template, typed on the 'kind's values. Or are all C++ features disabled when compiling Python? Failing that, just code up 9 cases, and do a switch on the kinds.

I'm also puzzled. I thought that the sort algorithm used a hash of all the items to be sorted, and only reverted to a raw comparison of the original values when the hash collided. Is that not the case? Or is the code you post here only used when the hash collides?


 The assembler (32-bit build) for each
PyUnicode_READ looks like

     mov    ecx, DWORD PTR _kind1$[ebp]
     cmp    ecx, 1
     jne    SHORT $LN17@unicode_co@2
     lea    ecx, DWORD PTR [ebx+eax]
     movzx    edx, BYTE PTR [ecx+edx]
     jmp    SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
     cmp    ecx, 2
     jne    SHORT $LN15@unicode_co@2
     movzx    edx, WORD PTR [ebx+edi]
     jmp    SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
     mov    edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:

It appears that the compiler is keeping the three pointers in three separate registers (eax, esi and edi) even though those are 3 aliases for the same pointer. This is preventing it from putting other values in those registers.

It'd probably do better if the C code manipulated the pointers, rather than using an index i each time. But if it did, perhaps gcc would generate worse code.

If I were coding the assembler by hand (Intel only), I'd be able to avoid the multiple cmp operations, simply by comparing first to 2, then doing a jne and a ja. I dunno whether the compiler would notice if I coded the equivalent in C. (make both comparisons to 2, one for less, and one for more)


    The kind1/kind2 variables aren't even going into registers and at
least one test+branch and a jump are executed for every character. Two
tests for 2 and 4 byte kinds. len1 and len2 don't get to go into
registers either.

    Here's the full assembler output for unicode_compare:

;    COMDAT _unicode_compare
_TEXT    SEGMENT
_kind2$ = -20                        ; size = 4
_kind1$ = -16                        ; size = 4
_len2$ = -12                        ; size = 4
_len1$ = -8                        ; size = 4
_data2$ = -4                        ; size = 4
_unicode_compare PROC                    ; COMDAT
; _str1$ = ecx
; _str2$ = eax

; 10417: {

     push    ebp
     mov    ebp, esp
     sub    esp, 20                    ; 00000014H
     push    ebx
     push    esi
     mov    esi, eax

; 10418:     int kind1, kind2;
; 10419:     void *data1, *data2;
; 10420:     Py_ssize_t len1, len2, i;
; 10421:
; 10422:     kind1 = PyUnicode_KIND(str1);

     mov    eax, DWORD PTR [ecx+16]
     mov    edx, eax
     shr    edx, 2
     and    edx, 7
     push    edi
     mov    DWORD PTR _kind1$[ebp], edx

; 10423:     kind2 = PyUnicode_KIND(str2);

     mov    edx, DWORD PTR [esi+16]
     mov    edi, edx
     shr    edi, 2
     and    edi, 7
     mov    DWORD PTR _kind2$[ebp], edi

; 10424:     data1 = PyUnicode_DATA(str1);

     test    al, 32                    ; 00000020H
     je    SHORT $LN9@unicode_co@2
     test    al, 64                    ; 00000040H
     je    SHORT $LN7@unicode_co@2
     lea    ebx, DWORD PTR [ecx+24]
     jmp    SHORT $LN10@unicode_co@2
$LN7@unicode_co@2:
     lea    ebx, DWORD PTR [ecx+36]
     jmp    SHORT $LN10@unicode_co@2
$LN9@unicode_co@2:
     mov    ebx, DWORD PTR [ecx+36]
$LN10@unicode_co@2:

; 10425:     data2 = PyUnicode_DATA(str2);

     test    dl, 32                    ; 00000020H
     je    SHORT $LN13@unicode_co@2
     test    dl, 64                    ; 00000040H
     je    SHORT $LN11@unicode_co@2
     lea    edx, DWORD PTR [esi+24]
     jmp    SHORT $LN30@unicode_co@2
$LN11@unicode_co@2:
     lea    eax, DWORD PTR [esi+36]
     mov    DWORD PTR _data2$[ebp], eax
     mov    edx, eax
     jmp    SHORT $LN14@unicode_co@2
$LN13@unicode_co@2:
     mov    edx, DWORD PTR [esi+36]
$LN30@unicode_co@2:
     mov    DWORD PTR _data2$[ebp], edx
$LN14@unicode_co@2:

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

     mov    edi, DWORD PTR [ecx+8]

; 10427:     len2 = PyUnicode_GET_LENGTH(str2);

     mov    ecx, DWORD PTR [esi+8]

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

     xor    eax, eax
     mov    DWORD PTR _len1$[ebp], edi
     mov    DWORD PTR _len2$[ebp], ecx
     test    edi, edi
     jle    SHORT $LN2@unicode_co@2

; 10426:     len1 = PyUnicode_GET_LENGTH(str1);

     mov    esi, edx
     mov    edi, edx

; 10428:
; 10429:     for (i = 0; i < len1 && i < len2; ++i) {

     sub    ebx, edx
     jmp    SHORT $LN4@unicode_co@2
$LL28@unicode_co@2:
     mov    edx, DWORD PTR _data2$[ebp]
$LN4@unicode_co@2:
     cmp    eax, ecx
     jge    SHORT $LN29@unicode_co@2

; 10430:         Py_UCS4 c1, c2;
; 10431:         c1 = PyUnicode_READ(kind1, data1, i);

     mov    ecx, DWORD PTR _kind1$[ebp]
     cmp    ecx, 1
     jne    SHORT $LN17@unicode_co@2
     lea    ecx, DWORD PTR [ebx+eax]
     movzx    edx, BYTE PTR [ecx+edx]
     jmp    SHORT $LN16@unicode_co@2
$LN17@unicode_co@2:
     cmp    ecx, 2
     jne    SHORT $LN15@unicode_co@2
     movzx    edx, WORD PTR [ebx+edi]
     jmp    SHORT $LN16@unicode_co@2
$LN15@unicode_co@2:
     mov    edx, DWORD PTR [ebx+esi]
$LN16@unicode_co@2:

; 10432:         c2 = PyUnicode_READ(kind2, data2, i);

     mov    ecx, DWORD PTR _kind2$[ebp]
     cmp    ecx, 1
     jne    SHORT $LN21@unicode_co@2
     mov    ecx, DWORD PTR _data2$[ebp]
     movzx    ecx, BYTE PTR [eax+ecx]
     jmp    SHORT $LN20@unicode_co@2
$LN21@unicode_co@2:
     cmp    ecx, 2
     jne    SHORT $LN19@unicode_co@2
     movzx    ecx, WORD PTR [edi]
     jmp    SHORT $LN20@unicode_co@2
$LN19@unicode_co@2:
     mov    ecx, DWORD PTR [esi]
$LN20@unicode_co@2:

; 10433:
; 10434:         if (c1 != c2)

     cmp    edx, ecx
     jne    SHORT $LN31@unicode_co@2
     mov    ecx, DWORD PTR _len2$[ebp]
     inc    eax
     add    edi, 2
     add    esi, 4
     cmp    eax, DWORD PTR _len1$[ebp]
     jl    SHORT $LL28@unicode_co@2
$LN29@unicode_co@2:
     mov    edi, DWORD PTR _len1$[ebp]
$LN2@unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

     cmp    edi, ecx
     jge    SHORT $LN23@unicode_co@2
     pop    edi
     pop    esi
     or    eax, -1
     pop    ebx

; 10439: }

     mov    esp, ebp
     pop    ebp
     ret    0
$LN31@unicode_co@2:

; 10435:             return (c1 < c2) ? -1 : 1;

     sbb    eax, eax
     pop    edi
     and    eax, -2                    ; fffffffeH
     pop    esi
     inc    eax
     pop    ebx

; 10439: }

     mov    esp, ebp
     pop    ebp
     ret    0
$LN23@unicode_co@2:

; 10436:     }
; 10437:
; 10438:     return (len1 < len2) ? -1 : (len1 != len2);

     xor    eax, eax
     cmp    edi, ecx
     pop    edi
     pop    esi
     setne    al
     pop    ebx

; 10439: }

     mov    esp, ebp
     pop    ebp
     ret    0
_unicode_compare ENDP

    Neil


--
DaveA
--
http://mail.python.org/mailman/listinfo/python-list

Reply via email to