On 12/23/2016 10:03 AM, hardreset wrote:
enum SIZE = 100000000;

void foo3(int* a, int* b)
{
    asm
    {
        mov   EAX,a;
        mov   EDX,b;
        sub   EDX,EAX;
        mov   ECX,EAX;
        add   ECX,SIZE*4;
    l1:;
        cmp   EAX,ECX;
        jae   done;
        mov   EBX,[EAX];
        and   EBX,[EAX+EDX];
        mov   [EAX],EBX;
        add   EAX,4;
        jmp l1;
    done:;
    }
}

I get..

456ms for unconditional
505ms for conditional
268ms for assembler

So the asm is about 40% faster than the best of the alternatives. The point
being that it's the generated code not the source that's the problem.

For this D code:

enum SIZE = 100000000;

void foo(int* a, int* b) {
    int* atop = a + 1000;
    ptrdiff_t offset = b - a;
    for (; a < atop; ++a)
        *a &= *(a + offset);
}

The following asm is generated by DMD:

                push    EBX
                mov     EBX,8[ESP]
                sub     EAX,EBX
                push    ESI
                cdq
                and     EDX,3
                add     EAX,EDX
                sar     EAX,2
                lea     ECX,0FA0h[EBX]
                mov     ESI,EAX
                cmp     EBX,ECX
                jae     L2C
L20:            mov     EDX,[ESI*4][EBX]
                and     [EBX],EDX
                add     EBX,4
                cmp     EBX,ECX
                jb      L20
L2C:            pop     ESI
                pop     EBX
                ret     4

The inner loop is 5 instructions, whereas the one you wrote is 7 instructions (I didn't benchmark it). With some more source code manipulation the divide can be eliminated, but that is irrelevant to the inner loop.

Reply via email to