http://d.puremagic.com/issues/show_bug.cgi?id=4393



--- Comment #1 from bearophile_h...@eml.cc 2011-04-28 18:55:10 PDT ---
Some code I have written for arrays of floats:


float dot(float[] a, float[] b) {
    enum size_t UNROLL_MASK = 0b111;
    assert(a.length == b.length, "dot(): the two array lengths differ.");

    typeof(a[0]) tot = void;
    auto a_ptr = a.ptr;
    auto b_ptr = b.ptr;

    assert((cast(size_t)a_ptr & cast(size_t)0b1111) == 0,
           "dot(): the first array is not aligned to 16 bytes.");
    assert((cast(size_t)b_ptr & cast(size_t)0b1111) == 0,
           "dot(): the second array is not aligned to 16 bytes.");

    size_t len = (a.length & (~UNROLL_MASK)) * a[0].sizeof;

    if (len) {
        asm {
            mov EAX, a_ptr;
            mov ECX, b_ptr;
            mov EDI, len;
            xorps XMM0, XMM0;
            xorps XMM3, XMM3;
            xor EDX, EDX;

            align 8;
          LOOP_START:
            movaps XMM1, [EAX + EDX];
            mulps XMM1,  [ECX + EDX];
            movaps XMM2, [EAX + EDX + 16];
            mulps XMM2,  [ECX + EDX + 16];
            add EDX, 32;
            addps XMM0, XMM1;
            cmp EDI, EDX;
            addps XMM3, XMM2;
            jne LOOP_START;

            addps XMM0, XMM3;

            // XMM0[0] = XMM0[0] + XMM0[1] + XMM0[2] + XMM0[3]
            movhlps XMM1, XMM0;
            addps XMM0, XMM1;
            pshufd XMM1, XMM0, 0b01_01_01_01;
            addss XMM0, XMM1;

            movss tot, XMM0;
        }
    } else
        tot = 0.0;

    size_t left = a.length & UNROLL_MASK;
    for (size_t i = left; i > 0; i--)
        tot += a[$ - i] * b[$ - i];
    return tot;
}



And for arrays of doubles:

import std.c.stdio;

void main() {
    double[] A = [0.7644108908809033, 0.96458177717869509,
                  0.51166055832523683, 0.098840360055908461,
                  0.40813780586233483, 0.32285857447088551,
                  0.59137950751990331, 0.59518178287473289];
    double[] B = [0.28061374331187983, 0.85036446787626108,
                  0.52498124274748326, 0.84170745998075014,
                  0.55819169392258683, 0.62586351111477134,
                  0.43021720539707864, 0.652708603473523];

    // >>> sum(a*b for a,b in zip(A, B))
    // 2.4593435789602185

    if (A.length % 4 != 0) throw new Error("");
    double tot1 = void,
           tot2 = void;
    auto a_ptr = cast(double*)A.ptr;
    auto b_ptr = cast(double*)B.ptr;
    size_t len = A.length * double.sizeof;

    asm {
        mov EAX, a_ptr;
        mov ECX, b_ptr;
        mov EDI, len;
        xorps XMM0, XMM0;
        xorps XMM3, XMM3;
        xor EDX, EDX;

        align 8;
      LOOP_START:
        movapd XMM1, [EAX + EDX];
        mulpd XMM1,  [ECX + EDX];
        movapd XMM2, [EAX + EDX + 16];
        mulpd XMM2,  [ECX + EDX + 16];
        add EDX, 32;
        addpd XMM0, XMM1;
        cmp EDI, EDX;
        addpd XMM3, XMM2;
        jne LOOP_START;

        addpd XMM0, XMM3; // XMM0 += XMM3

        movhpd tot1, XMM0;
        movlpd tot2, XMM0;
    }

    printf("%.15lf\n", tot1 + tot2);
}



See bug 5880 too.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------

Reply via email to