On Sat, 27 Nov 2010 22:08:29 -0500, bearophile <bearophileh...@lycos.com> wrote:
I have done another test:

Timings, dmd compiler, best of 4, seconds:
  D #1: 5.72
  D #4: 1.84
  D #5: 1.73
  Psy:  1.59
  D #2: 0.55
  D #6: 0.47
  D #3: 0.34


import std.file: read;
import std.c.stdio: printf;

int test(char[] data) {
    int count;
    foreach (i; 0 ..  data.length - 3) {
        char[] codon = data[i .. i + 3];
if ((codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' && codon[2] == 'G') || (codon.length == 3 && codon[0] == 'T' && codon[1] == 'G' && codon[2] == 'A') || (codon.length == 3 && codon[0] == 'T' && codon[1] == 'A' && codon[2] == 'A'))
            count++;
    }
    return count;
}

void main() {
    char[] data0 = cast(char[])read("data.txt");
    int n = 300;
    char[] data = new char[data0.length * n];
    for (size_t pos; pos < data.length; pos += data0.length)
        data[pos .. pos+data0.length] = data0;

    printf("%d\n", test(data));
}


So when there is to compare among strings known at compile-time to be small (like < 6 char), the comparison shall be replaced with inlined single char comparisons. This makes the code longer so it increases code cache pressure, but seeing how much slow the alternative is, I think it's an improvement.

(A smart compiler is even able to remove the codon.length==3 test because the slice data[i..i+3] is always of length 3 (here mysteriously if you remove those three length tests the program compiled with dmd gets slower)).

Bye,
bearophile

Hi bearophile,
I've spent some time having fun this afternoon optimizing array-equals using vectorization techniques. I found that vectorizing using ulongs worked best on my system except with the shortest strings, where a simple Duff's device edged it out. If you'd like to try it out on your data set:


bool arrayComp3(bool useBitCompare = true,T)(const T[] a, const T[] b) pure nothrow {
    if(a.length != b.length) return false;

    static if(useBitCompare) {
        auto pab = cast(ubyte*)a.ptr;
        auto pbb = cast(ubyte*)b.ptr;
        if(pab is pbb) return true;

        auto byte_length = a.length*T.sizeof;
        auto pa_end      = cast(ulong*)(pab+byte_length);
            final switch (byte_length % ulong.sizeof) {
            case 7:             if(*pab++ != *pbb++ ) return false;
            case 6:             if(*pab++ != *pbb++ ) return false;
            case 5:             if(*pab++ != *pbb++ ) return false;
            case 4:             if(*pab++ != *pbb++ ) return false;
            case 3:             if(*pab++ != *pbb++ ) return false;
            case 2:             if(*pab++ != *pbb++ ) return false;
            case 1:             if(*pab++ != *pbb++ ) return false;
            case 0:
            }
        auto pa      = cast(ulong*)pab;
        auto pb      = cast(ulong*)pbb;
        while (pa < pa_end) {
            if(*pa++ != *pb++ ) return false;
        }
} else { // default to a short duff's device
        auto pa = a.ptr;
        auto pb = b.ptr;
        if(pa is pb) return true;
            auto n  = (a.length + 3) / 4;
            final switch (a.length % 4) {
            case 0:        do {  if(*pa++ != *pb++ ) return false;
            case 3:              if(*pa++ != *pb++ ) return false;
            case 2:              if(*pa++ != *pb++ ) return false;
            case 1:              if(*pa++ != *pb++ ) return false;
                      } while (--n > 0);
            }    }
    return true;
}

Reply via email to