------- Comment #4 from ubizjak at gmail dot com  2009-07-04 12:43 -------
(In reply to comment #1)
> Can you check numbers with vectorization disabled?  I see the regression as
> well on an AMD Fam 10 machine which supposedly has unaligned moves as fast
> as aligned moves (if the data turns out to be aligned).  Which means the
> data really is unaligned.

Without vectorisation (adding -fno-tree-vectorize), there is no difference:

time ./a.out
  Benchmark running, hopefully as only ACTIVE task
Test1 - Gauss 2000 (101x101) inverts  2.2 sec  Err= 0.000000000000006
Test2 - Crout 2000 (101x101) inverts  2.5 sec  Err= 0.000000000000023
Test3 - Crout  2 (1001x1001) inverts  2.6 sec  Err= 0.000000000000031
Test4 - Lapack 2 (1001x1001) inverts  2.2 sec  Err= 0.000000000000250
                             total =  9.5 sec


real    0m9.760s
user    0m9.674s
sys     0m0.082s


> What is the difference in code generation?  Can you create a testcase from
> the hot loop(s)?


The first thing to catch my eye is missing CSE on memory references in the
_MAIN loop (added -march=barcelona to generate movupd insns):


.L14:
        movupd  equiv.0.1551(%rdx,%rax), %xmm8  #, tmp720
        movupd  a3.1557(%rdx,%rax), %xmm7       #, tmp722
        leaq    16(%rdx), %r10  #, tmp789
        movupd  equiv.0.1551(%r10,%rax), %xmm6  #, tmp867
        movupd  a3.1557(%r10,%rax), %xmm5       #, tmp869
        leaq    32(%rdx), %rcx  #, ivtmp.845
        subpd   %xmm8, %xmm7    # tmp720, tmp722
        movupd  equiv.0.1551(%rcx,%rax), %xmm3  #, tmp873
        movupd  a3.1557(%rcx,%rax), %xmm4       #, tmp875
        subpd   %xmm6, %xmm5    # tmp867, tmp869
        leaq    48(%rdx), %r9   #, ivtmp.845
        leaq    64(%rdx), %r8   #, ivtmp.845
        subpd   %xmm3, %xmm4    # tmp873, tmp875
        movupd  a3.1557(%r9,%rax), %xmm15       #, tmp881
        movupd  equiv.0.1551(%r9,%rax), %xmm2   #, tmp879
        movupd  a3.1557(%r8,%rax), %xmm13       #, tmp887
        movupd  equiv.0.1551(%r8,%rax), %xmm14  #, tmp885

and in the regressed case:

.L13:
        movupd  (%rdx,%rax), %xmm15     #* ivtmp.792, tmp962
        movupd  (%r12,%rax), %xmm14     #* ivtmp.792, tmp964
        leaq    16(%rax), %rsi  #, tmp1050
        movupd  (%rdx,%rsi), %xmm13     #, tmp1114
        movupd  (%r12,%rsi), %xmm12     #, tmp1116
        leaq    32(%rax), %r15  #, ivtmp.792
        subpd   %xmm15, %xmm14  # tmp962, tmp964
        movupd  (%rdx,%r15), %xmm11     #* ivtmp.792, tmp1120
        movupd  (%r12,%r15), %xmm10     #* ivtmp.792, tmp1122
        subpd   %xmm13, %xmm12  # tmp1114, tmp1116
        leaq    48(%rax), %rcx  #, ivtmp.792
        leaq    64(%rax), %r13  #, ivtmp.792
        subpd   %xmm11, %xmm10  # tmp1120, tmp1122
        movupd  (%r12,%rcx), %xmm8      #* ivtmp.792, tmp1128
        movupd  (%rdx,%rcx), %xmm9      #* ivtmp.792, tmp1126
        movupd  (%r12,%r13), %xmm6      #* ivtmp.792, tmp1134
        movupd  (%rdx,%r13), %xmm7      #* ivtmp.792, tmp1132


-- 


http://gcc.gnu.org/bugzilla/show_bug.cgi?id=40648

Reply via email to