------- Comment #26 from victork at gcc dot gnu dot org 2008-02-11 13:41 ------- The small difference between the vectorized and non-vectorized versions can probably be explained by the fact that the big arrays do not fit in the memory cache. Here is a version of the original program which shows that a more-than-twofold difference remains for long runs as well, once the arrays are shrunk to fit in the cache:
#include <iostream>
#include <stdio.h>
#include <stdlib.h>

typedef float ARRTYPE;

/* Element count of each of the three work arrays. */
enum { ARRAY_LEN = 100000 };

/*
 * Vectorization micro-benchmark.
 *
 * Usage: <prog> <nSamples> <itBegin> <itEnd> <iSizeMain>
 *
 *   nSamples   the accumulation kernel is repeated (nSamples - 5) times
 *   itBegin    first array index touched by the kernel; also used as the
 *              divisor/dividend when filling the input arrays, so it must
 *              be >= 1
 *   itEnd      one past the last array index touched; must be <= ARRAY_LEN
 *   iSizeMain  parsed but not used by this reduced test case
 *
 * Returns EXIT_FAILURE on bad arguments or allocation failure, 0 otherwise.
 */
int main ( int argc, char *argv[] )
{
  if ( argc < 5 )
  {
    fprintf( stderr, "usage: %s <nSamples> <itBegin> <itEnd> <iSizeMain>\n",
             argc > 0 ? argv[0] : "prog" );
    return EXIT_FAILURE;
  }

  int m_nSamples = atoi( argv[1] );
  int itBegin = atoi( argv[2] );
  int itEnd = atoi( argv[3] );
  int iSizeMain = atoi( argv[ 4 ] );
  (void) iSizeMain; // kept so the command line stays the same as before

  // itBegin is a divisor below and a starting index into the arrays;
  // itEnd bounds the indices. Reject values that would divide by zero
  // or run outside the ARRAY_LEN-element buffers.
  if ( itBegin < 1 || itEnd > ARRAY_LEN )
  {
    fprintf( stderr, "itBegin must be >= 1 and itEnd must be <= %d\n",
             (int) ARRAY_LEN );
    return EXIT_FAILURE;
  }

  ARRTYPE *pSum1 = (ARRTYPE*) malloc( sizeof(ARRTYPE) * ARRAY_LEN );
  ARRTYPE *pSum = (ARRTYPE*) malloc( sizeof(ARRTYPE) * ARRAY_LEN );
  // The kernel accumulates into pVec1 with +=, so it has to start out
  // zeroed; calloc guarantees that, plain malloc does not.
  ARRTYPE *pVec1 = (ARRTYPE*) calloc( ARRAY_LEN, sizeof(ARRTYPE) );
  if ( pSum1 == NULL || pSum == NULL || pVec1 == NULL )
  {
    fprintf( stderr, "out of memory\n" );
    free( pVec1 );
    free( pSum );
    free( pSum1 );
    return EXIT_FAILURE;
  }

  // Fill the inputs. Both quotients are integer divisions whose result is
  // then converted to ARRTYPE.
  for ( int it = 0; it < ARRAY_LEN; it++ )
  {
    pSum[ it ] = it / itBegin;
    pSum1[ it ] = itBegin / ( it + 1 );
  }

  // Timed kernel: the inner loop is the one the vectorizer is expected
  // to transform.
  for ( int i = 0; i < m_nSamples - 5; i++ )
  {
    for ( int it = itBegin; it < itEnd; it++ )
      pVec1[ it ] += pSum[ it ] + pSum1[ it ];
  }

  free( pVec1 );
  free( pSum );
  free( pSum1 );
  return 0;
}

/*
[EMAIL PROTECTED]:~> $g -O3 -fdump-tree-vect-details -fno-tree-vectorize -m64 -o mnovec m.c
[EMAIL PROTECTED]:~> $g -O3 -fdump-tree-vect-details -ftree-vectorize -maltivec -m64 -o mvec m.c
[EMAIL PROTECTED]:~> time ./mnovec 400000 1 29720 1000
real 0m24.493s
user 0m24.483s
sys 0m0.007s
[EMAIL PROTECTED]:~> time ./mvec 400000 1 29720 1000
real 0m10.777s
user 0m10.771s
sys 0m0.005s

-- Victor
-- http://gcc.gnu.org/bugzilla/show_bug.cgi?id=35117
*/