Hi Pauli, I have tried this on my PC and some of the regressions disappear; maybe you can give it a try. In its present state it is compiler- and architecture-dependent, and therefore not the best choice, but it may be worth trying. Best, Luca
/* My additions are unindented */ /* * "Vectorized" reduction along an axis * * Evaluating the inner loop in smaller blocks interleaved with the * reduction loop aims to avoid cache misses in the loop->ret array. */ { typedef unsigned long long ticks; __inline__ ticks getticks(void) { unsigned a, d; /* return clock();*/ /* asm("cpuid");*/ asm volatile("rdtsc" : "=a" (a), "=d" (d)); return (((ticks)a) | (((ticks)d) << 32)); } npy_intp new_block_size; ticks t0, t1; int delta = 8; int speed, speed_p; /*t0 = getticks(); t0 = getticks();*/ t0 = getticks(); speed_p = 0.; block_size = 2 + (loop->bufsize / loop->outsize / 2); new_block_size = block_size; /*printf("was %d", block_size);*/ for (k = 0; k < loop->size; k += block_size) { char *bufptr[3]; block_size = new_block_size; /*printf(" then %d (speed_p %d)", block_size, speed_p);*/ bufptr[0] = loop->bufptr[0] + k * loop->steps[0]; bufptr[1] = loop->bufptr[1] + k * loop->steps[1]; bufptr[2] = loop->bufptr[2] + k * loop->steps[2]; if (k + block_size > loop->size) { block_size = loop->size - k; } for (i = i0; i <= loop->N; ++i) { bufptr[1] += loop->instrides; loop->function((char **)bufptr, &block_size, loop->steps, loop->funcdata); UFUNC_CHECK_ERROR(loop); } t1 = getticks(); speed = (block_size << 12) / (t1 - t0); if (speed < speed_p) delta = -delta; new_block_size = (1 + ((block_size * (128 + delta)) >> 10)) << 3; speed_p = speed; t0 = t1; } /*printf(" is %d (speed_p %d)\n", block_size, speed_p);*/ } PyArray_ITER_NEXT(loop->it); PyArray_ITER_NEXT(loop->rit); } _______________________________________________ NumPy-Discussion mailing list NumPy-Discussion@scipy.org http://mail.scipy.org/mailman/listinfo/numpy-discussion