http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58756
--- Comment #2 from Uroš Bizjak <ubizjak at gmail dot com> --- Some unscientific printf debugging yields the following runtime difference: foo (int a, int b) { int j, c = 0; #pragma omp parallel for reduction(+: c) for (j = 0; j < a; j += b) { int l; #pragma omp simd reduction(+: c) for (l = 0; l < b; ++l) c += d[j + l]; printf ("%i\n", c); } printf ("tot %i\n", c); return c; } alpha: $ ./a.out 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 496 tot 1984 Aborted Alpha spawned 3 new threads (as reported by gdb), so 4 * 496 = 1984. x86_64: $ ./pr58392.exe 496 992 1488 496 992 1488 496 992 496 992 1488 496 992 1488 496 992 1488 496 992 496 992 1488 496 992 496 992 1488 496 992 496 992 1488 tot 15872