Hi!

Compiling the following testcase with -O2 -fopenmp:

int a[10000][128];

__attribute__((noipa)) void
foo (void)
{
  #pragma omp for simd schedule (simd: dynamic, 32) collapse(2)
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      a[i][j] += 3;
}

int
main ()
{
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      {
        asm volatile ("" : : "r" (&a[0][0]) : "memory");
        a[i][j] = i + j;
      }
  foo ();
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      if (a[i][j] != i + j + 3)
        __builtin_abort ();
  return 0;
}

doesn't seem to result in the vectorization I was hoping to see. As has been changed recently, I'm now only trying to vectorize the innermost loop of the collapse, with the outer loops around it being normal scalar loops like those written in the source. With only omp simd it works fine, but for the combined constructs the current thread gets assigned some range of logical iterations, and therefore I get a pair of starting values, in this case for i and j.

At the end of ompexp I have:

  ...
  D.2106 = (unsigned int) D.2105;
  D.2107 = MIN_EXPR <D.2104, D.2106>;
  D.2103 = D.2107 + .iter.4;
  goto <bb 5>; [INV]
;;    succ:       5

;;   basic block 4, loop depth 2
;;    pred:       5
  i = i.0;
  j = j.1;
  _1 = a[i][j];
  _2 = _1 + 3;
  a[i][j] = _2;
  .iter.4 = .iter.4 + 1;
  j.1 = j.1 + 1;
;;    succ:       5

;;   basic block 5, loop depth 2
;;    pred:       4
;;                3
;;                7
  if (.iter.4 < D.2103)
    goto <bb 4>; [87.50%]
  else
    goto <bb 6>; [12.50%]
;;    succ:       4
;;                6

;;   basic block 6, loop depth 2
;;    pred:       5
  i.0 = i.0 + 1;
  if (i.0 < 10000)
    goto <bb 7>; [87.50%]
  else
    goto <bb 8>; [12.50%]
;;    succ:       8
;;                7

;;   basic block 7, loop depth 2
;;    pred:       6
  j.1 = 0;
  D.2108 = D.2099 - .iter.4;
  D.2109 = MIN_EXPR <D.2108, 128>;
  D.2103 = D.2109 + .iter.4;
  goto <bb 5>; [INV]

I was really hoping bbs 4 and 5 would be one loop (the one I set safelen and force_vectorize etc. for) and that basic blocks 6 and 7 would, together with that inner loop, form another loop, but apparently loop discovery thinks it is just one loop.
Any ideas what I'm doing wrong, or is there any way to make it two loops (that would also survive all the cfg cleanups until vectorization)?

Essentially, in C I'm trying to have:

int a[10000][128];
void get_me_start_end (int *, int *);

void
foo (void)
{
  int start, end, curend, i, j;
  get_me_start_end (&start, &end);
  i = start / 128;
  j = start % 128;
  curend = start + (end - start > 128 - j ? 128 - j : end - start);
  goto doit;
  for (i = 0; i < 10000; i++)
    {
      j = 0;
      curend = start + (end - start > 128 ? 128 : end - start);
      doit:;
      /* I'd use start < curend && j < 128 as condition here, but
         the vectorizer doesn't like that either.  So I went to using
         a single IV.  */
      for (; start < curend; start++, j++)
        a[i][j] += 3;
    }
}

This isn't vectorized with -O3 either, for the same reason.

	Jakub