Hi!

Compiling the following testcase with -O2 -fopenmp:
int a[10000][128];

/* Add 3 to every element of a.  The noipa attribute disables
   interprocedural optimizations between foo and its callers, so the
   loop nest below is compiled on its own.  */
__attribute__((noipa)) void
foo (void)
{
  /* Combined worksharing + simd construct: collapse(2) turns the i and j
     loops into one logical iteration space, which is handed out with
     dynamic scheduling and a requested chunk size of 32 (the simd:
     modifier asks for chunks rounded to the simd width).  */
  #pragma omp for simd schedule (simd: dynamic, 32) collapse(2)
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      a[i][j] += 3;
}

/* Test driver: fill a with known values, run foo, and abort if any
   element was not incremented by exactly 3.  */
int
main ()
{
  /* Initialize a[i][j] to i + j.  */
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      {
        /* Empty asm with a "memory" clobber that takes the address of a
           as input; presumably a barrier to keep the compiler from
           optimizing the initialization loop.  */
        asm volatile ("" : : "r" (&a[0][0]) : "memory");
        a[i][j] = i + j;
      }
  foo ();
  /* Every element must now be its initial value plus 3.  */
  for (int i = 0; i < 10000; i++)
    for (int j = 0; j < 128; j++)
      if (a[i][j] != i + j + 3)
        __builtin_abort ();
  return 0;
}
doesn't seem to result in the vectorization I was hoping to see.
As has been changed recently, I now only try to vectorize the
innermost loop of the collapse, with the outer loops around it being
normal scalar loops like those written in the source.  With only omp simd
it works fine, but for the combined constructs the current thread gets
assigned some range of logical iterations, and therefore I get a pair of
starting values (in this case for i and j).

At the end of ompexp I have:
...
  D.2106 = (unsigned int) D.2105;
  D.2107 = MIN_EXPR <D.2104, D.2106>;
  D.2103 = D.2107 + .iter.4;
  goto <bb 5>; [INV]
;;    succ:       5

;;   basic block 4, loop depth 2
;;    pred:       5
  i = i.0;
  j = j.1;
  _1 = a[i][j];
  _2 = _1 + 3;
  a[i][j] = _2;
  .iter.4 = .iter.4 + 1;
  j.1 = j.1 + 1;
;;    succ:       5

;;   basic block 5, loop depth 2
;;    pred:       4
;;                3
;;                7
  if (.iter.4 < D.2103)
    goto <bb 4>; [87.50%]
  else
    goto <bb 6>; [12.50%]
;;    succ:       4
;;                6

;;   basic block 6, loop depth 2
;;    pred:       5
  i.0 = i.0 + 1;
  if (i.0 < 10000)
    goto <bb 7>; [87.50%]
  else
    goto <bb 8>; [12.50%]
;;    succ:       8
;;                7

;;   basic block 7, loop depth 2
;;    pred:       6
  j.1 = 0;
  D.2108 = D.2099 - .iter.4;
  D.2109 = MIN_EXPR <D.2108, 128>;
  D.2103 = D.2109 + .iter.4;
  goto <bb 5>; [INV]

I was really hoping bbs 4 and 5 would be one loop (the one I set safelen
and force_vectorize etc. for) and that basic blocks 6 and 7, together
with that inner loop, would form another (outer) loop, but apparently
loop discovery thinks it is all just one loop.
Any ideas what I'm doing wrong, or is there any way to make it two loops
(that would also survive all the cfg cleanups until vectorization)?

Essentially, in C I'm trying to have:
int a[10000][128];
void get_me_start_end (int *, int *);
/* C sketch of the loop structure the OpenMP expansion is aiming for.
   The logical iterations [start, end) of the collapsed i/j space are
   walked row by row, with the inner loop driven by the single induction
   variable start.  get_me_start_end is only declared here; presumably it
   supplies the range assigned to the current thread.  */
void
foo (void)
{
  int start, end, curend, i, j;
  get_me_start_end (&start, &end);
  /* Split the first logical iteration number into row i and column j.  */
  i = start / 128;
  j = start % 128;
  /* The first row may be entered mid-way: run to the end of the row
     (128 - j iterations) or to end, whichever comes first.  */
  curend = start + (end - start > 128 - j ? 128 - j : end - start);
  /* Jump into the outer loop body, skipping both its i = 0 initialization
     and the per-row setup, which was already done above for the first
     row.  */
  goto doit;
  for (i = 0; i < 10000; i++)
    {
      /* Each later row starts at column 0 and covers at most 128
         iterations, or fewer if end is reached first.  */
      j = 0;
      curend = start + (end - start > 128 ? 128 : end - start);
      doit:;
      /* I'd use start < curend && j < 128 as condition here, but
         the vectorizer doesn't like that either.  So I went to
         using a single IV.  */
      for (; start < curend; start++, j++)
        a[i][j] += 3;
    }
}

This isn't vectorized with -O3 either for the same reason.

        Jakub

Reply via email to