On 10/2/20 3:21 PM, Tom de Vries wrote:
> Hi,
> 
> Consider the test-case libgomp.c/pr81778.c added in this commit, with
> this core loop (note: CANARY_SIZE set to 0 for simplicity):
> ...
>   int s = 1;
>   #pragma omp target simd
>   for (int i = N - 1; i > -1; i -= s)
>     a[i] = 1;
> ...
> which, given that N is 32, sets a[0..31] to 1.
> 
> After omp-expand, this looks like:
> ...
>   <bb 5> :
>   simduid.7 = .GOMP_SIMT_ENTER (simduid.7);
>   .omp_simt.8 = .GOMP_SIMT_ENTER_ALLOC (simduid.7);
>   D.3193 = -s;
>   s.9 = s;
>   D.3204 = .GOMP_SIMT_LANE ();
>   D.3205 = -s.9;
>   D.3206 = (int) D.3204;
>   D.3207 = D.3205 * D.3206;
>   i = D.3207 + 31;
>   D.3209 = 0;
>   D.3210 = -s.9;
>   D.3211 = D.3210 - i;
>   D.3210 = -s.9;
>   D.3212 = D.3211 / D.3210;
>   D.3213 = (unsigned int) D.3212;
>   D.3213 = i >= 0 ? D.3213 : 0;
> 
>   <bb 19> :
>   if (D.3209 < D.3213)
>     goto <bb 6>; [87.50%]
>   else
>     goto <bb 7>; [12.50%]
> 
>   <bb 6> :
>   a[i] = 1;
>   D.3215 = -s.9;
>   D.3219 = .GOMP_SIMT_VF ();
>   D.3216 = (int) D.3219;
>   D.3220 = D.3215 * D.3216;
>   i = D.3220 + i;
>   D.3209 = D.3209 + 1;
>   goto <bb 19>; [100.00%]
> ...
> 
> On nvptx, the first time bb6 is executed, i is in the 0..31 range (depending
> on the lane that is executing) at bb entry.
> 
> So we have the following sequence:
> - a[0..31] is set to 1
> - i is updated to -32..-1
> - D.3209 is updated to 1 (being 0 initially)
> - bb19 is executed, and if condition (D.3209 < D.3213) == (1 < 32) evaluates
>   to true
> - bb6 is once more executed, which should not happen because all the elements
>   that needed to be handled were already handled.
> - consequently, elements that should not be written are written
> - with CANARY_SIZE == 0, we may run into a libgomp error:
>   ...
>   libgomp: cuCtxSynchronize error: an illegal memory access was encountered
>   ...
>   and with CANARY_SIZE unmodified, we run into:
>   ...
>   Expected 0, got 1 at base[-961]
>   Aborted (core dumped)
>   ...
> 
> The cause of this is as follows:
> - because the step s is a variable rather than a constant, an alternative
>   IV (D.3209 in our example) is generated in expand_omp_simd, and the
>   loop condition is tested in terms of the alternative IV rather than
>   the original IV (i in our example).
> - the SIMT code in expand_omp_simd works by modifying step and initial value.
> - The initial value fd->loop.n1 is loaded into a variable n1, which is
>   modified by the SIMT code and then used there-after.
> - The step fd->loop.step is loaded into a variable step, which is is modified
>   by the SIMT code, but afterwards there are uses of both step and
>   fd->loop.step.
> - There are uses of fd->loop.step in the alternative IV handling code,
>   which should use step instead.
> 
> Fix this by introducing an additional variable orig_step, which is not
> modified by the SIMT code and replacing all remaining uses of fd->loop.step
> by either step or orig_step.
> 
> Build on x86_64-linux with nvptx accelerator, tested libgomp.
> 
> This fixes for-5.c and for-6.c FAILs I'm currently seeing on a quadro m1200
> with driver 450.66.
> 
> OK for trunk?
> 

Ping.

Thanks,
- Tom

> [omp, simt] Handle alternative IV
> 
> gcc/ChangeLog:
> 
> 2020-10-02  Tom de Vries  <tdevr...@suse.de>
> 
>       * omp-expand.c (expand_omp_simd): Add step_orig, and replace uses of
>       fd->loop.step by either step or orig_step.
> 
> libgomp/ChangeLog:
> 
> 2020-10-02  Tom de Vries  <tdevr...@suse.de>
> 
>       * testsuite/libgomp.c/pr81778.c: New test.
> 
> ---
>  gcc/omp-expand.c                      | 11 ++++----
>  libgomp/testsuite/libgomp.c/pr81778.c | 48 
> +++++++++++++++++++++++++++++++++++
>  2 files changed, 54 insertions(+), 5 deletions(-)
> 
> diff --git a/gcc/omp-expand.c b/gcc/omp-expand.c
> index 99cb4f9dda4..80e35ac0294 100644
> --- a/gcc/omp-expand.c
> +++ b/gcc/omp-expand.c
> @@ -6307,6 +6307,7 @@ expand_omp_simd (struct omp_region *region, struct 
> omp_for_data *fd)
>        n2 = OMP_CLAUSE_DECL (innerc);
>      }
>    tree step = fd->loop.step;
> +  tree orig_step = step; /* May be different from step if is_simt.  */
>  
>    bool is_simt = omp_find_clause (gimple_omp_for_clauses (fd->for_stmt),
>                                 OMP_CLAUSE__SIMT_);
> @@ -6455,7 +6456,7 @@ expand_omp_simd (struct omp_region *region, struct 
> omp_for_data *fd)
>    tree altv = NULL_TREE, altn2 = NULL_TREE;
>    if (fd->collapse == 1
>        && !broken_loop
> -      && TREE_CODE (fd->loops[0].step) != INTEGER_CST)
> +      && TREE_CODE (orig_step) != INTEGER_CST)
>      {
>        /* The vectorizer currently punts on loops with non-constant steps
>        for the main IV (can't compute number of iterations and gives up
> @@ -6471,7 +6472,7 @@ expand_omp_simd (struct omp_region *region, struct 
> omp_for_data *fd)
>       itype = signed_type_for (itype);
>        t = build_int_cst (itype, (fd->loop.cond_code == LT_EXPR ? -1 : 1));
>        t = fold_build2 (PLUS_EXPR, itype,
> -                    fold_convert (itype, fd->loop.step), t);
> +                    fold_convert (itype, step), t);
>        t = fold_build2 (PLUS_EXPR, itype, t, fold_convert (itype, n2));
>        t = fold_build2 (MINUS_EXPR, itype, t,
>                      fold_convert (itype, fd->loop.v));
> @@ -6479,10 +6480,10 @@ expand_omp_simd (struct omp_region *region, struct 
> omp_for_data *fd)
>       t = fold_build2 (TRUNC_DIV_EXPR, itype,
>                        fold_build1 (NEGATE_EXPR, itype, t),
>                        fold_build1 (NEGATE_EXPR, itype,
> -                                   fold_convert (itype, fd->loop.step)));
> +                                   fold_convert (itype, step)));
>        else
>       t = fold_build2 (TRUNC_DIV_EXPR, itype, t,
> -                      fold_convert (itype, fd->loop.step));
> +                      fold_convert (itype, step));
>        t = fold_convert (TREE_TYPE (altv), t);
>        altn2 = create_tmp_var (TREE_TYPE (altv));
>        expand_omp_build_assign (&gsi, altn2, t);
> @@ -6630,7 +6631,7 @@ expand_omp_simd (struct omp_region *region, struct 
> omp_for_data *fd)
>    if (is_simt)
>      {
>        gsi = gsi_start_bb (l2_bb);
> -      step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), fd->loop.step, step);
> +      step = fold_build2 (MINUS_EXPR, TREE_TYPE (step), orig_step, step);
>        if (POINTER_TYPE_P (type))
>       t = fold_build_pointer_plus (fd->loop.v, step);
>        else
> diff --git a/libgomp/testsuite/libgomp.c/pr81778.c 
> b/libgomp/testsuite/libgomp.c/pr81778.c
> new file mode 100644
> index 00000000000..571668eb36a
> --- /dev/null
> +++ b/libgomp/testsuite/libgomp.c/pr81778.c
> @@ -0,0 +1,48 @@
> +/* Minimized from for-5.c.  */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +
> +/* Size of array we want to write.  */
> +#define N 32
> +
> +/* Size of extra space before and after.  */
> +#define CANARY_SIZE (N * 32)
> +
> +/* Start of array we want to write.  */
> +#define BASE (CANARY_SIZE)
> +
> +// Total size to be allocated.
> +#define ALLOC_SIZE (CANARY_SIZE + N + CANARY_SIZE)
> +
> +#pragma omp declare target
> +int a[ALLOC_SIZE];
> +#pragma omp end declare target
> +
> +int
> +main (void)
> +{
> +  /* Use variable step in for loop.  */
> +  int s = 1;
> +
> +#pragma omp target update to(a)
> +
> +  /* Write a[BASE] .. a[BASE + N - 1].  */
> +#pragma omp target simd
> +  for (int i = N - 1; i > -1; i -= s)
> +    a[BASE + i] = 1;
> +
> +#pragma omp target update from(a)
> +
> +  for (int i = 0; i < ALLOC_SIZE; i++)
> +    {
> +      int expected = (BASE <= i && i < BASE + N) ? 1 : 0;
> +      if (a[i] == expected)
> +     continue;
> +
> +      printf ("Expected %d, got %d at base[%d]\n", expected, a[i], i - BASE);
> +      abort ();
> +    }
> +
> +  return 0;
> +}
> 

Reply via email to