The complex/float && and || reduction patch missed a target testcase (→ attached) which revealed that also a SIMT needed some special handling, but just runs on non-SIMT systems.
The omp-low.c patch is rather simple - and I think it semantically okay. [Note to the change: It looks more completed than it is: - moving 'zero' decl out of the 'if' block - moving that if block before the 'if (sctx.is_simt)' block - 'if (is_fp_and_or)' to the 'if (sctx.is_simt)' block.] I think at least the testcase should be added, possibly also the omp-low.c change – albeit I get a later ICE (see below), which needs either an XFAIL or a fix. * * * ICE with NVPTX: When the device lto1 starts, it fails when expanding the intrinsic XCHG_BFLY function. We have 'ivar' = complex float, which at rtx level is converted to a concatenation (via gen_reg_rtx()). In omp-low.c: IFN_GOMP_SIMT_XCHG_BFLY (TREE_TYPE(ivar), ivar, simt_lane) Later in expand_GOMP_SIMT_XCHG_BFLY, we call: 371 expand_insn (targetm.code_for_omp_simt_xchg_bfly, 3, ops); which fails by running into unreachable of 'expand_insn' 7844 if (!maybe_expand_insn (icode, nops, ops)) 7845 gcc_unreachable (); icode = CODE_FOR_omp_simt_xchg_bfly nops = 3 (gdb) p ops[0]->type $3 = EXPAND_OUTPUT (gdb) p debug(ops[0]->value) (concat:SC (reg:SF 85) (reg:SF 86)) (gdb) p ops[1]->type $5 = EXPAND_INPUT (gdb) p debug(ops[1]->value) (concat:SC (reg:SF 26 [ orfc ]) (reg:SF 27 [ orfc+4 ])) (gdb) p ops[2]->type $7 = EXPAND_INPUT (gdb) p debug(ops[2]->value) (reg:SI 52 [ _74 ]) The mentioned concat happens in How to fix this? Or does this fall into the same category as PR100321 (fixed by: r12-395, Disable SIMT for user-defined reduction) with its follow-up PR 100408? Small testcase is: _Complex float rcf[1024]; int reduction_or () { _Complex float orfc = 0; for (int i=0; i < 1024; ++i) orfc = orfc || rcf[i]; return __real__ orfc; } Tobias ----------------- Mentor Graphics (Deutschland) GmbH, Arnulfstrasse 201, 80634 München Registergericht München HRB 106955, Geschäftsführer: Thomas Heurung, Frank Thürauf
OpenMP: Fix SIMT for complex/float reduction with && and || gcc/ChangeLog: * omp-low.c (lower_rec_input_clauses): Also handle SIMT part for complex/float recution with && and ||. libgomp/ChangeLog: * testsuite/libgomp.c-c++-common/reduction-5.c: New test, testing complex/floating-point || + && recduction with 'omp target'. gcc/omp-low.c | 30 ++-- .../testsuite/libgomp.c-c++-common/reduction-5.c | 192 +++++++++++++++++++++ 2 files changed, 210 insertions(+), 12 deletions(-) diff --git a/gcc/omp-low.c b/gcc/omp-low.c index 26ceaf7..46220c5 100644 --- a/gcc/omp-low.c +++ b/gcc/omp-low.c @@ -6432,28 +6432,34 @@ lower_rec_input_clauses (tree clauses, gimple_seq *ilist, gimple_seq *dlist, gimplify_assign (unshare_expr (ivar), x, &llist[0]); - if (sctx.is_simt) - { - if (!simt_lane) - simt_lane = create_tmp_var (unsigned_type_node); - x = build_call_expr_internal_loc - (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY, - TREE_TYPE (ivar), 2, ivar, simt_lane); - x = build2 (code, TREE_TYPE (ivar), ivar, x); - gimplify_assign (ivar, x, &llist[2]); - } tree ivar2 = ivar; tree ref2 = ref; + tree zero = NULL_TREE; if (is_fp_and_or) { - tree zero = build_zero_cst (TREE_TYPE (ivar)); + zero = build_zero_cst (TREE_TYPE (ivar)); ivar2 = fold_build2_loc (clause_loc, NE_EXPR, integer_type_node, ivar, zero); ref2 = fold_build2_loc (clause_loc, NE_EXPR, integer_type_node, ref, zero); } - x = build2 (code, TREE_TYPE (ref), ref2, ivar2); + if (sctx.is_simt) + { + if (!simt_lane) + simt_lane = create_tmp_var (unsigned_type_node); + x = build_call_expr_internal_loc + (UNKNOWN_LOCATION, IFN_GOMP_SIMT_XCHG_BFLY, + TREE_TYPE (ivar), 2, ivar, simt_lane); + if (is_fp_and_or) + x = fold_build2_loc (clause_loc, NE_EXPR, + integer_type_node, x, zero); + x = build2 (code, TREE_TYPE (ivar2), ivar2, x); + if (is_fp_and_or) + x = fold_convert (TREE_TYPE (ivar), x); + gimplify_assign (ivar, x, &llist[2]); + } + x = build2 (code, TREE_TYPE (ref2), ref2, ivar2); if (is_fp_and_or) x = fold_convert (TREE_TYPE (ref), x); ref = build_outer_var_ref (var, ctx); diff --git a/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c b/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c new file mode 100644 index 0000000..346c882 --- /dev/null +++ b/libgomp/testsuite/libgomp.c-c++-common/reduction-5.c @@ -0,0 +1,192 @@ +/* C / C++'s logical AND and OR operators take any scalar argument + which compares (un)equal to 0 - the result 1 or 0 and of type int. + + In this testcase, the int result is again converted to a floating-poing + or complex type. + + While having a floating-point/complex array element with || and && can make + sense, having a non-integer/non-bool reduction variable is odd but valid. + + Test: FP reduction variable + FP array. */ + +#define N 1024 +_Complex float rcf[N]; +_Complex double rcd[N]; +float rf[N]; +double rd[N]; + +int +reduction_or () +{ + float orf = 0; + double ord = 0; + _Complex float orfc = 0; + _Complex double ordc = 0; + + #pragma omp target parallel reduction(||: orf) + for (int i=0; i < N; ++i) + orf = orf || rf[i]; + + #pragma omp target parallel for reduction(||: ord) + for (int i=0; i < N; ++i) + ord = ord || rcd[i]; + + #pragma omp target parallel for simd reduction(||: orfc) + for (int i=0; i < N; ++i) + orfc = orfc || rcf[i]; + + #pragma omp target parallel loop reduction(||: ordc) + for (int i=0; i < N; ++i) + ordc = ordc || rcd[i]; + + return orf + ord + __real__ orfc + __real__ ordc; +} + +int +reduction_or_teams () +{ + float orf = 0; + double ord = 0; + _Complex float orfc = 0; + _Complex double ordc = 0; + + #pragma omp target teams distribute parallel for reduction(||: orf) + for (int i=0; i < N; ++i) + orf = orf || rf[i]; + + #pragma omp target teams distribute parallel for simd reduction(||: ord) + for (int i=0; i < N; ++i) + ord = ord || rcd[i]; + + #pragma omp target teams distribute parallel for reduction(||: orfc) + for (int i=0; i < N; ++i) + orfc = orfc || rcf[i]; + + #pragma omp target teams distribute parallel for simd reduction(||: ordc) + for (int i=0; i < N; ++i) + ordc = ordc || rcd[i]; + + return orf + ord + __real__ orfc + __real__ ordc; +} + +int +reduction_and () +{ + float andf = 1; + double andd = 1; + _Complex float andfc = 1; + _Complex double anddc = 1; + + #pragma omp target parallel reduction(&&: andf) + for (int i=0; i < N; ++i) + andf = andf && rf[i]; + + #pragma omp target parallel for reduction(&&: andd) + for (int i=0; i < N; ++i) + andd = andd && rcd[i]; + + #pragma omp target parallel for simd reduction(&&: andfc) + for (int i=0; i < N; ++i) + andfc = andfc && rcf[i]; + + #pragma omp target parallel loop reduction(&&: anddc) + for (int i=0; i < N; ++i) + anddc = anddc && rcd[i]; + + return andf + andd + __real__ andfc + __real__ anddc; +} + +int +reduction_and_teams () +{ + float andf = 1; + double andd = 1; + _Complex float andfc = 1; + _Complex double anddc = 1; + + #pragma omp target teams distribute parallel for reduction(&&: andf) + for (int i=0; i < N; ++i) + andf = andf && rf[i]; + + #pragma omp target teams distribute parallel for simd reduction(&&: andd) + for (int i=0; i < N; ++i) + andd = andd && rcd[i]; + + #pragma omp target teams distribute parallel for reduction(&&: andfc) + for (int i=0; i < N; ++i) + andfc = andfc && rcf[i]; + + #pragma omp target teams distribute parallel for simd reduction(&&: anddc) + for (int i=0; i < N; ++i) + anddc = anddc && rcd[i]; + + return andf + andd + __real__ andfc + __real__ anddc; +} + +int +main () +{ + for (int i = 0; i < N; ++i) + { + rf[i] = 0; + rd[i] = 0; + rcf[i] = 0; + rcd[i] = 0; + } + + if (reduction_or () != 0) + __builtin_abort (); + if (reduction_or_teams () != 0) + __builtin_abort (); + if (reduction_and () != 0) + __builtin_abort (); + if (reduction_and_teams () != 0) + __builtin_abort (); + + rf[10] = 1.0; + rd[15] = 1.0; + rcf[10] = 1.0; + rcd[15] = 1.0i; + + if (reduction_or () != 4) + __builtin_abort (); + if (reduction_or_teams () != 4) + __builtin_abort (); + if (reduction_and () != 0) + __builtin_abort (); + if (reduction_and_teams () != 0) + __builtin_abort (); + + for (int i = 0; i < N; ++i) + { + rf[i] = 1; + rd[i] = 1; + rcf[i] = 1; + rcd[i] = 1; + } + + if (reduction_or () != 4) + __builtin_abort (); + if (reduction_or_teams () != 4) + __builtin_abort (); + if (reduction_and () != 4) + __builtin_abort (); + if (reduction_and_teams () != 4) + __builtin_abort (); + + rf[10] = 0.0; + rd[15] = 0.0; + rcf[10] = 0.0; + rcd[15] = 0.0; + + if (reduction_or () != 4) + __builtin_abort (); + if (reduction_or_teams () != 4) + __builtin_abort (); + if (reduction_and () != 0) + __builtin_abort (); + if (reduction_and_teams () != 0) + __builtin_abort (); + + return 0; +}