I've committed this patch to:
1) Remove the partitioning field I recently added to call_insn patterns. It
will not be needed.
2) Change the fork/join and associated instructions and machinery to have a
partitioning mask, rather than a mode. This allows them to specify multiple modes
concurrently. That will be required for routine handling. The interface from
the middle-end remains unchanged.
nathan
2015-08-21 Nathan Sidwell <nat...@codesourcery.com>
* config/nvptx/nvptx.md (call_operation): Remove partitioning
handling.
(oacc_fork, oacc_join): Adjust expand calls.
* config/nvptx/nvptx.c (write_func_decl_from_insn): Remove
partitioning handling.
(nvptx_expand_call): Remove partitioning handling, generate mask
instead.
(nvptx_expand_oacc_fork, nvptx_expand_oacc_join): Take unsigned
mask, not RTX, and emit all necessary insns.
(nvptx_output_call_insn): Remove partitioning handling.
(struct parallel): Replace mode with mask.
(nvptx_dump_pars): Adjust.
(nvptx_discover_pars): Adjust for insns having mask not mode.
(nvptx_process_pars): Adjust for parallel containing mask not
mode.
(nvptx_neuter_pars): Likewise.
* config/nvptx/nvptx-protos.h (nvptx_expand_oacc_fork,
nvptx_expand_oacc_join): Pass mask not rtx.
Index: config/nvptx/nvptx-protos.h
===================================================================
--- config/nvptx/nvptx-protos.h (revision 227038)
+++ config/nvptx/nvptx-protos.h (working copy)
@@ -32,8 +32,8 @@ extern void nvptx_register_pragmas (void
extern const char *nvptx_section_for_decl (const_tree);
#ifdef RTX_CODE
-extern void nvptx_expand_oacc_fork (rtx);
-extern void nvptx_expand_oacc_join (rtx);
+extern void nvptx_expand_oacc_fork (unsigned);
+extern void nvptx_expand_oacc_join (unsigned);
extern void nvptx_expand_oacc_lock_unlock (rtx, bool);
extern void nvptx_expand_call (rtx, rtx);
extern rtx nvptx_expand_compare (rtx);
Index: config/nvptx/nvptx.c
===================================================================
--- config/nvptx/nvptx.c (revision 227038)
+++ config/nvptx/nvptx.c (working copy)
@@ -762,29 +762,30 @@ write_func_decl_from_insn (std::stringst
s << name;
- int nargs = XVECLEN (pat, 0) - 1;
- if (nargs > 0)
+ int arg_end = XVECLEN (pat, 0);
+
+ if (1 < arg_end)
{
+ const char *comma = "";
s << " (";
- for (int i = 0; i < nargs; i++)
+ for (int i = 1; i < arg_end; i++)
{
- rtx t = XEXP (XVECEXP (pat, 0, i + 1), 0);
+ rtx t = XEXP (XVECEXP (pat, 0, i), 0);
machine_mode mode = GET_MODE (t);
int count = maybe_split_mode (&mode);
- while (count-- > 0)
+ while (count--)
{
- s << ".param";
+ s << comma << ".param";
s << nvptx_ptx_type_from_mode (mode, false);
s << " ";
if (callprototype)
s << "_";
else
- s << "%arg" << i;
+ s << "%arg" << i - 1;
if (mode == QImode || mode == HImode)
s << "[1]";
- if (i + 1 < nargs || count > 0)
- s << ", ";
+ comma = ", ";
}
}
s << ")";
@@ -853,9 +854,9 @@ nvptx_expand_call (rtx retval, rtx addre
rtx pat, t;
rtvec vec;
bool external_decl = false;
- rtx partitioning = NULL_RTX;
rtx varargs = NULL_RTX;
tree decl_type = NULL_TREE;
+ unsigned parallel = 0;
for (t = cfun->machine->call_args; t; t = XEXP (t, 1))
nargs++;
@@ -886,7 +887,9 @@ nvptx_expand_call (rtx retval, rtx addre
if (TREE_PURPOSE (dims)
&& !integer_zerop (TREE_PURPOSE (dims)))
{
- partitioning = GEN_INT (ix);
+ parallel = GOMP_DIM_MASK (GOMP_DIM_MAX) - 1;
+ if (ix)
+ parallel ^= GOMP_DIM_MASK (ix - 1) - 1;
break;
}
dims = TREE_CHAIN (dims);
@@ -894,6 +897,7 @@ nvptx_expand_call (rtx retval, rtx addre
}
}
}
+
if (cfun->machine->funtype
/* It's possible to construct testcases where we call a variable.
See compile/20020129-1.c. stdarg_p will crash so avoid calling it
@@ -909,8 +913,7 @@ nvptx_expand_call (rtx retval, rtx addre
emit_move_insn (varargs, stack_pointer_rtx);
cfun->machine->has_call_with_varargs = true;
}
- vec = rtvec_alloc (nargs + 1
- + (partitioning ? 1 : 0) + (varargs ? 1 : 0));
+ vec = rtvec_alloc (nargs + 1 + (varargs ? 1 : 0));
pat = gen_rtx_PARALLEL (VOIDmode, vec);
int vec_pos = 0;
@@ -925,9 +928,6 @@ nvptx_expand_call (rtx retval, rtx addre
}
XVECEXP (pat, 0, vec_pos++) = t;
- if (partitioning)
- XVECEXP (pat, 0, vec_pos++) = partitioning;
-
/* Construct the call insn, including a USE for each argument pseudo
register. These will be used when printing the insn. */
for (rtx arg = cfun->machine->call_args; arg; arg = XEXP (arg, 1))
@@ -1169,18 +1169,34 @@ nvptx_expand_compare (rtx compare)
/* Expand the oacc fork & join primitive into ptx-required unspecs. */
void
-nvptx_expand_oacc_fork (rtx mode)
+nvptx_expand_oacc_fork (unsigned mask)
{
- /* Emit fork for worker level. */
- if (UINTVAL (mode) == GOMP_DIM_WORKER)
- emit_insn (gen_nvptx_fork (mode));
+ mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+ | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ if (mask)
+ {
+ rtx op = GEN_INT (mask);
+
+ /* Emit fork for worker level. */
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
+ emit_insn (gen_nvptx_fork (op));
+ emit_insn (gen_nvptx_forked (op));
+ }
}
void
-nvptx_expand_oacc_join (rtx mode)
+nvptx_expand_oacc_join (unsigned mask)
{
- /* Emit joining for all pars. */
- emit_insn (gen_nvptx_joining (mode));
+ mask &= (GOMP_DIM_MASK (GOMP_DIM_WORKER)
+ | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
+ if (mask)
+ {
+ rtx op = GEN_INT (mask);
+
+ /* Emit joining for all pars. */
+ emit_insn (gen_nvptx_joining (op));
+ emit_insn (gen_nvptx_join (op));
+ }
}
/* Expander for reduction locking and unlocking. We expect SRC to be
@@ -1836,18 +1852,7 @@ nvptx_output_call_insn (rtx_insn *insn,
bool needs_tgt = register_operand (callee, Pmode);
rtx pat = PATTERN (insn);
int arg_end = XVECLEN (pat, 0);
- int arg_start = 1;
tree decl = NULL_TREE;
- rtx partitioning = NULL_RTX;
-
- if (arg_end > 1)
- {
- partitioning = XVECEXP (pat, 0, 1);
- if (GET_CODE (partitioning) == CONST_INT)
- arg_start++;
- else
- partitioning = NULL_RTX;
- }
fprintf (asm_out_file, "\t{\n");
if (result != NULL)
@@ -1873,7 +1878,7 @@ nvptx_output_call_insn (rtx_insn *insn,
fputs (s.str().c_str(), asm_out_file);
}
- for (int i = arg_start, argno = 0; i < arg_end; i++)
+ for (int i = 1, argno = 0; i < arg_end; i++)
{
rtx t = XEXP (XVECEXP (pat, 0, i), 0);
machine_mode mode = GET_MODE (t);
@@ -1884,7 +1889,7 @@ nvptx_output_call_insn (rtx_insn *insn,
nvptx_ptx_type_from_mode (mode, false), argno++,
mode == QImode || mode == HImode ? "[1]" : "");
}
- for (int i = arg_start, argno = 0; i < arg_end; i++)
+ for (int i = 1, argno = 0; i < arg_end; i++)
{
rtx t = XEXP (XVECEXP (pat, 0, i), 0);
gcc_assert (REG_P (t));
@@ -1918,12 +1923,12 @@ nvptx_output_call_insn (rtx_insn *insn,
else
output_address (callee);
- if (arg_end > arg_start || (decl && DECL_STATIC_CHAIN (decl)))
+ if (arg_end > 1 || (decl && DECL_STATIC_CHAIN (decl)))
{
const char *comma = "";
fprintf (asm_out_file, ", (");
- for (int i = arg_start, argno = 0; i < arg_end; i++)
+ for (int i = 1, argno = 0; i < arg_end; i++)
{
rtx t = XEXP (XVECEXP (pat, 0, i), 0);
machine_mode mode = GET_MODE (t);
@@ -2357,8 +2362,8 @@ struct parallel
/* First child parallel. */
parallel *inner;
- /* Partitioning mode of the parallel. */
- unsigned mode;
+ /* Partitioning mask of the parallel. */
+ unsigned mask;
/* Partitioning used within inner parallels. */
unsigned inner_mask;
@@ -2388,8 +2393,8 @@ public:
/* Constructor links the new parallel into it's parent's chain of
children. */
-parallel::parallel (parallel *parent_, unsigned mode_)
- :parent (parent_), next (0), inner (0), mode (mode_), inner_mask (0)
+parallel::parallel (parallel *parent_, unsigned mask_)
+ :parent (parent_), next (0), inner (0), mask (mask_), inner_mask (0)
{
forked_block = join_block = 0;
forked_insn = join_insn = 0;
@@ -2450,7 +2455,7 @@ nvptx_split_blocks (bb_insn_map_t *map)
case CODE_FOR_nvptx_forked:
case CODE_FOR_nvptx_join:
break;
-
+
case CODE_FOR_return:
/* We also need to split just before return insns, as
that insn needs executing by all threads, but the
@@ -2514,8 +2519,8 @@ nvptx_discover_pre (basic_block block, i
static void
nvptx_dump_pars (parallel *par, unsigned depth)
{
- fprintf (dump_file, "%u: mode %d head=%d, tail=%d\n",
- depth, par->mode,
+ fprintf (dump_file, "%u: mask %d head=%d, tail=%d\n",
+ depth, par->mask,
par->forked_block ? par->forked_block->index : -1,
par->join_block ? par->join_block->index : -1);
@@ -2544,7 +2549,7 @@ typedef auto_vec<bb_par_t> bb_par_vec_t;
static parallel *
nvptx_discover_pars (bb_insn_map_t *map)
{
- parallel *outer_par = new parallel (0, GOMP_DIM_MAX);
+ parallel *outer_par = new parallel (0, 0);
bb_par_vec_t worklist;
basic_block block;
@@ -2583,12 +2588,13 @@ nvptx_discover_pars (bb_insn_map_t *map)
/* Loop head, create a new inner loop and add it into
our parent's child list. */
{
- unsigned mode = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
-
- l = new parallel (l, mode);
+ unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+
+ gcc_assert (mask);
+ l = new parallel (l, mask);
l->forked_block = block;
l->forked_insn = end;
- if (mode == GOMP_DIM_WORKER)
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
l->fork_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_fork);
}
@@ -2598,12 +2604,12 @@ nvptx_discover_pars (bb_insn_map_t *map)
/* A loop tail. Finish the current loop and return to
parent. */
{
- unsigned mode = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
+ unsigned mask = UINTVAL (XVECEXP (PATTERN (end), 0, 0));
- gcc_assert (l->mode == mode);
+ gcc_assert (l->mask == mask);
l->join_block = block;
l->join_insn = end;
- if (mode == GOMP_DIM_WORKER)
+ if (mask & GOMP_DIM_MASK (GOMP_DIM_WORKER))
l->joining_insn
= nvptx_discover_pre (block, CODE_FOR_nvptx_joining);
l = l->parent;
@@ -2981,7 +2987,7 @@ nvptx_skip_par (unsigned mask, parallel
static unsigned
nvptx_process_pars (parallel *par)
{
- unsigned inner_mask = GOMP_DIM_MASK (par->mode);
+ unsigned inner_mask = par->mask;
/* Do the inner parallels first. */
if (par->inner)
@@ -2990,17 +2996,17 @@ nvptx_process_pars (parallel *par)
inner_mask |= par->inner_mask;
}
- switch (par->mode)
+ switch (par->mask)
{
- case GOMP_DIM_MAX:
+ case 0:
/* Dummy parallel. */
break;
- case GOMP_DIM_VECTOR:
+ case GOMP_DIM_MASK (GOMP_DIM_VECTOR):
nvptx_vpropagate (par->forked_block, par->forked_insn);
break;
- case GOMP_DIM_WORKER:
+ case GOMP_DIM_MASK (GOMP_DIM_WORKER):
{
nvptx_wpropagate (false, par->forked_block,
par->forked_insn);
@@ -3011,9 +3017,6 @@ nvptx_process_pars (parallel *par)
}
break;
- case GOMP_DIM_GANG:
- break;
-
default:gcc_unreachable ();
}
@@ -3030,9 +3033,8 @@ nvptx_process_pars (parallel *par)
static void
nvptx_neuter_pars (parallel *par, unsigned modes, unsigned outer)
{
- unsigned me = (GOMP_DIM_MASK (par->mode)
- & (GOMP_DIM_MASK (GOMP_DIM_WORKER)
- | GOMP_DIM_MASK (GOMP_DIM_VECTOR)));
+ unsigned me = par->mask
+ & (GOMP_DIM_MASK (GOMP_DIM_WORKER) | GOMP_DIM_MASK (GOMP_DIM_VECTOR));
unsigned skip_mask = 0, neuter_mask = 0;
if (par->inner)
Index: config/nvptx/nvptx.md
===================================================================
--- config/nvptx/nvptx.md (revision 227038)
+++ config/nvptx/nvptx.md (working copy)
@@ -211,14 +211,9 @@
(define_predicate "call_operation"
(match_code "parallel")
{
- int arg_start = 1;
int arg_end = XVECLEN (op, 0);
- /* Skip optional routine partitioning information. */
- if (arg_end > 1 && GET_CODE (XVECEXP (op, 0, 1)) == CONST_INT)
- arg_start++;
-
- for (int i = arg_start; i < arg_end; i++)
+ for (int i = 1; i < arg_end; i++)
{
rtx elt = XVECEXP (op, 0, i);
@@ -1423,7 +1418,8 @@
UNSPECV_FORKED)]
""
{
- nvptx_expand_oacc_fork (operands[0]);
+ nvptx_expand_oacc_fork (GOMP_DIM_MASK (INTVAL (operands[0])));
+ DONE;
})
(define_expand "oacc_join"
@@ -1431,7 +1427,8 @@
UNSPECV_JOIN)]
""
{
- nvptx_expand_oacc_join (operands[0]);
+ nvptx_expand_oacc_join (GOMP_DIM_MASK (INTVAL (operands[0])));
+ DONE;
})
;; only 32-bit shuffles exist.