On Thu, Sep 17, 2015 at 05:47:43PM +0100, Matthew Wahab wrote:
> Hello,
>
> ARMv8.1 adds atomic swap and atomic load-operate instructions with
> optional memory ordering specifiers. This patch uses the ARMv8.1 atomic
> load-operate instructions to implement the atomic_fetch_<op>
> patterns. This patch also updates the implementation of the atomic_<op>
> patterns, which are treated as versions of atomic_fetch_<op> that discard
> the loaded data.
>
> The general form of the code generated for an atomic_fetch_<op>, with
> destination D, source S, memory address A and memory order MO, depends
> on whether the operation is directly supported by the instruction. If
> <op> is one of PLUS, IOR or XOR, the code generated is:
>
> ld<opc><mo><sz> S, D, [A]
>
> where
> <opc> is one of {add, set, eor}
> <mo> is one of {'', 'a', 'l', 'al'} depending on memory order MO.
> <sz> is one of {'', 'b', 'h'} depending on the data size.
>
> If <op> is SUB, the code generated, with scratch register r, is:
>
> neg r, S
> ldadd<mo><sz> r, D, [A]
>
> If <op> is AND, the code generated, again with scratch register r, is:
> not r, S
> ldclr<mo><sz> r, D, [A]
>
> Any operation not in {PLUS, IOR, XOR, SUB, AND} is passed to the
> existing aarch64_split_atomic_op function, to implement the operation
> using sequences built with the ARMv8 load-exclusive/store-exclusive
> instructions.
>
> Tested the series for aarch64-none-linux-gnu with native bootstrap and
> make check. Also tested for aarch64-none-elf with cross-compiled
> check-gcc on an ARMv8.1 emulator with +lse enabled by default.
>
> Ok for trunk?
Some comments inline below.
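
Just to check my reading of the description above (this example is mine,
not taken from the patch): with -march=armv8.1-a, or with +lse enabled,
something like

  int
  fetch_add (int *p, int v)
  {
    return __atomic_fetch_add (p, v, __ATOMIC_SEQ_CST);
  }

should now come out as a single load-operate instruction, roughly

  ldaddal w1, w0, [x0]

with the 'al' suffix picked from the seq_cst ordering, rather than the
ldaxr/stlxr retry loop we generate without LSE. Please shout if I've
misread the intent.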
> From c4b8eb6d2ca62c57f4a011e06335b918f603ad2a Mon Sep 17 00:00:00 2001
> From: Matthew Wahab <[email protected]>
> Date: Fri, 7 Aug 2015 17:10:42 +0100
> Subject: [PATCH 4/5] Use atomic instructions for fetch-update patterns.
>
> Change-Id: I39759f02e61039067ccaabfd52039e4804eddf2f
> ---
> gcc/config/aarch64/aarch64-protos.h | 2 +
> gcc/config/aarch64/aarch64.c | 176 ++++++++++++++++++++-
> gcc/config/aarch64/atomics.md | 109 ++++++++++++-
> .../gcc.target/aarch64/atomic-inst-ldadd.c | 58 +++++++
> .../gcc.target/aarch64/atomic-inst-ldlogic.c | 109 +++++++++++++
> 5 files changed, 444 insertions(+), 10 deletions(-)
> create mode 100644 gcc/testsuite/gcc.target/aarch64/atomic-inst-ldadd.c
> create mode 100644 gcc/testsuite/gcc.target/aarch64/atomic-inst-ldlogic.c
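
(The new tests aren't quoted below; I'm guessing they are scan-assembler
tests roughly of this shape -- purely illustrative, the function name,
options and counts here are mine:

  /* { dg-do compile } */
  /* { dg-options "-O2 -march=armv8-a+lse" } */

  int
  foo (int *p, int v)
  {
    return __atomic_fetch_add (p, v, __ATOMIC_RELAXED);
  }

  /* { dg-final { scan-assembler-times "ldadd\t" 1 } } */
  /* { dg-final { scan-assembler-not "ldaxr" } } */
)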
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index eba4c76..76ebd6f 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -378,6 +378,8 @@ rtx aarch64_load_tp (rtx);
> void aarch64_expand_compare_and_swap (rtx op[]);
> void aarch64_split_compare_and_swap (rtx op[]);
> void aarch64_gen_atomic_cas (rtx, rtx, rtx, rtx, rtx);
> +
> +bool aarch64_atomic_ldop_supported_p (enum rtx_code);
> void aarch64_gen_atomic_ldop (enum rtx_code, rtx, rtx, rtx, rtx);
> void aarch64_split_atomic_op (enum rtx_code, rtx, rtx, rtx, rtx, rtx, rtx);
>
> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
> index dc05c6e..33f9ef3 100644
> --- a/gcc/config/aarch64/aarch64.c
> +++ b/gcc/config/aarch64/aarch64.c
> @@ -11064,6 +11064,33 @@ aarch64_expand_compare_and_swap (rtx operands[])
> emit_insn (gen_rtx_SET (bval, x));
> }
>
> +/* Test whether the target supports using an atomic load-operate instruction.
> + CODE is the operation and AFTER is TRUE if the data in memory after the
> + operation should be returned and FALSE if the data before the operation
> + should be returned. Returns FALSE if the operation isn't supported by the
> + architecture.
> + */
Stray newline, leave the */ on the line before.
> +
> +bool
> +aarch64_atomic_ldop_supported_p (enum rtx_code code)
> +{
> + if (!TARGET_LSE)
> + return false;
> +
> + switch (code)
> + {
> + case SET:
> + case AND:
> + case IOR:
> + case XOR:
> + case MINUS:
> + case PLUS:
> + return true;
> + default:
> + return false;
> + }
> +}
> +
> /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
>    sequence implementing an atomic operation. */
>
> @@ -11206,26 +11233,169 @@ aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
> emit_insn (gen (dst, mem, value, model));
> }
>
> -/* Emit an atomic operation where the architecture supports it. */
> +/* Operations supported by aarch64_emit_atomic_load_op. */
> +
> +enum aarch64_atomic_load_op_code
> +{
> + AARCH64_LDOP_PLUS, /* A + B */
> + AARCH64_LDOP_XOR, /* A ^ B */
> + AARCH64_LDOP_OR, /* A | B */
> + AARCH64_LDOP_BIC /* A & ~B */
> +};
I have a small preference for calling these the same names as the
instructions they will generate, so AARCH64_LDOP_ADD, AARCH64_LDOP_EOR,
AARCH64_LDOP_SET and AARCH64_LDOP_CLR, but I'm happy for you to leave it
this way if you prefer.
> +
> +/* Emit an atomic load-operate. */
> +
> +static void
> +aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
> + machine_mode mode, rtx dst, rtx src,
> + rtx mem, rtx model)
> +{
> + typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
> + const aarch64_atomic_load_op_fn plus[] =
> + {
> + gen_aarch64_atomic_loadaddqi,
> + gen_aarch64_atomic_loadaddhi,
> + gen_aarch64_atomic_loadaddsi,
> + gen_aarch64_atomic_loadadddi
> + };
> + const aarch64_atomic_load_op_fn eor[] =
> + {
> + gen_aarch64_atomic_loadeorqi,
> + gen_aarch64_atomic_loadeorhi,
> + gen_aarch64_atomic_loadeorsi,
> + gen_aarch64_atomic_loadeordi
> + };
> + const aarch64_atomic_load_op_fn ior[] =
> + {
> + gen_aarch64_atomic_loadsetqi,
> + gen_aarch64_atomic_loadsethi,
> + gen_aarch64_atomic_loadsetsi,
> + gen_aarch64_atomic_loadsetdi
> + };
> + const aarch64_atomic_load_op_fn bic[] =
> + {
> + gen_aarch64_atomic_loadclrqi,
> + gen_aarch64_atomic_loadclrhi,
> + gen_aarch64_atomic_loadclrsi,
> + gen_aarch64_atomic_loadclrdi
> + };
> + aarch64_atomic_load_op_fn gen;
> + int idx = 0;
> +
> + switch (mode)
> + {
> + case QImode: idx = 0; break;
> + case HImode: idx = 1; break;
> + case SImode: idx = 2; break;
> + case DImode: idx = 3; break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + switch (code)
> + {
> + case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
> + case AARCH64_LDOP_XOR: gen = eor[idx]; break;
> + case AARCH64_LDOP_OR: gen = ior[idx]; break;
> + case AARCH64_LDOP_BIC: gen = bic[idx]; break;
> + default:
> + gcc_unreachable ();
> + }
> +
> + emit_insn (gen (dst, mem, src, model));
> +}
> +
> +/* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
> + location to store the data read from memory. MEM is the memory location to
> + read and modify. MODEL_RTX is the memory ordering to use. VALUE is the
> + second operand for the operation. Either OUT_DATA or OUT_RESULT, but not
> + both, can be NULL. */
>
> void
> aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data,
> rtx mem, rtx value, rtx model_rtx)
> {
> machine_mode mode = GET_MODE (mem);
> + machine_mode wmode = (mode == DImode ? DImode : SImode);
> + const bool short_mode = (mode < SImode);
> + aarch64_atomic_load_op_code ldop_code;
> + rtx src;
> + rtx x;
>
> - out_data = gen_lowpart (mode, out_data);
> + if (out_data)
> + out_data = gen_lowpart (mode, out_data);
> +
> + /* Make sure the value is in a register, putting it into a destination
> + register if it needs to be manipulated. */
> + if (!register_operand (value, mode)
> + || code == AND || code == MINUS)
> + {
> + src = out_data;
> + emit_move_insn (src, gen_lowpart (mode, value));
> + }
> + else
> + src = value;
> + gcc_assert (register_operand (src, mode));
>
> + /* Preprocess the data for the operation as necessary. If the operation is
> + a SET then emit a swap instruction and finish. */
> switch (code)
> {
> case SET:
> - aarch64_emit_atomic_swap (mode, out_data, value, mem, model_rtx);
> + aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
> return;
>
> + case MINUS:
> + /* Negate the value and treat it as a PLUS. */
> + {
> + rtx neg_src;
> +
> + /* Resize the value if necessary. */
> + if (short_mode)
> + src = gen_lowpart (wmode, src);
> +
> + neg_src = gen_rtx_NEG (wmode, src);
> + emit_insn (gen_rtx_SET (src, neg_src));
> +
> + if (short_mode)
> + src = gen_lowpart (mode, src);
> + }
> + /* Fall-through. */
> + case PLUS:
> + ldop_code = AARCH64_LDOP_PLUS;
> + break;
> +
> + case IOR:
> + ldop_code = AARCH64_LDOP_OR;
> + break;
> +
> + case XOR:
> + ldop_code = AARCH64_LDOP_XOR;
> + break;
> +
> + case AND:
> + {
> + rtx not_src;
> +
> + /* Resize the value if necessary. */
> + if (short_mode)
> + src = gen_lowpart (wmode, src);
> +
> + not_src = gen_rtx_NOT (wmode, src);
> + emit_insn (gen_rtx_SET (src, not_src));
> +
> + if (short_mode)
> + src = gen_lowpart (mode, src);
> + }
> + ldop_code = AARCH64_LDOP_BIC;
> + break;
> +
> default:
> /* The operation can't be done with atomic instructions. */
> gcc_unreachable ();
> }
> +
> + aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem,
> + model_rtx);
> }
>
> /* Split an atomic operation. */
> diff --git a/gcc/config/aarch64/atomics.md b/gcc/config/aarch64/atomics.md
> index b7b6fb5..5a10cf9 100644
> --- a/gcc/config/aarch64/atomics.md
> +++ b/gcc/config/aarch64/atomics.md
> @@ -230,23 +230,67 @@
> }
> )
>
> -(define_insn_and_split "atomic_<atomic_optab><mode>"
> +(define_expand "atomic_<atomic_optab><mode>"
> + [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "")
> + (unspec_volatile:ALLI
> + [(atomic_op:ALLI (match_dup 0)
> + (match_operand:ALLI 1 "<atomic_op_operand>" ""))
> + (match_operand:SI 2 "const_int_operand")]
> + UNSPECV_ATOMIC_OP))
> + (clobber (reg:CC CC_REGNUM))]
This is not needed for the LSE forms of these instructions and may result
in less optimal code generation. On the other hand, that will only be in
a corner case and this is only a define_expand. Because of that, it would
be clearer to a reader if you dropped the detailed description of this
in RTL (which is never used) and rewrote it using just the uses of the
operands, like so:
  (define_expand "atomic_<atomic_optab><mode>"
    [(match_operand:ALLI 0 "aarch64_sync_memory_operand" "")
     (match_operand:ALLI 1 "<atomic_op_operand>" "")
     (match_operand:SI 2 "const_int_operand")]
> +(define_insn_and_split "aarch64_atomic_<atomic_optab><mode>"
> + [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
> + (unspec_volatile:ALLI
> + [(atomic_op:ALLI (match_dup 0)
> + (match_operand:ALLI 1 "<atomic_op_operand>" "r<const_atomic>"))
> + (match_operand:SI 2 "const_int_operand")]
> + UNSPECV_ATOMIC_OP))
> + (clobber (reg:CC CC_REGNUM))
> + (clobber (match_scratch:ALLI 3 "=&r"))
> + (clobber (match_scratch:SI 4 "=&r"))]
> + ""
TARGET_LSE ?
> + "#"
> + "&& reload_completed"
> + [(const_int 0)]
> + {
> + aarch64_split_atomic_op (<CODE>, NULL, operands[3], operands[0],
> + operands[1], operands[2], operands[4]);
> + DONE;
> + }
> +)
> +
> +(define_insn_and_split "aarch64_atomic_<atomic_optab><mode>_lse"
> [(set (match_operand:ALLI 0 "aarch64_sync_memory_operand" "+Q")
> (unspec_volatile:ALLI
> [(atomic_op:ALLI (match_dup 0)
> (match_operand:ALLI 1 "<atomic_op_operand>" "r<const_atomic>"))
> - (match_operand:SI 2 "const_int_operand")] ;; model
> + (match_operand:SI 2 "const_int_operand")]
> UNSPECV_ATOMIC_OP))
> - (clobber (reg:CC CC_REGNUM))
> (clobber (match_scratch:ALLI 3 "=&r"))
> - (clobber (match_scratch:SI 4 "=&r"))]
> + (clobber (reg:CC CC_REGNUM))]
> ""
TARGET_LSE?
> "#"
> "&& reload_completed"
> [(const_int 0)]
> {
> - aarch64_split_atomic_op (<CODE>, NULL, operands[3], operands[0],
> - operands[1], operands[2], operands[4]);
> + aarch64_gen_atomic_ldop (<CODE>, operands[3], operands[0],
> + operands[1], operands[2]);
> DONE;
> }
> )
> @@ -273,7 +317,37 @@
> }
> )
>
> -(define_insn_and_split "atomic_fetch_<atomic_optab><mode>"
> +;; Load-operate-store, returning the updated memory data.
> +
> +(define_expand "atomic_fetch_<atomic_optab><mode>"
> + [(parallel
> + [(set
> + (match_operand:ALLI 0 "register_operand" "")
> + (match_operand:ALLI 1 "aarch64_sync_memory_operand" ""))
> + (set
> + (match_dup 1)
> + (unspec_volatile:ALLI
> + [(atomic_op:ALLI
> + (match_dup 1)
> + (match_operand:ALLI 2 "<atomic_op_operand>" ""))
> + (match_operand:SI 3 "const_int_operand")]
> + UNSPECV_ATOMIC_OP))])]
This again seems very detailed for an expand pattern which won't use
it.
> + ""
> +{
> + rtx (*gen) (rtx, rtx, rtx, rtx);
> +
> + /* Use an atomic load-operate instruction when possible. */
> + if (aarch64_atomic_ldop_supported_p (<CODE>))
> + gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>_lse;
> + else
> + gen = gen_aarch64_atomic_fetch_<atomic_optab><mode>;
> +
> + emit_insn (gen (operands[0], operands[1], operands[2], operands[3]));
> +
> + DONE;
> +})
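
For my own understanding of the AND path (example mine, not from the
patch): under +lse an __atomic_fetch_and, e.g.

  long
  fetch_and (long *p, long m)
  {
    return __atomic_fetch_and (p, m, __ATOMIC_ACQUIRE);
  }

should end up as an mvn of the operand followed by an ldclra (modulo
register allocation), matching the not/ldclr sequence described at the
top of your mail, with anything not covered by
aarch64_atomic_ldop_supported_p still going through
aarch64_split_atomic_op.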
> +
> +(define_insn_and_split "aarch64_atomic_fetch_<atomic_optab><mode>"
> [(set (match_operand:ALLI 0 "register_operand" "=&r")
> (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
> (set (match_dup 1)
> @@ -296,6 +370,27 @@
> }
> )
>
> +(define_insn_and_split "aarch64_atomic_fetch_<atomic_optab><mode>_lse"
> + [(set (match_operand:ALLI 0 "register_operand" "=&r")
> + (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
> + (set (match_dup 1)
> + (unspec_volatile:ALLI
> + [(atomic_op:ALLI (match_dup 1)
> + (match_operand:ALLI 2 "<atomic_op_operand>" "r<const_atomic>"))
> + (match_operand:SI 3 "const_int_operand")]
> + UNSPECV_ATOMIC_LDOP))
> + (clobber (reg:CC CC_REGNUM))]
> + "TARGET_LSE"
> + "#"
> + "&& reload_completed"
> + [(const_int 0)]
> + {
> + aarch64_gen_atomic_ldop (<CODE>, operands[0], operands[1],
> + operands[2], operands[3]);
> + DONE;
> + }
> +)
> +
> (define_insn_and_split "atomic_fetch_nand<mode>"
> [(set (match_operand:ALLI 0 "register_operand" "=&r")
> (match_operand:ALLI 1 "aarch64_sync_memory_operand" "+Q"))
>
Thanks,
James