Re: [PATCH][AArch64] Cleanup move immediate code

Richard Sandiford via Gcc-patches Thu, 24 Nov 2022 04:20:50 -0800

Sorry for the very long delay in reviewing this.

Wilco Dijkstra <wilco.dijks...@arm.com> writes:
> Hi Richard,
>
> Here is the immediate cleanup splitoff from the previous patch:
>
> Simplify, refactor and improve various move immediate functions.
> Allow 32-bit MOVZ/N as a valid 64-bit immediate which removes special
> cases in aarch64_internal_mov_immediate.  Add new constraint so the movdi
> pattern only needs a single alternative for move immediate.


Just to make sure I understand: isn't it really just MOVN?  I would have
expected a 32-bit MOVZ to be equivalent to (and add no capabilities over)
a 64-bit MOVZ.

> Passes bootstrap and regress, OK for commit?
>
> gcc/ChangeLog:
>
>         * config/aarch64/aarch64.cc (aarch64_bitmask_imm): Use unsigned type.
>         (aarch64_zeroextended_move_imm): New function.
>         (aarch64_move_imm): Refactor, assert mode is SImode or DImode.
>         (aarch64_internal_mov_immediate): Assert mode is SImode or DImode.
>         Simplify special cases.
>         (aarch64_uimm12_shift): Simplify code.
>         (aarch64_clamp_to_uimm12_shift): Likewise.
>         (aarch64_movw_imm): Remove.
>         (aarch64_float_const_rtx_p): Pass either SImode or DImode to
>         aarch64_internal_mov_immediate.
>         (aarch64_rtx_costs): Likewise.
>         * config/aarch64/aarch64.md (movdi_aarch64): Merge 'N' and 'M'
>         constraints into single 'O'.
>         (mov<mode>_aarch64): Likewise.
>         * config/aarch64/aarch64-protos.h (aarch64_move_imm): Use unsigned.
>         (aarch64_bitmask_imm): Likewise.
>         (aarch64_uimm12_shift): Likewise.
>         (aarch64_zeroextended_move_imm): New prototype.
>         * config/aarch64/constraints.md: Add 'O' for 32/64-bit immediates,
>         limit 'N' to 64-bit only moves.
>
> ---
>
> diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
> index 1a71f02284137c64e7115b26e6aa00447596f105..a73bfa20acb9b92ae0475794c3f11c67d22feb97 100644
> --- a/gcc/config/aarch64/aarch64-protos.h
> +++ b/gcc/config/aarch64/aarch64-protos.h
> @@ -755,7 +755,7 @@ void aarch64_post_cfi_startproc (void);
>  poly_int64 aarch64_initial_elimination_offset (unsigned, unsigned);
>  int aarch64_get_condition_code (rtx);
>  bool aarch64_address_valid_for_prefetch_p (rtx, bool);
> -bool aarch64_bitmask_imm (HOST_WIDE_INT val, machine_mode);
> +bool aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm1 (HOST_WIDE_INT val_in);
>  unsigned HOST_WIDE_INT aarch64_and_split_imm2 (HOST_WIDE_INT val_in);
>  bool aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode);
> @@ -792,7 +792,7 @@ bool aarch64_masks_and_shift_for_bfi_p (scalar_int_mode, unsigned HOST_WIDE_INT,
>                                         unsigned HOST_WIDE_INT,
>                                         unsigned HOST_WIDE_INT);
>  bool aarch64_zero_extend_const_eq (machine_mode, rtx, machine_mode, rtx);
> -bool aarch64_move_imm (HOST_WIDE_INT, machine_mode);
> +bool aarch64_move_imm (unsigned HOST_WIDE_INT, machine_mode);
>  machine_mode aarch64_sve_int_mode (machine_mode);
>  opt_machine_mode aarch64_sve_pred_mode (unsigned int);
>  machine_mode aarch64_sve_pred_mode (machine_mode);
> @@ -842,8 +842,9 @@ bool aarch64_sve_float_arith_immediate_p (rtx, bool);
>  bool aarch64_sve_float_mul_immediate_p (rtx);
>  bool aarch64_split_dimode_const_store (rtx, rtx);
>  bool aarch64_symbolic_address_p (rtx);
> -bool aarch64_uimm12_shift (HOST_WIDE_INT);
> +bool aarch64_uimm12_shift (unsigned HOST_WIDE_INT);
>  int aarch64_movk_shift (const wide_int_ref &, const wide_int_ref &);
> +bool aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT);
>  bool aarch64_use_return_insn_p (void);
>  const char *aarch64_output_casesi (rtx *);
>
> diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
> index 5d1ab5aa42b2cda0a655d2bc69c4df19da457ab3..798363bcc449c414de5bbb4f26b8e1c64a0cf71a 100644
> --- a/gcc/config/aarch64/aarch64.cc
> +++ b/gcc/config/aarch64/aarch64.cc
> @@ -5558,12 +5558,10 @@ aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
>
>  /* Return true if VAL is a valid bitmask immediate for MODE.  */
>  bool
> -aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
> +aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
>  {
>    if (mode == DImode)
> -    return aarch64_bitmask_imm (val_in);
> -
> -  unsigned HOST_WIDE_INT val = val_in;
> +    return aarch64_bitmask_imm (val);
>
>    if (mode == SImode)
>      return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
> @@ -5602,51 +5600,60 @@ aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
>  }
>
>
> -/* Return true if val is an immediate that can be loaded into a
> -   register by a MOVZ instruction.  */
> -static bool
> -aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
> +/* Return true if immediate VAL can only be created by using a 32-bit
> +   zero-extended move immediate, not by a 64-bit move.  */
> +bool
> +aarch64_zeroextended_move_imm (unsigned HOST_WIDE_INT val)
>  {
> -  if (GET_MODE_SIZE (mode) > 4)
> -    {
> -      if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
> -          || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
> -       return 1;
> -    }
> -  else
> -    {
> -      /* Ignore sign extension.  */
> -      val &= (HOST_WIDE_INT) 0xffffffff;
> -    }
> -  return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
> -         || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
> +  if (val < 65536 || (val >> 32) != 0 || (val & 0xffff) == 0)
> +    return false;
> +  return !aarch64_bitmask_imm (val);
>  }
>
>
> -/* Return true if VAL is an immediate that can be loaded into a
> -   register in a single instruction.  */
> +/* Return true if VAL is an immediate that can be created by a single
> +   MOV instruction.  */
>  bool
> -aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
> +aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
>  {
> -  scalar_int_mode int_mode;
> -  if (!is_a <scalar_int_mode> (mode, &int_mode))
> -    return false;
> +  unsigned HOST_WIDE_INT val2;
>
> -  if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
> -    return 1;
> -  return aarch64_bitmask_imm (val, int_mode);
> +  gcc_assert (mode == SImode || mode == DImode);
> +
> +  if (val < 65536)
> +    return true;
> +
> +  val2 = val ^ ((HOST_WIDE_INT) val >> 63);

I think we're supposed to avoid relying on >> being an arithmetic shift,
since C++11 doesn't guarantee it.  So I think it needs to be something like:

  val2 = ((HOST_WIDE_INT) val < 0 ? ~val : val);

> +  if ((val2 >> (__builtin_ctzll (val2) & 48)) < 65536)
> +    return true;

We should use ctz_hwi here, instead of using the GCC builtin directly.

> +
> +  /* Special case 0xyyyyffffffffffff. */
> +  if (((val2 + 1) << 16) == 0)
> +    return true;
> +
> +  /* Special case immediates 0xffffyyyy and 0xyyyyffff.  */
> +  val2 = (mode == DImode) ? val : val2;
> +  if (((val2 + 1) & ~(unsigned HOST_WIDE_INT) 0xffff0000) == 0
> +      || (val2 >> 16) == 0xffff)
> +    return true;
> +
> +  if (mode == SImode || (val >> 32) == 0)
> +    val = (val & 0xffffffff) | (val << 32);
> +  return aarch64_bitmask_imm (val);

I agree the ctz trick is more elegant than (and an improvement over)
the current approach to testing for movz.  But I think the overall logic
is harder to follow than it was in the original version.  Initially
canonicalising val2 based on the sign bit seems unintuitive since we
still need to handle all four combinations of (top bit set, top bit clear)
x (low 48 bits set, low 48 bits clear).  I preferred the original
approach of testing once with the original value (for MOVZ) and once
with the inverted value (for MOVN).

Don't the new cases boil down to: if mode is DImode and the upper 32 bits
are clear, we can test based on SImode instead?  In other words, couldn't
the "(val >> 32) == 0" part of the final test be done first, with the
effect of changing the mode to SImode?  Something like:

  gcc_assert (mode == SImode || mode == DImode);

  /* We can't use a 64-bit MOVN if the upper 32 bits are clear, but we might
     be able to use a 32-bit MOVN.  We could also use a 32-bit ORR.  */
  if (mode == DImode && (val >> 32) == 0)
    mode = SImode;
  
  if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
    return true;

  if (mode == SImode)
    val = (val & 0xffffffff) | (val << 32);

  return aarch64_bitmask_imm (val);

with movw_imm rewritten to use ctz?  (Completely untested :-))

Or am I missing a case that the patch handles and the code above doesn't?

Thanks,
Richard


>  }
>
>
>  static int
>  aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
> -                               scalar_int_mode mode)
> +                               machine_mode mode)
>  {
>    int i;
>    unsigned HOST_WIDE_INT val, val2, mask;
>    int one_match, zero_match;
>    int num_insns;
>
> +  gcc_assert (mode == SImode || mode == DImode);
> +
>    val = INTVAL (imm);
>
>    if (aarch64_move_imm (val, mode))
> @@ -5656,31 +5663,6 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>        return 1;
>      }
>
> -  /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
> -     (with XXXX non-zero). In that case check to see if the move can be done in
> -     a smaller mode.  */
> -  val2 = val & 0xffffffff;
> -  if (mode == DImode
> -      && aarch64_move_imm (val2, SImode)
> -      && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
> -    {
> -      if (generate)
> -       emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> -
> -      /* Check if we have to emit a second instruction by checking to see
> -        if any of the upper 32 bits of the original DI mode value is set.  */
> -      if (val == val2)
> -       return 1;
> -
> -      i = (val >> 48) ? 48 : 32;
> -
> -      if (generate)
> -        emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> -                                   GEN_INT ((val >> i) & 0xffff)));
> -
> -      return 2;
> -    }
> -
>    if ((val >> 32) == 0 || mode == SImode)
>      {
>        if (generate)
> @@ -5704,24 +5686,31 @@ aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
>    one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
>      ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
>
> +  /* Try a bitmask immediate and a movk to generate the immediate
> +     in 2 instructions.  */
> +
>    if (zero_match < 2 && one_match < 2)
>      {
> -      /* Try emitting a bitmask immediate with a movk replacing 16 bits.
> -        For a 64-bit bitmask try whether changing 16 bits to all ones or
> -        zeroes creates a valid bitmask.  To check any repeated bitmask,
> -        try using 16 bits from the other 32-bit half of val.  */
> -
>        for (i = 0; i < 64; i += 16)
> -       if (aarch64_check_bitmask (val, val2, mask << i))
> -         {
> -           if (generate)
> -             {
> -               emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> -               emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> -                                          GEN_INT ((val >> i) & 0xffff)));
> -             }
> -           return 2;
> -         }
> +       {
> +         if (aarch64_check_bitmask (val, val2, mask << i))
> +           break;
> +
> +         val2 = val & ~(mask << i);
> +         if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
> +           break;
> +       }
> +
> +      if (i != 64)
> +       {
> +         if (generate)
> +           {
> +             emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
> +             emit_insn (gen_insv_immdi (dest, GEN_INT (i),
> +                                        GEN_INT ((val >> i) & 0xffff)));
> +           }
> +         return 2;
> +       }
>      }
>
>    /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions.  */
> @@ -5790,26 +5779,24 @@ aarch64_mov128_immediate (rtx imm)
>  /* Return true if val can be encoded as a 12-bit unsigned immediate with
>     a left shift of 0 or 12 bits.  */
>  bool
> -aarch64_uimm12_shift (HOST_WIDE_INT val)
> +aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
>  {
> -  return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
> -         || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
> -         );
> +  return val < 4096 || (val & 0xfff000) == val;
>  }
>
>  /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
>     that can be created with a left shift of 0 or 12.  */
>  static HOST_WIDE_INT
> -aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
> +aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
>  {
>    /* Check to see if the value fits in 24 bits, as that is the maximum we can
>       handle correctly.  */
> -  gcc_assert ((val & 0xffffff) == val);
> +  gcc_assert (val < 0x1000000);
>
> -  if (((val & 0xfff) << 0) == val)
> +  if (val < 4096)
>      return val;
>
> -  return val & (0xfff << 12);
> +  return val & 0xfff000;
>  }
>
>
> @@ -6957,8 +6944,7 @@ aarch64_expand_mov_immediate (rtx dest, rtx imm)
>        return;
>      }
>
> -  aarch64_internal_mov_immediate (dest, imm, true,
> -                                 as_a <scalar_int_mode> (mode));
> +  aarch64_internal_mov_immediate (dest, imm, true, mode);
>  }
>
>  /* Return the MEM rtx that provides the canary value that should be used
> @@ -11130,9 +11116,7 @@ aarch64_float_const_rtx_p (rtx x)
>        && SCALAR_FLOAT_MODE_P (mode)
>        && aarch64_reinterpret_float_as_int (x, &ival))
>      {
> -      scalar_int_mode imode = (mode == HFmode
> -                              ? SImode
> -                              : int_mode_for_mode (mode).require ());
> +      machine_mode imode = (mode == DFmode) ? DImode : SImode;
>        int num_instr = aarch64_internal_mov_immediate
>                         (NULL_RTX, gen_int_mode (ival, imode), false, imode);
>        return num_instr < 3;
> @@ -13790,10 +13774,9 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>              proportionally expensive to the number of instructions
>              required to build that constant.  This is true whether we
>              are compiling for SPEED or otherwise.  */
> -         if (!is_a <scalar_int_mode> (mode, &int_mode))
> -           int_mode = word_mode;
> +         machine_mode imode = (mode == SImode) ? SImode : DImode;
>           *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
> -                                (NULL_RTX, x, false, int_mode));
> +                                (NULL_RTX, x, false, imode));
>         }
>        return true;
>
> @@ -13809,9 +13792,7 @@ aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
>           bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
>           gcc_assert (succeed);
>
> -         scalar_int_mode imode = (mode == HFmode
> -                                  ? SImode
> -                                  : int_mode_for_mode (mode).require ());
> +         machine_mode imode = (mode == DFmode) ? DImode : SImode;
>           int ncost = aarch64_internal_mov_immediate
>                 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
>           *cost += COSTS_N_INSNS (ncost);
> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
> index f2e3d905dbbeb2949f2947f5cfd68208c94c9272..604a67d87a7ef525efebed48f39de43c97f2a397 100644
> --- a/gcc/config/aarch64/aarch64.md
> +++ b/gcc/config/aarch64/aarch64.md
> @@ -1309,16 +1309,15 @@ (define_insn_and_split "*movsi_aarch64"
>  )
>
>  (define_insn_and_split "*movdi_aarch64"
> -  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
> -       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,N,M,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
> +  [(set (match_operand:DI 0 "nonimmediate_operand" "=r,k,r,r,r,r, r,w, m,m,   r,  r,  r, w,r,w, w")
> +       (match_operand:DI 1 "aarch64_mov_operand"  " r,r,k,O,n,Usv,m,m,rZ,w,Usw,Usa,Ush,rZ,w,w,Dd"))]
>    "(register_operand (operands[0], DImode)
>      || aarch64_reg_or_zero (operands[1], DImode))"
>    "@
>     mov\\t%x0, %x1
>     mov\\t%0, %x1
>     mov\\t%x0, %1
> -   mov\\t%x0, %1
> -   mov\\t%w0, %1
> +   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";
>     #
>     * return aarch64_output_sve_cnt_immediate (\"cnt\", \"%x0\", operands[1]);
>     ldr\\t%x0, %1
> @@ -1340,11 +1339,11 @@ (define_insn_and_split "*movdi_aarch64"
>         DONE;
>      }"
>    ;; The "mov_imm" type for CNTD is just a placeholder.
> -  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,mov_imm,
> +  [(set_attr "type" "mov_reg,mov_reg,mov_reg,mov_imm,mov_imm,mov_imm,
>                      load_8,load_8,store_8,store_8,load_8,adr,adr,f_mcr,f_mrc,
>                      fmov,neon_move")
> -   (set_attr "arch"   "*,*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
> -   (set_attr "length" "4,4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
> +   (set_attr "arch"   "*,*,*,*,*,sve,*,fp,*,fp,*,*,*,fp,fp,fp,simd")
> +   (set_attr "length" "4,4,4,4,*,  4,4, 4,4, 4,8,4,4, 4, 4, 4,   4")]
>  )
>
>  (define_insn "insv_imm<mode>"
> @@ -1508,7 +1507,7 @@ (define_insn "*mov<mode>_aarch64"
>
>  (define_insn "*mov<mode>_aarch64"
>    [(set (match_operand:DFD 0 "nonimmediate_operand" "=w, w  ,?r,w,w  ,w  ,w,m,r,m ,r,r")
> -       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,N"))]
> +       (match_operand:DFD 1 "general_operand"      "Y , ?rY, w,w,Ufc,Uvi,m,w,m,rY,r,O"))]
>    "TARGET_FLOAT && (register_operand (operands[0], <MODE>mode)
>      || aarch64_reg_or_fp_zero (operands[1], <MODE>mode))"
>    "@
> @@ -1523,7 +1522,7 @@ (define_insn "*mov<mode>_aarch64"
>     ldr\\t%x0, %1
>     str\\t%x1, %0
>     mov\\t%x0, %x1
> -   mov\\t%x0, %1"
> +   * return aarch64_zeroextended_move_imm (INTVAL (operands[1])) ? \"mov\\t%w0, %1\" : \"mov\\t%x0, %1\";"
>    [(set_attr "type" "neon_move,f_mcr,f_mrc,fmov,fconstd,neon_move,\
>                      f_loadd,f_stored,load_8,store_8,mov_reg,\
>                      fconstd")
> diff --git a/gcc/config/aarch64/constraints.md b/gcc/config/aarch64/constraints.md
> index ee7587cca1673208e2bfd6b503a21d0c8b69bf75..e91c7eab0b3674ca34ac2f790c38fcd27986c35f 100644
> --- a/gcc/config/aarch64/constraints.md
> +++ b/gcc/config/aarch64/constraints.md
> @@ -106,6 +106,12 @@ (define_constraint "M"
>
>  (define_constraint "N"
>   "A constant that can be used with a 64-bit MOV immediate operation."
> + (and (match_code "const_int")
> +      (match_test "aarch64_move_imm (ival, DImode)")
> +      (match_test "!aarch64_zeroextended_move_imm (ival)")))
> +
> +(define_constraint "O"
> + "A constant that can be used with a 32 or 64-bit MOV immediate operation."
>   (and (match_code "const_int")
>        (match_test "aarch64_move_imm (ival, DImode)")))

Re: [PATCH][AArch64] Cleanup move immediate code

Reply via email to