On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> "H.J. Lu" <hjl.to...@gmail.com> writes:
> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> > index 39ab139b7e1..1972301ce3c 100644
> > --- a/gcc/builtins.c
> > +++ b/gcc/builtins.c
> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, 
> > machine_mode target_mode)
> >
> >  static rtx
> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > -                      scalar_int_mode mode)
> > +                      fixed_size_mode mode)
> >  {
> >    /* The REPresentation pointed to by DATA need not be a nul-terminated
> >       string but the caller guarantees it's large enough for MODE.  */
> >    const char *rep = (const char *) data;
> >
> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> > +     the memset expander.  */
>
> Sorry to nitpick, but I guess this might get out out-of-date.  Maybe:
>
>   /* The by-pieces infrastructure does not try to pick a vector mode
>      for memcpy expansion.  */

Fixed.

> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> > +                 /*nul_terminated=*/false);
> >  }
> >
> >  /* LEN specify length of the block of memcpy/memset operation.
> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
> >
> >  rtx
> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> > -                       scalar_int_mode mode)
> > +                       fixed_size_mode mode)
> >  {
> >    const char *str = (const char *) data;
> >
> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> >      return const0_rtx;
> >
> > -  return c_readstr (str + offset, mode);
> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> > +     the memset expander.  */
>
> Similarly here for strncpy expansion.

Fixed.

> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> >  }
> >
> >  /* Helper to check the sizes of sequences and the destination of calls
> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
> >    return NULL_RTX;
> >  }
> >
> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> > -   bytes from constant string DATA + OFFSET and return it as target
> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> > +/* Return the RTL of a register in MODE generated from PREV in the
> >     previous iteration.  */
> >
> > -rtx
> > -builtin_memset_read_str (void *data, void *prevp,
> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> > -                      scalar_int_mode mode)
> > +static rtx
> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
> >  {
> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> > +  rtx target = nullptr;
> >    if (prev != nullptr && prev->data != nullptr)
> >      {
> >        /* Use the previous data in the same mode.  */
> >        if (prev->mode == mode)
> >       return prev->data;
> > +
> > +      fixed_size_mode prev_mode = prev->mode;
> > +
> > +      /* Don't use the previous data to write QImode if it is in a
> > +      vector mode.  */
> > +      if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> > +     return target;
> > +
> > +      rtx prev_rtx = prev->data;
> > +
> > +      if (REG_P (prev_rtx)
> > +       && HARD_REGISTER_P (prev_rtx)
> > +       && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> > +     {
> > +       /* If we can't put a hard register in MODE, first generate a
> > +          subreg of word mode if the previous mode is wider than word
> > +          mode and word mode is wider than MODE.  */
> > +       if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> > +           && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> > +         {
> > +           prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> > +                                      prev_mode);
> > +           if (prev_rtx != nullptr)
> > +             prev_mode = word_mode;
> > +         }
> > +       else
> > +         prev_rtx = nullptr;
>
> I don't understand this.  Why not just do the:
>
>       if (REG_P (prev_rtx)
>           && HARD_REGISTER_P (prev_rtx)
>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
>         prev_rtx = copy_to_reg (prev_rtx);
>
> that I suggested in the previous review?

But for
---
extern void *ops;

void
foo (int c)
{
  __builtin_memset (ops, c, 18);
}
---
I got

vpbroadcastb %edi, %xmm31
vmovdqa64 %xmm31, -24(%rsp)
movq ops(%rip), %rax
movzwl -24(%rsp), %edx
vmovdqu8 %xmm31, (%rax)
movw %dx, 16(%rax)
ret

I want to avoid store and load.  I am testing

      if (REG_P (prev_rtx)
          && HARD_REGISTER_P (prev_rtx)
          && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
        {
          /* Find the smallest subreg mode in the same mode class which
             is not narrower than MODE and narrower than PREV_MODE.  */
          machine_mode m;
          fixed_size_mode candidate;
          FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
            if (is_a<fixed_size_mode> (m, &candidate))
              {
                if (GET_MODE_SIZE (candidate)
                    >= GET_MODE_SIZE (prev_mode))
                  break;
                if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
                    && lowpart_subreg_regno (REGNO (prev_rtx),
                                             prev_mode, candidate) >= 0)
                  {
                    target = lowpart_subreg (candidate, prev_rtx,
                                             prev_mode);
                    prev_rtx = target;
                    prev_mode = candidate;
                    break;
                  }
              }
          if (target == nullptr)
            prev_rtx = copy_to_reg (prev_rtx);
        }

to get

movq ops(%rip), %rax
vpbroadcastb %edi, %xmm31
vmovd %xmm31, %edx
movw %dx, 16(%rax)
vmovdqu8 %xmm31, (%rax)
ret


> IMO anything that relies on a sequence of two subreg operations is
> doing something wrong.
>
> > +     }
> > +      if (prev_rtx != nullptr)
> > +     target = lowpart_subreg (mode, prev_rtx, prev_mode);
> >      }
> > +  return target;
> > +}
> > +
> > […]
> > @@ -769,21 +769,41 @@ alignment_for_piecewise_move (unsigned int 
> > max_pieces, unsigned int align)
> >    return align;
> >  }
> >
> > -/* Return the widest integer mode that is narrower than SIZE bytes.  */
> > +/* Return the widest QI vector, if QI_MODE is true, or integer mode
> > +   that is narrower than SIZE bytes.  */
> >
> > -static scalar_int_mode
> > -widest_int_mode_for_size (unsigned int size)
> > +static fixed_size_mode
> > +widest_fixed_size_mode_for_size (unsigned int size, bool qi_vector)
> >  {
> > -  scalar_int_mode result = NARROWEST_INT_MODE;
> > +  machine_mode result = NARROWEST_INT_MODE;
> >
> >    gcc_checking_assert (size > 1);
> >
> > +  /* Use QI vector only if size is wider than a WORD.  */
> > +  if (qi_vector && size > UNITS_PER_WORD)
> > +    {
> > +      machine_mode mode;
> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > +     if (GET_MODE_INNER (mode) == QImode
> > +         && GET_MODE_SIZE (mode).is_constant ())
> > +       {
> > +         if (GET_MODE_SIZE (mode).to_constant () >= size)
> > +           break;
> > +         if (optab_handler (vec_duplicate_optab, mode)
> > +             != CODE_FOR_nothing)
> > +           result = mode;
> > +       }
> > +
> > +      if (result != NARROWEST_INT_MODE)
> > +     return as_a <fixed_size_mode> (result);
> > +    }
> > +
> >    opt_scalar_int_mode tmode;
> >    FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
> >      if (GET_MODE_SIZE (tmode.require ()) < size)
> >        result = tmode.require ();
> >
> > -  return result;
> > +  return as_a <fixed_size_mode> (result);
> >  }
>
> A less cast-heavy way to write this would be:
>
>   fixed_size_mode result = NARROWEST_INT_MODE;
>
>   gcc_checking_assert (size > 1);
>
>   /* Use QI vector only if size is wider than a WORD.  */
>   if (qi_vector && size > UNITS_PER_WORD)
>     {
>       machine_mode mode;
>       fixed_size_mode candidate;
>       FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
>         if (is_a<fixed_size_mode> (mode, &candidate)
>             && GET_MODE_INNER (candidate) == QImode)
>           {
>             if (GET_MODE_SIZE (candidate) >= size)
>               break;
>             if (optab_handler (vec_duplicate_optab, candidate)
>                 != CODE_FOR_nothing)
>               result = candidate;
>           }
>
>       if (result != NARROWEST_INT_MODE)
>         return result;
>     }

Fixed.

>   opt_scalar_int_mode tmode;
>   FOR_EACH_MODE_IN_CLASS (tmode, MODE_INT)
>     if (GET_MODE_SIZE (tmode.require ()) < size)
>       result = tmode.require ();
>
>   return result;
>
> > @@ -1148,13 +1178,43 @@ op_by_pieces_d::get_usable_mode (scalar_int_mode 
> > mode, unsigned int len)
> >        size = GET_MODE_SIZE (mode);
> >        if (len >= size && prepare_mode (mode, m_align))
> >       break;
> > -      /* NB: widest_int_mode_for_size checks SIZE > 1.  */
> > -      mode = widest_int_mode_for_size (size);
> > +      /* NB: widest_fixed_size_mode_for_size checks SIZE > 1.  */
> > +      mode = widest_fixed_size_mode_for_size (size, m_qi_vector_mode);
> >      }
> >    while (1);
> >    return mode;
> >  }
> >
> > +/* Return the smallest integer or QI vector mode that is not narrower
> > +   than SIZE bytes.  */
> > +
> > +fixed_size_mode
> > +op_by_pieces_d::smallest_fixed_size_mode_for_size (unsigned int size)
> > +{
> > +  /* Use QI vector only for > size of WORD.  */
> > +  if (m_qi_vector_mode && size > UNITS_PER_WORD)
> > +    {
> > +      machine_mode mode;
> > +      FOR_EACH_MODE_IN_CLASS (mode, MODE_VECTOR_INT)
> > +     if (GET_MODE_INNER (mode) == QImode
> > +         && GET_MODE_SIZE (mode).is_constant ())
> > +       {
> > +         unsigned int mode_size = GET_MODE_SIZE (mode).to_constant ();
> > +
> > +         /* NB: Don't return a mode wider than M_LEN.  */
> > +         if (mode_size > m_len)
> > +           break;
> > +
> > +         if (mode_size >= size
> > +             && (optab_handler (vec_duplicate_optab, mode)
> > +                 != CODE_FOR_nothing))
> > +           return as_a <fixed_size_mode> (mode);
> > +       }
> > +    }
>
> Same idea here.

Fixed.

> I realise this is pre-existing, but the heavy use of “NB:” seems a bit odd.
> I don't think it's something that the patch needs to adopt.

Noted.

> Otherwise it looks good, thanks.  I think the main sticking point is
> the subreg thing.
>
> Richard
>
>

Thanks.

-- 
H.J.

Reply via email to