On Wed, Jul 21, 2021 at 12:42 PM Richard Sandiford
<richard.sandif...@arm.com> wrote:
>
> Richard Sandiford <richard.sandif...@arm.com> writes:
> > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes:
> >> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford
> >> <richard.sandif...@arm.com> wrote:
> >>>
> >>> "H.J. Lu" <hjl.to...@gmail.com> writes:
> >>> > diff --git a/gcc/builtins.c b/gcc/builtins.c
> >>> > index 39ab139b7e1..1972301ce3c 100644
> >>> > --- a/gcc/builtins.c
> >>> > +++ b/gcc/builtins.c
> >>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, 
> >>> > machine_mode target_mode)
> >>> >
> >>> >  static rtx
> >>> >  builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >>> > -                      scalar_int_mode mode)
> >>> > +                      fixed_size_mode mode)
> >>> >  {
> >>> >    /* The REPresentation pointed to by DATA need not be a nul-terminated
> >>> >       string but the caller guarantees it's large enough for MODE.  */
> >>> >    const char *rep = (const char *) data;
> >>> >
> >>> > -  return c_readstr (rep + offset, mode, /*nul_terminated=*/false);
> >>> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> >>> > +     the memset expander.  */
> >>>
> >>> Sorry to nitpick, but I guess this might get out out-of-date.  Maybe:
> >>>
> >>>   /* The by-pieces infrastructure does not try to pick a vector mode
> >>>      for memcpy expansion.  */
> >>
> >> Fixed.
> >>
> >>> > +  return c_readstr (rep + offset, as_a <scalar_int_mode> (mode),
> >>> > +                 /*nul_terminated=*/false);
> >>> >  }
> >>> >
> >>> >  /* LEN specify length of the block of memcpy/memset operation.
> >>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx)
> >>> >
> >>> >  rtx
> >>> >  builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset,
> >>> > -                       scalar_int_mode mode)
> >>> > +                       fixed_size_mode mode)
> >>> >  {
> >>> >    const char *str = (const char *) data;
> >>> >
> >>> >    if ((unsigned HOST_WIDE_INT) offset > strlen (str))
> >>> >      return const0_rtx;
> >>> >
> >>> > -  return c_readstr (str + offset, mode);
> >>> > +  /* NB: Vector mode in the by-pieces infrastructure is only used by
> >>> > +     the memset expander.  */
> >>>
> >>> Similarly here for strncpy expansion.
> >>
> >> Fixed.
> >>
> >>> > +  return c_readstr (str + offset, as_a <scalar_int_mode> (mode));
> >>> >  }
> >>> >
> >>> >  /* Helper to check the sizes of sequences and the destination of calls
> >>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target)
> >>> >    return NULL_RTX;
> >>> >  }
> >>> >
> >>> > -/* Callback routine for store_by_pieces.  Read GET_MODE_BITSIZE (MODE)
> >>> > -   bytes from constant string DATA + OFFSET and return it as target
> >>> > -   constant.  If PREV isn't nullptr, it has the RTL info from the
> >>> > +/* Return the RTL of a register in MODE generated from PREV in the
> >>> >     previous iteration.  */
> >>> >
> >>> > -rtx
> >>> > -builtin_memset_read_str (void *data, void *prevp,
> >>> > -                      HOST_WIDE_INT offset ATTRIBUTE_UNUSED,
> >>> > -                      scalar_int_mode mode)
> >>> > +static rtx
> >>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode)
> >>> >  {
> >>> > -  by_pieces_prev *prev = (by_pieces_prev *) prevp;
> >>> > +  rtx target = nullptr;
> >>> >    if (prev != nullptr && prev->data != nullptr)
> >>> >      {
> >>> >        /* Use the previous data in the same mode.  */
> >>> >        if (prev->mode == mode)
> >>> >       return prev->data;
> >>> > +
> >>> > +      fixed_size_mode prev_mode = prev->mode;
> >>> > +
> >>> > +      /* Don't use the previous data to write QImode if it is in a
> >>> > +      vector mode.  */
> >>> > +      if (VECTOR_MODE_P (prev_mode) && mode == QImode)
> >>> > +     return target;
> >>> > +
> >>> > +      rtx prev_rtx = prev->data;
> >>> > +
> >>> > +      if (REG_P (prev_rtx)
> >>> > +       && HARD_REGISTER_P (prev_rtx)
> >>> > +       && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>> > +     {
> >>> > +       /* If we can't put a hard register in MODE, first generate a
> >>> > +          subreg of word mode if the previous mode is wider than word
> >>> > +          mode and word mode is wider than MODE.  */
> >>> > +       if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode)
> >>> > +           && UNITS_PER_WORD > GET_MODE_SIZE (mode))
> >>> > +         {
> >>> > +           prev_rtx = lowpart_subreg (word_mode, prev_rtx,
> >>> > +                                      prev_mode);
> >>> > +           if (prev_rtx != nullptr)
> >>> > +             prev_mode = word_mode;
> >>> > +         }
> >>> > +       else
> >>> > +         prev_rtx = nullptr;
> >>>
> >>> I don't understand this.  Why not just do the:
> >>>
> >>>       if (REG_P (prev_rtx)
> >>>           && HARD_REGISTER_P (prev_rtx)
> >>>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>>         prev_rtx = copy_to_reg (prev_rtx);
> >>>
> >>> that I suggested in the previous review?
> >>
> >> But for
> >> ---
> >> extern void *ops;
> >>
> >> void
> >> foo (int c)
> >> {
> >>   __builtin_memset (ops, c, 18);
> >> }
> >> ---
> >> I got
> >>
> >> vpbroadcastb %edi, %xmm31
> >> vmovdqa64 %xmm31, -24(%rsp)
> >> movq ops(%rip), %rax
> >> movzwl -24(%rsp), %edx
> >> vmovdqu8 %xmm31, (%rax)
> >> movw %dx, 16(%rax)
> >> ret
> >>
> >> I want to avoid store and load.  I am testing
> >>
> >>       if (REG_P (prev_rtx)
> >>           && HARD_REGISTER_P (prev_rtx)
> >>           && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0)
> >>         {
> >>           /* Find the smallest subreg mode in the same mode class which
> >>              is not narrower than MODE and narrower than PREV_MODE.  */
> >>           machine_mode m;
> >>           fixed_size_mode candidate;
> >>           FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode))
> >>             if (is_a<fixed_size_mode> (m, &candidate))
> >>               {
> >>                 if (GET_MODE_SIZE (candidate)
> >>                     >= GET_MODE_SIZE (prev_mode))
> >>                   break;
> >>                 if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode)
> >>                     && lowpart_subreg_regno (REGNO (prev_rtx),
> >>                                              prev_mode, candidate) >= 0)
> >>                   {
> >>                     target = lowpart_subreg (candidate, prev_rtx,
> >>                                              prev_mode);
> >>                     prev_rtx = target;
> >>                     prev_mode = candidate;
> >>                     break;
> >>                   }
> >>               }
> >
> > That doesn't seem better though.  There are still two subregs involved.
>
> Actually, I take that back.  I guess it does make sense, and is
> definitely better than hard-coding word_mode.  How about a comment
> along the lines of the following, instead of the one above:
>
>            /* This case occurs when PREV_MODE is a vector and when
>               MODE is too small to store using vector operations.
>               After register allocation, the code will need to move the
>               lowpart of the vector register into a non-vector register.
>
>               Also, the target has chosen to use a hard register
>               instead of going with the default choice of using a
>               pseudo register.  We should respect that choice and try to
>               avoid creating a pseudo register with the same mode as the
>               current hard register.
>
>               In principle, we could just use a lowpart MODE subreg of
>               the vector register.  However, the vector register mode might
>               be too wide for non-vector registers, and we already know
>               that the non-vector mode is too small for vector registers.
>               It's therefore likely that we'd need to spill to memory in
>               the vector mode and reload the non-vector value from there.
>
>               Try to avoid that by reducing the vector register to the
>               smallest size that it can hold.  This should increase the
>               chances that non-vector registers can hold both the inner
>               and outer modes of the subreg that we generate later.  */

Fixed.

> Thanks,
> Richard

I sent the v4 patch.

Thanks.

-- 
H.J.

Reply via email to