On Wed, Jul 21, 2021 at 12:42 PM Richard Sandiford <richard.sandif...@arm.com> wrote: > > Richard Sandiford <richard.sandif...@arm.com> writes: > > "H.J. Lu via Gcc-patches" <gcc-patches@gcc.gnu.org> writes: > >> On Wed, Jul 21, 2021 at 7:50 AM Richard Sandiford > >> <richard.sandif...@arm.com> wrote: > >>> > >>> "H.J. Lu" <hjl.to...@gmail.com> writes: > >>> > diff --git a/gcc/builtins.c b/gcc/builtins.c > >>> > index 39ab139b7e1..1972301ce3c 100644 > >>> > --- a/gcc/builtins.c > >>> > +++ b/gcc/builtins.c > >>> > @@ -3890,13 +3890,16 @@ expand_builtin_strnlen (tree exp, rtx target, > >>> > machine_mode target_mode) > >>> > > >>> > static rtx > >>> > builtin_memcpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >>> > - scalar_int_mode mode) > >>> > + fixed_size_mode mode) > >>> > { > >>> > /* The REPresentation pointed to by DATA need not be a nul-terminated > >>> > string but the caller guarantees it's large enough for MODE. */ > >>> > const char *rep = (const char *) data; > >>> > > >>> > - return c_readstr (rep + offset, mode, /*nul_terminated=*/false); > >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >>> > + the memset expander. */ > >>> > >>> Sorry to nitpick, but I guess this might get out out-of-date. Maybe: > >>> > >>> /* The by-pieces infrastructure does not try to pick a vector mode > >>> for memcpy expansion. */ > >> > >> Fixed. > >> > >>> > + return c_readstr (rep + offset, as_a <scalar_int_mode> (mode), > >>> > + /*nul_terminated=*/false); > >>> > } > >>> > > >>> > /* LEN specify length of the block of memcpy/memset operation. > >>> > @@ -6478,14 +6481,16 @@ expand_builtin_stpncpy (tree exp, rtx) > >>> > > >>> > rtx > >>> > builtin_strncpy_read_str (void *data, void *, HOST_WIDE_INT offset, > >>> > - scalar_int_mode mode) > >>> > + fixed_size_mode mode) > >>> > { > >>> > const char *str = (const char *) data; > >>> > > >>> > if ((unsigned HOST_WIDE_INT) offset > strlen (str)) > >>> > return const0_rtx; > >>> > > >>> > - return c_readstr (str + offset, mode); > >>> > + /* NB: Vector mode in the by-pieces infrastructure is only used by > >>> > + the memset expander. */ > >>> > >>> Similarly here for strncpy expansion. > >> > >> Fixed. > >> > >>> > + return c_readstr (str + offset, as_a <scalar_int_mode> (mode)); > >>> > } > >>> > > >>> > /* Helper to check the sizes of sequences and the destination of calls > >>> > @@ -6686,30 +6691,117 @@ expand_builtin_strncpy (tree exp, rtx target) > >>> > return NULL_RTX; > >>> > } > >>> > > >>> > -/* Callback routine for store_by_pieces. Read GET_MODE_BITSIZE (MODE) > >>> > - bytes from constant string DATA + OFFSET and return it as target > >>> > - constant. If PREV isn't nullptr, it has the RTL info from the > >>> > +/* Return the RTL of a register in MODE generated from PREV in the > >>> > previous iteration. */ > >>> > > >>> > -rtx > >>> > -builtin_memset_read_str (void *data, void *prevp, > >>> > - HOST_WIDE_INT offset ATTRIBUTE_UNUSED, > >>> > - scalar_int_mode mode) > >>> > +static rtx > >>> > +gen_memset_value_from_prev (by_pieces_prev *prev, fixed_size_mode mode) > >>> > { > >>> > - by_pieces_prev *prev = (by_pieces_prev *) prevp; > >>> > + rtx target = nullptr; > >>> > if (prev != nullptr && prev->data != nullptr) > >>> > { > >>> > /* Use the previous data in the same mode. */ > >>> > if (prev->mode == mode) > >>> > return prev->data; > >>> > + > >>> > + fixed_size_mode prev_mode = prev->mode; > >>> > + > >>> > + /* Don't use the previous data to write QImode if it is in a > >>> > + vector mode. */ > >>> > + if (VECTOR_MODE_P (prev_mode) && mode == QImode) > >>> > + return target; > >>> > + > >>> > + rtx prev_rtx = prev->data; > >>> > + > >>> > + if (REG_P (prev_rtx) > >>> > + && HARD_REGISTER_P (prev_rtx) > >>> > + && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >>> > + { > >>> > + /* If we can't put a hard register in MODE, first generate a > >>> > + subreg of word mode if the previous mode is wider than word > >>> > + mode and word mode is wider than MODE. */ > >>> > + if (UNITS_PER_WORD < GET_MODE_SIZE (prev_mode) > >>> > + && UNITS_PER_WORD > GET_MODE_SIZE (mode)) > >>> > + { > >>> > + prev_rtx = lowpart_subreg (word_mode, prev_rtx, > >>> > + prev_mode); > >>> > + if (prev_rtx != nullptr) > >>> > + prev_mode = word_mode; > >>> > + } > >>> > + else > >>> > + prev_rtx = nullptr; > >>> > >>> I don't understand this. Why not just do the: > >>> > >>> if (REG_P (prev_rtx) > >>> && HARD_REGISTER_P (prev_rtx) > >>> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >>> prev_rtx = copy_to_reg (prev_rtx); > >>> > >>> that I suggested in the previous review? > >> > >> But for > >> --- > >> extern void *ops; > >> > >> void > >> foo (int c) > >> { > >> __builtin_memset (ops, c, 18); > >> } > >> --- > >> I got > >> > >> vpbroadcastb %edi, %xmm31 > >> vmovdqa64 %xmm31, -24(%rsp) > >> movq ops(%rip), %rax > >> movzwl -24(%rsp), %edx > >> vmovdqu8 %xmm31, (%rax) > >> movw %dx, 16(%rax) > >> ret > >> > >> I want to avoid store and load. I am testing > >> > >> if (REG_P (prev_rtx) > >> && HARD_REGISTER_P (prev_rtx) > >> && lowpart_subreg_regno (REGNO (prev_rtx), prev_mode, mode) < 0) > >> { > >> /* Find the smallest subreg mode in the same mode class which > >> is not narrower than MODE and narrower than PREV_MODE. */ > >> machine_mode m; > >> fixed_size_mode candidate; > >> FOR_EACH_MODE_IN_CLASS (m, GET_MODE_CLASS (mode)) > >> if (is_a<fixed_size_mode> (m, &candidate)) > >> { > >> if (GET_MODE_SIZE (candidate) > >> >= GET_MODE_SIZE (prev_mode)) > >> break; > >> if (GET_MODE_SIZE (candidate) >= GET_MODE_SIZE (mode) > >> && lowpart_subreg_regno (REGNO (prev_rtx), > >> prev_mode, candidate) >= 0) > >> { > >> target = lowpart_subreg (candidate, prev_rtx, > >> prev_mode); > >> prev_rtx = target; > >> prev_mode = candidate; > >> break; > >> } > >> } > > > > That doesn't seem better though. There are still two subregs involved. > > Actually, I take that back. I guess it does make sense, and is > definitely better than hard-coding word_mode. How about a comment > along the lines of the following, instead of the one above: > > /* This case occurs when PREV_MODE is a vector and when > MODE is too small to store using vector operations. > After register allocation, the code will need to move the > lowpart of the vector register into a non-vector register. > > Also, the target has chosen to use a hard register > instead of going with the default choice of using a > pseudo register. We should respect that choice and try to > avoid creating a pseudo register with the same mode as the > current hard register. > > In principle, we could just use a lowpart MODE subreg of > the vector register. However, the vector register mode might > be too wide for non-vector registers, and we already know > that the non-vector mode is too small for vector registers. > It's therefore likely that we'd need to spill to memory in > the vector mode and reload the non-vector value from there. > > Try to avoid that by reducing the vector register to the > smallest size that it can hold. This should increase the > chances that non-vector registers can hold both the inner > and outer modes of the subreg that we generate later. */
Fixed. > Thanks, > Richard I sent the v4 patch. Thanks. -- H.J.