On Wed, 16 Sep 2020, Jakub Jelinek wrote:
> Hi!
>
> As the testcases show, if we have something like:
> MEM <char[12]> [&b + 8B] = {};
> MEM[(short *) &b] = 5;
> _5 = *x_4(D);
> MEM <long long unsigned int> [&b + 2B] = _5;
> MEM[(char *)&b + 16B] = 88;
> MEM[(int *)&b + 20B] = 1;
> then in sort_by_bitpos the stores are almost like in the given order,
> except the first store is after the = _5; store.
> We can't coalesce the = 5; store with = _5;, because the latter is MEM_REF,
> while the former INTEGER_CST, and we can't coalesce the = _5 store with
> the = {} store because the former is MEM_REF, the latter INTEGER_CST.
> But we happily coalesce the remaining 3 stores, which is wrong, because the
> = _5; store overlaps those and is in between them in the program order.
> We already have code to deal with similar cases in check_no_overlap, but we
> deal only with the following stores in sort_by_bitpos order, not the earlier
> ones.
>
> The following patch checks also the earlier ones. In
> coalesce_immediate_stores
> it computes the first one that needs to be checked (all the ones whose
> bitpos + bitsize is smaller or equal to merged_store->start don't need to be
> checked and don't need to be checked even for any following attempts because
> of the sort_by_bitpos sorting) and the end of that (that is the first store
> in the merged_store).
>
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?
OK.
Thanks,
Richard.
> 2020-09-16 Jakub Jelinek <[email protected]>
>
> PR tree-optimization/97053
> * gimple-ssa-store-merging.c (check_no_overlap): Add FIRST_ORDER,
> START, FIRST_EARLIER and LAST_EARLIER arguments. Return false if
> any stores between FIRST_EARLIER inclusive and LAST_EARLIER exclusive
> has order in between FIRST_ORDER and LAST_ORDER and overlaps the to
> be merged store.
> (imm_store_chain_info::try_coalesce_bswap): Add FIRST_EARLIER argument.
> Adjust check_no_overlap caller.
> (imm_store_chain_info::coalesce_immediate_stores): Add first_earlier
> and last_earlier variables, adjust them during iterations. Adjust
> check_no_overlap callers, call check_no_overlap even when extending
> overlapping stores by extra INTEGER_CST stores.
>
> * gcc.dg/store_merging_31.c: New test.
> * gcc.dg/store_merging_32.c: New test.
>
> --- gcc/gimple-ssa-store-merging.c.jj 2020-08-12 12:45:46.000000000 +0200
> +++ gcc/gimple-ssa-store-merging.c 2020-09-15 16:51:11.393453396 +0200
> @@ -2116,7 +2116,8 @@ public:
> }
> }
> bool terminate_and_process_chain ();
> - bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int);
> + bool try_coalesce_bswap (merged_store_group *, unsigned int, unsigned int,
> + unsigned int);
> bool coalesce_immediate_stores ();
> bool output_merged_store (merged_store_group *);
> bool output_merged_stores ();
> @@ -2443,14 +2444,39 @@ gather_bswap_load_refs (vec<tree> *refs,
> into the group. That way it will be its own store group and will
> not be touched. If ALL_INTEGER_CST_P and there are overlapping
> INTEGER_CST stores, those are mergeable using merge_overlapping,
> - so don't return false for those. */
> + so don't return false for those.
> +
> + Similarly, check stores from FIRST_EARLIER (inclusive) to END_EARLIER
> + (exclusive), whether they don't overlap the bitrange START to END
> + and have order in between FIRST_ORDER and LAST_ORDER. This is to
> + prevent merging in cases like:
> + MEM <char[12]> [&b + 8B] = {};
> + MEM[(short *) &b] = 5;
> + _5 = *x_4(D);
> + MEM <long long unsigned int> [&b + 2B] = _5;
> + MEM[(char *)&b + 16B] = 88;
> + MEM[(int *)&b + 20B] = 1;
> + The = {} store comes in sort_by_bitpos before the = 88 store, and can't
> + be merged with it, because the = _5 store overlaps these and is in between
> + them in sort_by_order ordering. If it was merged, the merged store would
> + go after the = _5 store and thus change behavior. */
>
> static bool
> check_no_overlap (vec<store_immediate_info *> m_store_info, unsigned int i,
> - bool all_integer_cst_p, unsigned int last_order,
> - unsigned HOST_WIDE_INT end)
> + bool all_integer_cst_p, unsigned int first_order,
> + unsigned int last_order, unsigned HOST_WIDE_INT start,
> + unsigned HOST_WIDE_INT end, unsigned int first_earlier,
> + unsigned end_earlier)
> {
> unsigned int len = m_store_info.length ();
> + for (unsigned int j = first_earlier; j < end_earlier; j++)
> + {
> + store_immediate_info *info = m_store_info[j];
> + if (info->order > first_order
> + && info->order < last_order
> + && info->bitpos + info->bitsize > start)
> + return false;
> + }
> for (++i; i < len; ++i)
> {
> store_immediate_info *info = m_store_info[i];
> @@ -2471,7 +2497,8 @@ check_no_overlap (vec<store_immediate_in
> bool
> imm_store_chain_info::try_coalesce_bswap (merged_store_group *merged_store,
> unsigned int first,
> - unsigned int try_size)
> + unsigned int try_size,
> + unsigned int first_earlier)
> {
> unsigned int len = m_store_info.length (), last = first;
> unsigned HOST_WIDE_INT width = m_store_info[first]->bitsize;
> @@ -2611,7 +2638,8 @@ imm_store_chain_info::try_coalesce_bswap
> if (n.base_addr == NULL_TREE && !is_gimple_val (n.src))
> return false;
>
> - if (!check_no_overlap (m_store_info, last, false, last_order, end))
> + if (!check_no_overlap (m_store_info, last, false, first_order, last_order,
> + merged_store->start, end, first_earlier, first))
> return false;
>
> /* Don't handle memory copy this way if normal non-bswap processing
> @@ -2703,6 +2731,8 @@ imm_store_chain_info::coalesce_immediate
>
> store_immediate_info *info;
> unsigned int i, ignore = 0;
> + unsigned int first_earlier = 0;
> + unsigned int end_earlier = 0;
>
> /* Order the stores by the bitposition they write to. */
> m_store_info.qsort (sort_by_bitpos);
> @@ -2719,6 +2749,12 @@ imm_store_chain_info::coalesce_immediate
> if (i <= ignore)
> goto done;
>
> + while (first_earlier < end_earlier
> + && (m_store_info[first_earlier]->bitpos
> + + m_store_info[first_earlier]->bitsize
> + <= merged_store->start))
> + first_earlier++;
> +
> /* First try to handle group of stores like:
> p[0] = data >> 24;
> p[1] = data >> 16;
> @@ -2733,7 +2769,8 @@ imm_store_chain_info::coalesce_immediate
> {
> unsigned int try_size;
> for (try_size = 64; try_size >= 16; try_size >>= 1)
> - if (try_coalesce_bswap (merged_store, i - 1, try_size))
> + if (try_coalesce_bswap (merged_store, i - 1, try_size,
> + first_earlier))
> break;
>
> if (try_size >= 16)
> @@ -2741,7 +2778,10 @@ imm_store_chain_info::coalesce_immediate
> ignore = i + merged_store->stores.length () - 1;
> m_merged_store_groups.safe_push (merged_store);
> if (ignore < m_store_info.length ())
> - merged_store = new merged_store_group (m_store_info[ignore]);
> + {
> + merged_store = new merged_store_group (m_store_info[ignore]);
> + end_earlier = ignore;
> + }
> else
> merged_store = NULL;
> goto done;
> @@ -2776,12 +2816,16 @@ imm_store_chain_info::coalesce_immediate
> && merged_store->only_constants
> && info->lp_nr == merged_store->lp_nr)
> {
> + unsigned int first_order
> + = MIN (merged_store->first_order, info->order);
> unsigned int last_order
> = MAX (merged_store->last_order, info->order);
> unsigned HOST_WIDE_INT end
> = MAX (merged_store->start + merged_store->width,
> info->bitpos + info->bitsize);
> - if (check_no_overlap (m_store_info, i, true, last_order, end))
> + if (check_no_overlap (m_store_info, i, true, first_order,
> + last_order, merged_store->start, end,
> + first_earlier, end_earlier))
> {
> /* check_no_overlap call above made sure there are no
> overlapping stores with non-INTEGER_CST rhs_code
> @@ -2810,6 +2854,7 @@ imm_store_chain_info::coalesce_immediate
> do
> {
> unsigned int max_order = 0;
> + unsigned int min_order = first_order;
> unsigned first_nonmergeable_int_order = ~0U;
> unsigned HOST_WIDE_INT this_end = end;
> k = i;
> @@ -2836,6 +2881,7 @@ imm_store_chain_info::coalesce_immediate
> break;
> }
> k = j;
> + min_order = MIN (min_order, info2->order);
> this_end = MAX (this_end,
> info2->bitpos + info2->bitsize);
> }
> @@ -2852,6 +2898,12 @@ imm_store_chain_info::coalesce_immediate
> first_nonmergeable_order
> = MIN (first_nonmergeable_order, info2->order);
> }
> + if (k > i
> + && !check_no_overlap (m_store_info, len - 1, true,
> + min_order, try_order,
> + merged_store->start, this_end,
> + first_earlier, end_earlier))
> + k = 0;
> if (k == 0)
> {
> if (last_order == try_order)
> @@ -2937,9 +2989,12 @@ imm_store_chain_info::coalesce_immediate
> info->ops_swapped_p = true;
> }
> if (check_no_overlap (m_store_info, i, false,
> + MIN (merged_store->first_order, info->order),
> MAX (merged_store->last_order, info->order),
> + merged_store->start,
> MAX (merged_store->start + merged_store->width,
> - info->bitpos + info->bitsize)))
> + info->bitpos + info->bitsize),
> + first_earlier, end_earlier))
> {
> /* Turn MEM_REF into BIT_INSERT_EXPR for bit-field stores. */
> if (info->rhs_code == MEM_REF && infof->rhs_code != MEM_REF)
> @@ -2985,6 +3040,7 @@ imm_store_chain_info::coalesce_immediate
> delete merged_store;
>
> merged_store = new merged_store_group (info);
> + end_earlier = i;
> if (dump_file && (dump_flags & TDF_DETAILS))
> fputs ("New store group\n", dump_file);
>
> --- gcc/testsuite/gcc.dg/store_merging_31.c.jj 2020-09-15
> 16:18:47.451623297 +0200
> +++ gcc/testsuite/gcc.dg/store_merging_31.c 2020-09-15 16:18:31.460846430
> +0200
> @@ -0,0 +1,27 @@
> +/* PR tree-optimization/97053 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +
> +struct S { short a; char b[9]; int c; char d; int e; };
> +
> +__attribute__((noipa)) void
> +foo (char *x, char *y)
> +{
> + if (__builtin_strcmp (x, "ABCDXXXX") != 0
> + || __builtin_strcmp (y, "ABCDXXXX") != 0)
> + __builtin_abort ();
> +}
> +
> +int
> +main ()
> +{
> + char a[9] = "XXXXXXXX";
> + struct S b = {};
> + __builtin_memcpy (a, "ABCD", 4);
> + b.a = 5;
> + __builtin_memcpy (b.b, a, 8);
> + b.d = 'X';
> + b.e = 1;
> + foo (a, b.b);
> + return 0;
> +}
> --- gcc/testsuite/gcc.dg/store_merging_32.c.jj 2020-09-15
> 16:18:51.381568453 +0200
> +++ gcc/testsuite/gcc.dg/store_merging_32.c 2020-09-15 15:38:27.827420970
> +0200
> @@ -0,0 +1,129 @@
> +/* PR tree-optimization/97053 */
> +/* { dg-do run } */
> +/* { dg-options "-O2 -fno-tree-dse" } */
> +
> +struct __attribute__((packed, may_alias)) S { long long s; };
> +struct __attribute__((packed, may_alias)) T { short t; };
> +
> +__attribute__((noipa)) void
> +test (char *p, char *q, int s)
> +{
> + if ((s & 1) == 0)
> + {
> + if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
> + != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
> + || (((struct S __attribute__((may_alias)) *) &p[1])->s
> + != ((struct S __attribute__((may_alias)) *) &q[1])->s)
> + || (*(short __attribute__((may_alias)) *) &p[2 * sizeof (short)]
> + != *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)]))
> + __builtin_abort ();
> + }
> + else
> + {
> + if (*(short __attribute__((may_alias)) *) &p[sizeof (short)]
> + != *(short __attribute__((may_alias)) *) &q[sizeof (short)]
> + || (((struct S __attribute__((may_alias)) *) &p[1])->s
> + != ((struct S __attribute__((may_alias)) *) &q[1])->s)
> + || (((struct T __attribute__((may_alias)) *) &p[2 * sizeof (short) -
> 1])->t
> + != ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short)
> - 1])->t)
> + || p[3 * sizeof (short) - 2] != q[3 * sizeof (short) - 2])
> + __builtin_abort ();
> + }
> +}
> +
> +__attribute__((noipa)) void
> +foo (long long *p, char *q, char *r, char *s)
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
> + *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &s[2 * sizeof (short)] = 2;
> + test (a, q, 0);
> +}
> +
> +__attribute__((noipa)) void
> +bar (long long *p, char *q, char *r, char *s, char *t)
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + ((struct T __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1])->t =
> 2;
> + a[3 * sizeof (short) - 2] = 3;
> + *(short __attribute__((may_alias)) *) &q[sizeof (short)] = 1;
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + ((struct T __attribute__((may_alias)) *) &s[2 * sizeof (short) - 1])->t =
> 2;
> + t[3 * sizeof (short) - 2] = 3;
> + test (a, q, 1);
> +}
> +
> +__attribute__((noipa)) void
> +baz (long long *p, char *q, char *r, char *s)
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = 2;
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = 2;
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = 1;
> + test (a, q, 2);
> +}
> +
> +__attribute__((noipa)) void
> +qux (long long *p, char *q, char *r, char *s, char *t)
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short) - 1] = 2;
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + a[3 * sizeof (short) - 2] = 3;
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = 1;
> + ((struct T __attribute__((may_alias)) *) &q[2 * sizeof (short) - 1])->t =
> 2;
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + s[3 * sizeof (short) - 2] = 3;
> + ((struct T __attribute__((may_alias)) *) &t[sizeof (short)])->t = 1;
> + test (a, q, 3);
> +}
> +
> +__attribute__((noipa)) void
> +corge (long long *p, char *q, char *r, char *s, short u[3])
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
> + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
> + test (a, q, 4);
> +}
> +
> +__attribute__((noipa)) void
> +garply (long long *p, char *q, char *r, char *s, short u[3])
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + *(short __attribute__((may_alias)) *) &a[sizeof (short)] = u[1];
> + ((struct S __attribute__((may_alias)) *) &a[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &a[2 * sizeof (short)] = u[2];
> + *(short __attribute__((may_alias)) *) &s[sizeof (short)] = u[1];
> + ((struct S __attribute__((may_alias)) *) &r[1])->s = p[0];
> + *(short __attribute__((may_alias)) *) &q[2 * sizeof (short)] = u[2];
> + test (a, q, 6);
> +}
> +
> +int
> +main ()
> +{
> + char a[64] __attribute__((aligned (__alignof (short))));
> + long long p = -1LL;
> + short u[] = { 1, 2, 3 };
> + foo (&p, &a[0], &a[0], &a[0]);
> + bar (&p, &a[0], &a[0], &a[0], &a[0]);
> + baz (&p, &a[0], &a[0], &a[0]);
> + qux (&p, &a[0], &a[0], &a[0], &a[0]);
> + corge (&p, &a[0], &a[0], &a[0], u);
> + garply (&p, &a[0], &a[0], &a[0], u);
> + return 0;
> +}
>
> Jakub
>
>
--
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)