https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115534

            Bug ID: 115534
           Summary: intermediate stack use not eliminated
           Product: gcc
           Version: 15.0
            Status: UNCONFIRMED
          Keywords: missed-optimization
          Severity: normal
          Priority: P3
         Component: tree-optimization
          Assignee: unassigned at gcc dot gnu.org
          Reporter: tnfchris at gcc dot gnu.org
  Target Milestone: ---

Consider the following example:

#include <stdint.h>

typedef struct _pixel_t
{
  double red, green, blue, opacity;
} pixel_t;

typedef struct _PixelPacket
{
  unsigned short blue, green, red, opacity;
} PixelPacket;

pixel_t f (unsigned height, unsigned width, unsigned virt_width,
           uint8_t *restrict k, const PixelPacket *restrict k_pixels)
{
    pixel_t result = {};
    for (unsigned u=0; u < (width & -4); u++, k--) {
        result.red     += (*k)*k_pixels[u].red;
        result.green   += (*k)*k_pixels[u].green;
        result.blue    += (*k)*k_pixels[u].blue;
        result.opacity += (*k)*k_pixels[u].opacity;
        k_pixels += virt_width;
    }
    return result;
}

---

Compiled with -O3 (for AArch64) this vectorizes well, but the epilogue code is
very inefficient:

        fadd    v29.2d, v29.2d, v30.2d
        fadd    v28.2d, v28.2d, v31.2d
        cmp     w5, w1
        bhi     .L3
        mov     v31.16b, v28.16b
        ins     v31.d[1], v29.d[1]
        ins     v29.d[1], v28.d[1]
        stp     q31, q29, [sp, 32]
        ldp     d0, d1, [sp, 32]
        ldp     d2, d3, [sp, 48]
        add     sp, sp, 64
        ret
.L4:
        movi    v29.2d, 0
        mov     v31.16b, v29.16b
        stp     q31, q29, [sp, 32]
        ldp     d0, d1, [sp, 32]
        ldp     d2, d3, [sp, 48]
        add     sp, sp, 64
        ret

That is, it goes through the stack to build the return value registers.  This
looks to be because at the GIMPLE level we still have the stores:

  <bb 5> [local count: 105119324]:
  _33 = VEC_PERM_EXPR <vect__10.16_41, vect__10.16_42, { 0, 3 }>;
  _31 = VEC_PERM_EXPR <vect__10.16_42, vect__10.16_41, { 0, 3 }>;

  <bb 6> [local count: 118111600]:
  # vect_result_red_64.18_28 = PHI <_33(5), { 0.0, 0.0 }(2)>
  # vect_result_red_64.18_105 = PHI <_31(5), { 0.0, 0.0 }(2)>
  MEM <vector(2) double> [(double *)&D.4535] = vect_result_red_64.18_28;
  MEM <vector(2) double> [(double *)&D.4535 + 16B] = vect_result_red_64.18_105;
  return D.4535;
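
For reference, here is a rough C rendering of that GIMPLE tail (a sketch only:
the helper name and the lo/hi arguments are made up, and memcpy stands in for
the two MEM <vector(2) double> stores into the return slot D.4535):

#include <string.h>
#include <arm_neon.h>

/* pixel_t as in the testcase above.  */
typedef struct _pixel_t { double red, green, blue, opacity; } pixel_t;

/* Rough C equivalent of the GIMPLE tail: the two vector partial
   results are stored into the on-stack return slot (D.4535), and
   returning that slot forces the reload into d0-d3 seen in the asm.  */
pixel_t
gimple_tail_sketch (float64x2_t lo, float64x2_t hi)
{
  pixel_t slot;                                  /* D.4535 */
  memcpy (&slot, &lo, sizeof lo);                /* MEM[(double *)&D.4535] */
  memcpy ((char *) &slot + 16, &hi, sizeof hi);  /* MEM[... &D.4535 + 16B] */
  return slot;                                   /* return D.4535; */
}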

clang is able to generate much better code here:

        fadd    v0.2d, v0.2d, v1.2d
        fadd    v2.2d, v2.2d, v3.2d
        b.ne    .LBB0_2
.LBB0_3:
        mov     d1, v2.d[1]
        mov     d3, v0.d[1]
        ret

The vectorized code gets register-allocated so that d0 and d2 already hold the
right values at the end of the vector loop, and the epilogue only has to
extract the high lane of each vector to get d1 and d3.

I think we would generate the same code if we were to elide the intermediate
stack store.
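
Eliding the store leaves the return value to be assembled lane by lane.  As a
sketch of the shape we'd want (a hypothetical helper using NEON intrinsics;
the actual lane/vector pairing depends on the VEC_PERM_EXPRs above): pixel_t
is a homogeneous aggregate of four doubles, so per the AArch64 PCS it is
returned in d0-d3, and the epilogue only needs lane extracts:

#include <arm_neon.h>

typedef struct _pixel_t { double red, green, blue, opacity; } pixel_t;

/* Store-free epilogue sketch: extract the lanes straight into the
   four return-register fields instead of bouncing through a stack
   slot.  Names and the lane/vector pairing are illustrative.  */
pixel_t
epilogue_sketch (float64x2_t red_green, float64x2_t blue_opacity)
{
  pixel_t r;
  r.red     = vgetq_lane_f64 (red_green, 0);     /* d0 */
  r.green   = vgetq_lane_f64 (red_green, 1);     /* d1 */
  r.blue    = vgetq_lane_f64 (blue_opacity, 0);  /* d2 */
  r.opacity = vgetq_lane_f64 (blue_opacity, 1);  /* d3 */
  return r;
}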

See https://godbolt.org/z/ocqchWWs5
