https://gcc.gnu.org/bugzilla/show_bug.cgi?id=121051

--- Comment #5 from Robin Dapp <rdapp at gcc dot gnu.org> ---
A reduced example is

#include <riscv_vector.h>

/* Reduced reproducer for GCC PR121051: a chain of masked vslide1up/vslide1down
   operations at successively smaller element widths (e64 -> e32 -> e16),
   reusing the same mask registers.  The function name suggests part of an
   8x8 transpose via slides — presumably; the computed value is secondary,
   the interesting part is the register allocation / earlyclobber handling
   around the slides (see the assembly and discussion below).  */
vuint16m8_t
trans8x8_vslide(vuint16m8_t v)
{
  size_t VL = __riscv_vsetvlmax_e64m4();
  /* Broadcast 0b10101010 into an e8m1 vector and reinterpret it as a b16
     mask, giving an alternating element mask; meven is its complement.
     (Which parity each mask selects depends on bit order — not asserted
     here.)  */
  vbool16_t modd  = __riscv_vreinterpret_b16(
                                             __riscv_vmv_v_x_u8m1(0b10101010,
__riscv_vsetvlmax_e8m1()));
  vbool16_t meven = __riscv_vmnot(modd, VL);
  vbool16_t m;

  /* Split the m8 input into two m4 halves, viewed as 64-bit elements.  */
  vuint64m4_t v4l = __riscv_vreinterpret_u64m4(__riscv_vget_u16m4(v, 0));
  vuint64m4_t v4h = __riscv_vreinterpret_u64m4(__riscv_vget_u16m4(v, 1));
  vuint64m4_t v4lt = v4l;  /* keep the pre-slide low half for the down-slide */
  m = modd;
  /* Masked slides with mask-undisturbed (_mu) policy: inactive elements of
     the destination keep their old values, so the destination is both read
     and written — this overwrite is what earlyclobber has to model.  */
  v4l = __riscv_vslide1up_mu(m, v4l, v4h, 0, VL);
  m = meven;
  v4h = __riscv_vslide1down_mu(m, v4h, v4lt, 0, VL);

  /* Same split one level down: m4 -> two m2 halves each, viewed as e32.  */
  vuint32m2_t v2ll = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4l, 0));
  vuint32m2_t v2lh = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4l, 1));
  vuint32m2_t v2hl = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4h, 0));
  vuint32m2_t v2hh = __riscv_vreinterpret_u32m2(__riscv_vget_u64m2(v4h, 1));
  vuint32m2_t v2llt = v2lh, v2hlt = v2hh;  /* snapshots taken before the slides */
  /* NOTE: at this point m still holds meven (assigned above) for the
     down-slide; it is switched back to modd only afterwards.  */
  v2hh = __riscv_vslide1down_mu(m, v2hh, v2hl, 0, VL);
  m = modd;
  v2ll = __riscv_vslide1up_mu(m, v2ll, v2llt, 0, VL);
  v2hl = __riscv_vslide1up_mu(m, v2hl, v2hlt, 0, VL);

  /* Final level: m2 -> m1 halves, viewed as e16, one more masked up-slide.  */
  vuint16m1_t v1lll = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2ll, 0));
  vuint16m1_t v1llh = __riscv_vreinterpret_u16m1(__riscv_vget_u32m1(v2ll, 1));
  v1lll = __riscv_vslide1up_mu(m, v1lll, v1llh, 0, VL);

  /* Rebuild an m8 result from the m1 pieces (mostly copies of v1lll); this
     materializes as the vmv1r.v sequence in the generated assembly.  */
  return __riscv_vcreate_v_u16m1_u16m8(
                                       v1lll, v1lll, v1lll, v1lll,
                                       v1lll, v1lll, v1lll, v1llh);
}

With all my local changes I'm now getting:

        vsetvli a3,zero,e8,m1,ta,ma
        li      a4,-86
        vmv.v.x v0,a4
        vsetvli a5,zero,e64,m4,ta,mu
        vslide1up.vx    v8,v12,zero,v0.t
        vsetvli zero,zero,e32,m2,ta,mu
        vslide1up.vx    v8,v10,zero,v0.t
        vsetvli zero,zero,e16,m1,ta,mu
        vslide1up.vx    v8,v9,zero,v0.t
        vmv1r.v v16,v8
        vmv1r.v v17,v8
        vmv1r.v v18,v8
        vmv1r.v v19,v8
        vmv1r.v v20,v8
        vmv1r.v v21,v8
        vmv1r.v v22,v8
        vmv1r.v v23,v9
        vsetivli        zero,0,e16,m8,ta,ma
        vmv8r.v v8,v16
        ret

and no reloads around the slideups anymore, so a bit of progress.

It's very involved though, and upstreaming all of this is going to be a huge
effort.

Slides are more difficult because of the overwrite, and we use earlyclobber to
model this.  Right now, though, earlyclobber is not really prepared to operate
at sub-reg granularity.  The same goes for 100 other special cases.

Reply via email to