https://gcc.gnu.org/bugzilla/show_bug.cgi?id=101041

            Bug ID: 101041
           Summary: z13: Inefficient handling of vector register passed to
                    function
           Product: gcc
           Version: 8.3.1
            Status: UNCONFIRMED
          Severity: normal
          Priority: P3
         Component: target
          Assignee: unassigned at gcc dot gnu.org
          Reporter: jens.seifert at de dot ibm.com
  Target Milestone: ---

#include <vecintrin.h>
vector unsigned long long mul64(vector unsigned long long a, vector unsigned
long long b)
{
   return a * b;
}

GCC generates the following code:
_Z5mul64Dv2_yS_:
.LFB9:
        .cfi_startproc
        ldgr    %f4,%r15
        .cfi_register 15, 18
        lay     %r15,-192(%r15)
        .cfi_def_cfa_offset 352
        vst     %v24,160(%r15),3
        vst     %v26,176(%r15),3
        lg      %r2,160(%r15)
        lg      %r1,176(%r15)
        lgr     %r4,%r2
        lg      %r0,168(%r15)
        lgr     %r2,%r1
        lg      %r1,184(%r15)
        lgr     %r5,%r0
        lgr     %r3,%r1
        vlvgp   %v2,%r4,%r5
        vlvgp   %v0,%r2,%r3
        vlgvg   %r4,%v2,0
        vlgvg   %r1,%v2,1
        vlgvg   %r2,%v0,0
        vlgvg   %r3,%v0,1
        msgr    %r2,%r4
        msgr    %r1,%r3
        lgdr    %r15,%f4
        .cfi_restore 15
        .cfi_def_cfa_offset 160
        vlvgp   %v24,%r2,%r1
        br      %r14

The generated code stores v24 and v26 to the stack, reloads all four 64-bit
elements with lg + lgr, constructs new vector registers v0 and v2 from them
with vlvgp, and then extracts the four elements again using vlgvg.

Expected: 4 * vlgvg + 2 * msgr + 1 * vlvgp, i.e. the elements are extracted
directly from the argument registers v24 and v26.

Reply via email to