https://gcc.gnu.org/bugzilla/show_bug.cgi?id=113104

Feng Xue <fxue at os dot amperecomputing.com> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
         Resolution|FIXED                       |---
             Status|RESOLVED                    |REOPENED

--- Comment #7 from Feng Xue <fxue at os dot amperecomputing.com> ---
Partial permutation (especially extract-low/high) would incur inefficient
splicing in some situation even the vector mode with lowest cost is used.

  int test(unsigned array[4][4]);

  int foo(unsigned short *a, unsigned long n)
  {
    unsigned array[4][4];

    for (unsigned i = 0; i < 4; i++, a += n)
      {
        array[i][0] = (a[0] << 3) - (a[4] << 6);
        array[i][1] = (a[1] << 3) - (a[5] << 6);
        array[i][2] = (a[2] << 3) - (a[6] << 6);
        array[i][3] = (a[3] << 3) - (a[7] << 6);
      }

    return test(array);
  }

        // After vect-compare-cost fix
        mov     x2, x0
        stp     x29, x30, [sp, -80]!
        add     x3, x2, x1, lsl 1
        lsl     x1, x1, 1
        mov     x29, sp
        add     x4, x3, x1
        add     x0, sp, 16
        ldr     q5, [x2]
        ldr     q31, [x4, x1]
        ldr     q0, [x3, x1]
        ldr     q1, [x2, x1]
        movi    v28.4s, 0                //
        zip1    v29.2d, v0.2d, v31.2d    //
        zip1    v2.2d, v5.2d, v1.2d      //
        zip2    v31.2d, v0.2d, v31.2d    //
        zip2    v1.2d, v5.2d, v1.2d      // 
        zip1    v30.8h, v29.8h, v28.8h   //
        zip1    v4.8h, v2.8h, v28.8h     //  superfluous
        zip1    v27.8h, v31.8h, v28.8h   //
        zip1    v3.8h, v1.8h, v28.8h     //
        zip2    v29.8h, v29.8h, v28.8h   //
        zip2    v31.8h, v31.8h, v28.8h   //
        zip2    v2.8h, v2.8h, v28.8h     //
        zip2    v1.8h, v1.8h, v28.8h     //
        shl     v30.4s, v30.4s, 3
        shl     v29.4s, v29.4s, 3
        shl     v4.4s, v4.4s, 3
        shl     v2.4s, v2.4s, 3
        shl     v27.4s, v27.4s, 6
        shl     v31.4s, v31.4s, 6
        shl     v3.4s, v3.4s, 6
        shl     v1.4s, v1.4s, 6
        sub     v27.4s, v30.4s, v27.4s
        sub     v31.4s, v29.4s, v31.4s
        sub     v3.4s, v4.4s, v3.4s
        sub     v1.4s, v2.4s, v1.4s
        stp     q27, q31, [sp, 48]
        stp     q3, q1, [sp, 16]
        bl      test
        ldp     x29, x30, [sp], 80
        ret


        // Expect it to be optimized as:
        lsl     x3, x1, 1
        mov     x2, x0
        stp     x29, x30, [sp, -80]!
        add     x1, x2, x1, lsl 1
        add     x4, x1, x3
        mov     x29, sp
        add     x0, sp, 16
        ldr     q30, [x2, x3]
        ldr     q0, [x2]
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        sub     v30.4s, v31.4s, v30.4s
        sub     v0.4s, v29.4s, v0.4s
        str     q0, [sp, 16]
        ldr     q0, [x1, x3]
        str     q30, [sp, 32]
        ldr     q30, [x4, x3]
        ushll   v29.4s, v0.4h, 3
        ushll2  v0.4s, v0.8h, 6
        ushll   v31.4s, v30.4h, 3
        ushll2  v30.4s, v30.8h, 6
        sub     v0.4s, v29.4s, v0.4s
        sub     v30.4s, v31.4s, v30.4s
        stp     q0, q30, [sp, 48]
        bl      test
        ldp     x29, x30, [sp], 80
        ret

Based on cost arising from splicing, we still need a way to select the most
profitable vector mode per slp node, over the global vector mode specified in
vinfo.

Reply via email to