https://gcc.gnu.org/bugzilla/show_bug.cgi?id=111848

--- Comment #1 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Sorry, it picks LMUL = 4:

# f3 as emitted with LMUL=4 for the 8-bit data (RISC-V RVV, GNU as syntax).
# This listing is quoted as-is and stops after the second store: the loop
# tail, the stack restore and the return are not shown, and .L11 is not
# defined in this excerpt.
# Register use visible below: a2/a3 are load addresses, a0/a1 are store
# addresses, a4 is a signed count (its source-level meaning is not shown).
# With e8,m4 data the 16-bit gather indices occupy m8 register groups, and
# both index vectors are stored to the stack and reloaded at .L7.
f3:
        # Nothing to do when a4 <= 0 (signed compare).
        ble     a4,zero,.L11
        # Reserve 16*vlenb bytes of stack: room for two m8 register groups.
        csrr    t0,vlenb
        slli    t1,t0,4
        csrr    a6,vlenb
        sub     sp,sp,t1
        csrr    a5,vlenb
        # a6 = sp + 8*vlenb (second spill slot); a5 = 4*vlenb (VLMAX for e8,m4).
        slli    a6,a6,3
        slli    a5,a5,2
        add     a6,a6,sp
        vsetvli a7,zero,e16,m8,ta,ma
        slli    a4,a4,3
        # v8[i] = i
        vid.v   v8
        # t6 = a5 - 1, later splatted and ANDed into the gather indices.
        addi    t6,a5,-1
        # Clear bit 0 of every index: v8[i] = i & ~1.
        vand.vi v8,v8,-2
        # t5 = -a5
        neg     t5,a5
        # Spill the first index vector (v8-v15) to 0(sp).
        vs8r.v  v8,0(sp)
        # Second index vector: (i & ~1) + 1.
        vadd.vi v8,v8,1
        # Spill the second index vector to sp + 8*vlenb.
        vs8r.v  v8,0(a6)
        # vtype is already e16,m8 here, so jump past the vsetvli at .L14.
        j       .L7
.L14:
        # Presumably the loop back-edge target (the branch to .L14 is not in
        # this excerpt); switches back to e16,m8 for the index computation.
        vsetvli a7,zero,e16,m8,ta,ma
.L7:
        # t0 = sp + 8*vlenb, the address of the second spilled index vector.
        csrr    t0,vlenb
        slli    t0,t0,3
        # Reload the first index vector from the stack into v16-v23.
        vl8re16.v       v16,0(sp)
        add     t0,t0,sp
        # v8 = splat(t6)
        vmv.v.x v8,t6
        mv      t1,a4
        # v24 = first index vector & t6
        vand.vv v24,v16,v8
        mv      a6,a4
        # Reload the second index vector from the stack into v16-v23.
        vl8re16.v       v16,0(t0)
        # v8 = second index vector & t6
        vand.vv v8,v16,v8
        # a6 = min(a4, a5), unsigned.
        bleu    a4,a5,.L6
        mv      a6,a5
.L6:
        # vl = a6 for the memory accesses; VLMAX for the gather and the add.
        vsetvli zero,a6,e8,m4,ta,ma
        vle8.v  v20,0(a2)
        vle8.v  v16,0(a3)
        vsetvli a7,zero,e8,m4,ta,ma
        # v4[i] = v20[v24[i]] using 16-bit indices.
        vrgatherei16.vv v4,v20,v24
        vadd.vv v4,v16,v4
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a0)
        # Loads from 0(a2) again, then gathers with the second index vector.
        vle8.v  v20,0(a2)
        vsetvli a7,zero,e8,m4,ta,ma
        vrgatherei16.vv v4,v20,v8
        vadd.vv v4,v4,v16
        vsetvli zero,a6,e8,m4,ta,ma
        vse8.v  v4,0(a1)


Ideally LMUL should be 2:

# f3 as it would look with LMUL=2 for the 8-bit data (RISC-V RVV, GNU as
# syntax). The 16-bit gather indices then occupy m4 register groups, and
# this listing contains no stack adjustment and no vector spill or reload.
# Register use visible below: a2/a3 are load addresses, a0/a1 are store
# addresses, a4 is a signed count (its source-level meaning is not shown).
f3:
        # Nothing to do when a4 <= 0 (signed compare).
        ble     a4,zero,.L9
        # a5 = 2*vlenb (VLMAX for e8,m2); also the per-iteration pointer step.
        csrr    a5,vlenb
        slli    a5,a5,1
        vsetvli a7,zero,e16,m4,ta,ma
        slli    a4,a4,3
        # v12[i] = i
        vid.v   v12
        # t6 = a5 - 1, later splatted and ANDed into the gather indices.
        addi    t6,a5,-1
        # Clear bit 0 of every index: v12[i] = i & ~1.
        vand.vi v12,v12,-2
        # t5 = -a5, added to a4 at the end of each iteration.
        neg     t5,a5
        # Second index vector: v16[i] = (i & ~1) + 1.
        vadd.vi v16,v12,1
        # vtype is already e16,m4 here, so jump past the vsetvli at .L10.
        j       .L7
.L10:
        # Loop back-edge target: switch back to e16,m4 for the index computation.
        vsetvli a7,zero,e16,m4,ta,ma
.L7:
        # v4 = splat(t6)
        vmv.v.x v4,t6
        # t1 keeps a4's value from before this iteration's decrement.
        mv      t1,a4
        # v20 = first index vector & t6
        vand.vv v20,v12,v4
        mv      a6,a4
        # v4 = second index vector & t6
        vand.vv v4,v16,v4
        # a6 = min(a4, a5), unsigned.
        bleu    a4,a5,.L6
        mv      a6,a5
.L6:
        # vl = a6 for the memory accesses; VLMAX for the gather and the add.
        vsetvli zero,a6,e8,m2,ta,ma
        vle8.v  v10,0(a2)
        vle8.v  v8,0(a3)
        vsetvli a7,zero,e8,m2,ta,ma
        # v2[i] = v10[v20[i]] using 16-bit indices.
        vrgatherei16.vv v2,v10,v20
        vadd.vv v2,v8,v2
        vsetvli zero,a6,e8,m2,ta,ma
        vse8.v  v2,0(a0)
        # Loads from 0(a2) again, then gathers with the second index vector.
        vle8.v  v10,0(a2)
        vsetvli a7,zero,e8,m2,ta,ma
        vrgatherei16.vv v2,v10,v4
        vadd.vv v2,v2,v8
        vsetvli zero,a6,e8,m2,ta,ma
        vse8.v  v2,0(a1)
        # a4 -= a5; advance all four pointers by a5 bytes.
        add     a4,a4,t5
        add     a0,a0,a5
        add     a3,a3,a5
        add     a1,a1,a5
        add     a2,a2,a5
        # Loop again while the pre-decrement count exceeded a5 (unsigned).
        bgtu    t1,a5,.L10
.L9:
        ret

Reply via email to