https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112401

--- Comment #2 from JuzheZhong <juzhe.zhong at rivai dot ai> ---
Adding one more test case:

/*
 * Reproducer: reads 16 floats from src with a 4-field segmented load,
 * repacks the four resulting vectors into one LMUL=4 register group, and
 * writes that group to dst with a single unit-stride store.
 *
 * dst: destination buffer; the final store is issued with vl = 16.
 * src: source buffer; the segment load is issued with vl = 4, i.e. four
 *      segments of four 32-bit fields each.
 *
 * As the function name indicates, the intent is a 4x4 transpose: field k of
 * every segment lands in tuple element k, so (assuming src is a row-major
 * 4x4 matrix) data0..data3 hold its columns.
 *
 * NOTE(review): the store writes 16 consecutive elements of an m4 group
 * built from four m1 registers. That looks like it yields four contiguous
 * 4-float rows only when one m1 register holds exactly four floats
 * (VLEN = 128) -- confirm the intended target configuration.
 */
void matrix_4x4_transpose_segmented_load(float* dst, float* src)
{
    /* Segment load: de-interleaves 4 segments x 4 fields into a 4-register tuple. */
    vfloat32m1x4_t data = __riscv_vlseg4e32_v_f32m1x4(src, 4);
    /* Pull each field vector out of the tuple. */
    vfloat32m1_t data0 = __riscv_vget_v_f32m1x4_f32m1(data, 0);
    vfloat32m1_t data1 = __riscv_vget_v_f32m1x4_f32m1(data, 1);
    vfloat32m1_t data2 = __riscv_vget_v_f32m1x4_f32m1(data, 2);
    vfloat32m1_t data3 = __riscv_vget_v_f32m1x4_f32m1(data, 3);
    /* Concatenate the four m1 vectors, in order, into one m4 register group. */
    vfloat32m4_t packedData = __riscv_vcreate_v_f32m1_f32m4(data0,
                                                            data1,
                                                            data2,
                                                            data3);
    /* Store the whole group to dst in one instruction (vl = 16). */
    __riscv_vse32_v_f32m4(dst, packedData, 16);
}

# Compiler output quoted for the C function above (a0 = dst, a1 = src).
matrix_4x4_transpose_segmented_load:
        # vl = 4, SEW = 32, LMUL = 1 for the segment load.
        vsetivli        zero,4,e32,m1,ta,ma
        # 4-field segment load from src; the fields land in v8..v11.
        vlseg4e32.v     v8,(a1)
        # vl = 16, SEW = 32, LMUL = 4 for the store.
        vsetivli        zero,16,e32,m4,ta,ma
        # Four whole-register copies move v8..v11 into the group v4..v7
        # that the store reads from.
        # NOTE(review): presumably these copies are the codegen issue this
        # comment is reporting -- confirm against the bug description.
        vmv1r.v v4,v8
        vmv1r.v v5,v9
        vmv1r.v v6,v10
        vmv1r.v v7,v11
        # Unit-stride store of the v4..v7 group to dst.
        vse32.v v4,0(a0)
        ret

Reply via email to