https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95254
Bug ID: 95254
Summary: aarch64: gcc generates inefficient code with fixed SVE vector length
Product: gcc
Version: 11.0
Status: UNCONFIRMED
Severity: normal
Priority: P3
Component: target
Assignee: unassigned at gcc dot gnu.org
Reporter: felix.yang at huawei dot com
Target Milestone: ---
Target: aarch64

Test case:

typedef short __attribute__((vector_size (8))) v4hi;

typedef union U4HI { v4hi v; short a[4]; } u4hi;

short b[4];

void pass_v4hi (v4hi v)
{
  int i;
  u4hi u;
  u.v = v;
  for (i = 0; i < 4; i++)
    b[i] = u.a[i];
};

$ gcc -O2 -ftree-slp-vectorize -S -march=armv8.2-a+sve foo.c

assembly code:

pass_v4hi:
.LFB0:
        .cfi_startproc
        adrp    x0, .LANCHOR0
        str     d0, [x0, #:lo12:.LANCHOR0]
        ret
        .cfi_endproc

$ gcc -O2 -ftree-slp-vectorize -S -march=armv8.2-a+sve -msve-vector-bits=256 foo.c

assembly code:

pass_v4hi:
.LFB0:
        .cfi_startproc
        sub     sp, sp, #16
        .cfi_def_cfa_offset 16
        ptrue   p0.b, vl32
        adrp    x0, .LANCHOR0
        add     x0, x0, :lo12:.LANCHOR0
        str     d0, [sp, 8]
        ld1h    z0.d, p0/z, [sp, #1, mul vl]
        st1h    z0.d, p0, [x0]
        add     sp, sp, 16
        .cfi_def_cfa_offset 0
        ret
        .cfi_endproc

The root cause here is that we choose a different mode in aarch64_vectorize_related_mode [1]: VNx2HImode instead of V4HImode.
Then, in the final tree-ssa forwprop pass, we need to do a VIEW_CONVERT from V4HImode to VNx2HImode.

One way to fix this is to catch and simplify the pattern in aarch64_expand_sve_mem_move, emitting a mov pattern of V4HImode instead.
I am assuming endianness does not make a difference here.
I will propose a patch for comments.

[1] Call trace:

(gdb) bt
#0 aarch64_vectorize_related_mode (vector_mode=E_VNx8HImode, element_mode=..., nunits=...)
    at ../../gcc-git/gcc/config/aarch64/aarch64.c:2377
#1 0x00000000012983b4 in related_vector_mode (vector_mode=E_VNx8HImode, element_mode=..., nunits=...)
    at ../../gcc-git/gcc/stor-layout.c:535
#2 0x0000000001652918 in get_related_vectype_for_scalar_type (prevailing_mode=E_VNx8HImode, scalar_type=0xffffb22da498, nunits=...)
    at ../../gcc-git/gcc/tree-vect-stmts.c:11463
#3 0x0000000001653304 in get_vectype_for_scalar_type (vinfo=0x2f0dc80, scalar_type=0xffffb22da498, group_size=4)
    at ../../gcc-git/gcc/tree-vect-stmts.c:11545
#4 0x00000000016533a0 in get_vectype_for_scalar_type (vinfo=0x2f0dc80, scalar_type=0xffffb22da498, node=0x2e5d460)
    at ../../gcc-git/gcc/tree-vect-stmts.c:11569
#5 0x00000000016987e8 in vect_get_constant_vectors (vinfo=0x2f0dc80, slp_node=0x2e53080, op_num=0, vec_oprnds=0xffffffffc738)
    at ../../gcc-git/gcc/tree-vect-slp.c:3562
#6 0x00000000016993f8 in vect_get_slp_defs (vinfo=0x2f0dc80, slp_node=0x2e53080, vec_oprnds=0xffffffffc7a8, n=1)
    at ../../gcc-git/gcc/tree-vect-slp.c:3786
#7 0x0000000001631c70 in vect_get_vec_defs (vinfo=0x2f0dc80, op0=0xffffb20e3120, op1=0x0, stmt_info=0x2feef60, vec_oprnds0=0xffffffffcdd0, vec_oprnds1=0x0, slp_node=0x2e53080)
    at ../../gcc-git/gcc/tree-vect-stmts.c:1726
#8 0x0000000001648bc8 in vectorizable_store (vinfo=0x2f0dc80, stmt_info=0x2feef60, gsi=0xffffffffdad0, vec_stmt=0xffffffffd5b0, slp_node=0x2e53080, cost_vec=0x0)
    at ../../gcc-git/gcc/tree-vect-stmts.c:8186
#9 0x0000000001651808 in vect_transform_stmt (vinfo=0x2f0dc80, stmt_info=0x2feef60, gsi=0xffffffffdad0, slp_node=0x2e53080, slp_node_instance=0x2fefe70)
    at ../../gcc-git/gcc/tree-vect-stmts.c:11184
#10 0x000000000169a4a0 in vect_schedule_slp_instance (vinfo=0x2f0dc80, node=0x2e53080, instance=0x2fefe70)
    at ../../gcc-git/gcc/tree-vect-slp.c:4134
#11 0x000000000169aaac in vect_schedule_slp (vinfo=0x2f0dc80)
    at ../../gcc-git/gcc/tree-vect-slp.c:4258
#12 0x00000000016972f0 in vect_slp_bb_region (region_begin=..., region_end=..., datarefs=..., n_stmts=10)
    at ../../gcc-git/gcc/tree-vect-slp.c:3227
#13 0x0000000001697c60 in vect_slp_bb (bb=0xffffb22ce340)
    at ../../gcc-git/gcc/tree-vect-slp.c:3350
#14 0x00000000016a56f0 in (anonymous namespace)::pass_slp_vectorize::execute (this=0x2e6aae0, fun=0xffffb2116000)
    at ../../gcc-git/gcc/tree-vectorizer.c:1320