https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90235
--- Comment #1 from Hongtao.liu <crazylht at gmail dot com> --- (In reply to H.J. Lu from comment #0) > From PR 90202: > > [hjl@gnu-cfl-1 pr90202]$ cat x.ii > struct v { > int val[16]; > }; > > struct v test(struct v a, struct v b) { > struct v res; > > for (int i = 0; i < 16; i++) > res.val[i] = a.val[i] + b.val[i]; > > return res; > } > [hjl@gnu-cfl-1 pr90202]$ make CC=gcc > gcc -O3 -march=skylake -S x.ii > [hjl@gnu-cfl-1 pr90202]$ cat x.s > .file "x.ii" > .text > .p2align 4,,15 > .globl _Z4test1vS_ > .type _Z4test1vS_, @function > _Z4test1vS_: > .LFB0: > .cfi_startproc > pushq %rbp > .cfi_def_cfa_offset 16 > .cfi_offset 6, -16 > movq %rdi, %rax > movq %rsp, %rbp > .cfi_def_cfa_register 6 > vmovdqu 16(%rbp), %ymm1 > vmovdqu 48(%rbp), %ymm2 > vpaddd 80(%rbp), %ymm1, %ymm0 > vmovdqu %ymm0, (%rdi) > vpaddd 112(%rbp), %ymm2, %ymm0 > vmovdqu %ymm0, 32(%rdi) > vzeroupper > popq %rbp > .cfi_def_cfa 7, 8 > ret > .cfi_endproc > > Since there is > > rtx > gen_reg_rtx (machine_mode mode) > { > rtx val; > unsigned int align = GET_MODE_ALIGNMENT (mode); > > gcc_assert (can_create_pseudo_p ()); > > /* If a virtual register with bigger mode alignment is generated, > increase stack alignment estimation because it might be spilled > to stack later. */ > if (SUPPORTS_STACK_ALIGNMENT > && crtl->stack_alignment_estimated < align > && !crtl->stack_realign_processed) > { > unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align); > if (crtl->stack_alignment_estimated < min_align) > crtl->stack_alignment_estimated = min_align; > } > > and IRA has > > frame_pointer_needed > = (! flag_omit_frame_pointer > || (cfun->calls_alloca && EXIT_IGNORE_STACK) > /* We need the frame pointer to catch stack overflow exceptions if > the stack pointer is moving (as for the alloca case just above). 
> */ > || (STACK_CHECK_MOVING_SP > && flag_stack_check > && flag_exceptions > && cfun->can_throw_non_call_exceptions) > || crtl->accesses_prior_frames > || (SUPPORTS_STACK_ALIGNMENT && crtl->stack_realign_needed) > || targetm.frame_pointer_required ()); > > generating AVX/AVX512 pseudo registers via gen_reg_rtx will mark the frame > pointer as needed. Stack realignment is needed to > > 1. Align the outgoing stack. > 2. Support aligned spill of AVX/AVX512 registers. > > But we won't know whether a spill is needed before RA. As a result, we > save and restore the frame pointer even if it is not needed. Since > > (define_insn "mov<mode>_internal" > [(set (match_operand:VMOVE 0 "nonimmediate_operand" > "=v,v ,v ,m") > (match_operand:VMOVE 1 "nonimmediate_or_sse_const_operand" > " C,BC,vm,v"))] > "TARGET_SSE > && (register_operand (operands[0], <MODE>mode) > || register_operand (operands[1], <MODE>mode))" > > now supports both aligned and unaligned load/store of AVX/AVX512 > registers, we can change gen_reg_rtx to > > /* If a virtual register with bigger mode alignment is generated, > increase stack alignment estimation because it might be spilled > to stack later. */ > if (SUPPORTS_STACK_ALIGNMENT > && !SUPPORTS_MISALIGNED_SPILL > && crtl->stack_alignment_estimated < align > && !crtl->stack_realign_processed) > { > unsigned int min_align = MINIMUM_ALIGNMENT (NULL, mode, align); > if (crtl->stack_alignment_estimated < min_align) > crtl->stack_alignment_estimated = min_align; > } Would this generate more unaligned loads/stores and harm performance?