On Thu, Jun 3, 2021 at 8:54 AM liuhongt <hongtao....@intel.com> wrote: > > When __builtin_ia32_vzeroupper is called explicitly, the corresponding > vzeroupper pattern does not carry any CLOBBERs or SETs before LRA, > which leads to incorrect optimization in pass_reload. To solve this > problem, this patch refines the instructions as call_insns in > which the call has a special vzeroupper ABI. > > gcc/ChangeLog: > > PR target/82735 > * config/i386/i386-expand.c (ix86_expand_builtin): Remove > assignment of cfun->machine->has_explicit_vzeroupper. > * config/i386/i386-features.c > (ix86_add_reg_usage_to_vzerouppers): Delete. > (ix86_add_reg_usage_to_vzeroupper): Ditto. > (rest_of_handle_insert_vzeroupper): Remove > ix86_add_reg_usage_to_vzerouppers, add df_analyze at the end > of the function. > (gate): Remove cfun->machine->has_explicit_vzeroupper. > * config/i386/i386-protos.h (ix86_expand_avx_vzeroupper): > Declare. > * config/i386/i386.c (ix86_insn_callee_abi): New function. > (ix86_initialize_callee_abi): Ditto. > (ix86_expand_avx_vzeroupper): Ditto. > (ix86_hard_regno_call_part_clobbered): Adjust for vzeroupper > ABI. > (TARGET_INSN_CALLEE_ABI): Define as ix86_insn_callee_abi. > (ix86_emit_mode_set): Call ix86_expand_avx_vzeroupper > directly. > * config/i386/i386.h (struct GTY(()) machine_function): Delete > has_explicit_vzeroupper. > * config/i386/i386.md (enum unspec): New member > UNSPEC_CALLEE_ABI. > (I386_DEFAULT, I386_VZEROUPPER, I386_UNKNOWN): New > define_constants for insn callee abi index. > * config/i386/predicates.md (vzeroupper_pattern): Adjust. > * config/i386/sse.md (UNSPECV_VZEROUPPER): Delete. > (avx_vzeroupper): Call ix86_expand_avx_vzeroupper. > (*avx_vzeroupper): Rename to .. > (avx_vzeroupper_callee_abi): .. this, and adjust pattern as > call_insn which has a special vzeroupper ABI. > (*avx_vzeroupper_1): Delete. > > gcc/testsuite/ChangeLog: > > PR target/82735 > * gcc.target/i386/pr82735-1.c: New test. > * gcc.target/i386/pr82735-2.c: New test. > * gcc.target/i386/pr82735-3.c: New test. > * gcc.target/i386/pr82735-4.c: New test. > * gcc.target/i386/pr82735-5.c: New test.
LGTM, with a small nit below. Thanks, Uros. > --- > gcc/config/i386/i386-expand.c | 4 - > gcc/config/i386/i386-features.c | 99 +++-------------------- > gcc/config/i386/i386-protos.h | 1 + > gcc/config/i386/i386.c | 55 ++++++++++++- > gcc/config/i386/i386.h | 4 - > gcc/config/i386/i386.md | 10 +++ > gcc/config/i386/predicates.md | 5 +- > gcc/config/i386/sse.md | 59 ++++---------- > gcc/testsuite/gcc.target/i386/pr82735-1.c | 29 +++++++ > gcc/testsuite/gcc.target/i386/pr82735-2.c | 22 +++++ > gcc/testsuite/gcc.target/i386/pr82735-3.c | 5 ++ > gcc/testsuite/gcc.target/i386/pr82735-4.c | 48 +++++++++++ > gcc/testsuite/gcc.target/i386/pr82735-5.c | 54 +++++++++++++ > 13 files changed, 252 insertions(+), 143 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr82735-5.c > > diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c > index 9f3d41955a2..d25d59aa4e7 100644 > --- a/gcc/config/i386/i386-expand.c > +++ b/gcc/config/i386/i386-expand.c > @@ -13282,10 +13282,6 @@ rdseed_step: > > return 0; > > - case IX86_BUILTIN_VZEROUPPER: > - cfun->machine->has_explicit_vzeroupper = true; > - break; > - > default: > break; > } > diff --git a/gcc/config/i386/i386-features.c b/gcc/config/i386/i386-features.c > index 77783a154b6..a25769ae478 100644 > --- a/gcc/config/i386/i386-features.c > +++ b/gcc/config/i386/i386-features.c > @@ -1768,92 +1768,22 @@ convert_scalars_to_vector (bool timode_p) > return 0; > } > > -/* Modify the vzeroupper pattern in INSN so that it describes the effect > - that the instruction has on the SSE registers. LIVE_REGS are the set > - of registers that are live across the instruction. > - > - For a live register R we use: > - > - (set (reg:V2DF R) (reg:V2DF R)) > - > - which preserves the low 128 bits but clobbers the upper bits. */ > - > -static void > -ix86_add_reg_usage_to_vzeroupper (rtx_insn *insn, bitmap live_regs) > -{ > - rtx pattern = PATTERN (insn); > - unsigned int nregs = TARGET_64BIT ? 16 : 8; > - unsigned int npats = nregs; > - for (unsigned int i = 0; i < nregs; ++i) > - { > - unsigned int regno = GET_SSE_REGNO (i); > - if (!bitmap_bit_p (live_regs, regno)) > - npats--; > - } > - if (npats == 0) > - return; > - rtvec vec = rtvec_alloc (npats + 1); > - RTVEC_ELT (vec, 0) = XVECEXP (pattern, 0, 0); > - for (unsigned int i = 0, j = 0; i < nregs; ++i) > - { > - unsigned int regno = GET_SSE_REGNO (i); > - if (!bitmap_bit_p (live_regs, regno)) > - continue; > - rtx reg = gen_rtx_REG (V2DImode, regno); > - ++j; > - RTVEC_ELT (vec, j) = gen_rtx_SET (reg, reg); > - } > - XVEC (pattern, 0) = vec; > - INSN_CODE (insn) = -1; > - df_insn_rescan (insn); > -} > - > -/* Walk the vzeroupper instructions in the function and annotate them > - with the effect that they have on the SSE registers. 
*/ > - > -static void > -ix86_add_reg_usage_to_vzerouppers (void) > -{ > - basic_block bb; > - rtx_insn *insn; > - auto_bitmap live_regs; > - > - df_analyze (); > - FOR_EACH_BB_FN (bb, cfun) > - { > - bitmap_copy (live_regs, df_get_live_out (bb)); > - df_simulate_initialize_backwards (bb, live_regs); > - FOR_BB_INSNS_REVERSE (bb, insn) > - { > - if (!NONDEBUG_INSN_P (insn)) > - continue; > - if (vzeroupper_pattern (PATTERN (insn), VOIDmode)) > - ix86_add_reg_usage_to_vzeroupper (insn, live_regs); > - df_simulate_one_insn_backwards (bb, insn, live_regs); > - } > - } > -} > - > static unsigned int > rest_of_handle_insert_vzeroupper (void) > { > - if (TARGET_VZEROUPPER > - && flag_expensive_optimizations > - && !optimize_size) > - { > - /* vzeroupper instructions are inserted immediately after reload to > - account for possible spills from 256bit or 512bit registers. The > pass > - reuses mode switching infrastructure by re-running mode insertion > - pass, so disable entities that have already been processed. */ > - for (int i = 0; i < MAX_386_ENTITIES; i++) > - ix86_optimize_mode_switching[i] = 0; > + /* vzeroupper instructions are inserted immediately after reload to > + account for possible spills from 256bit or 512bit registers. The pass > + reuses mode switching infrastructure by re-running mode insertion > + pass, so disable entities that have already been processed. */ > + for (int i = 0; i < MAX_386_ENTITIES; i++) > + ix86_optimize_mode_switching[i] = 0; > > - ix86_optimize_mode_switching[AVX_U128] = 1; > + ix86_optimize_mode_switching[AVX_U128] = 1; > > - /* Call optimize_mode_switching. */ > - g->get_passes ()->execute_pass_mode_switching (); > - } > - ix86_add_reg_usage_to_vzerouppers (); > + /* Call optimize_mode_switching. */ > + g->get_passes ()->execute_pass_mode_switching (); > + > + df_analyze (); > return 0; > } > > @@ -1882,11 +1812,8 @@ public: > /* opt_pass methods: */ > virtual bool gate (function *) > { > - return TARGET_AVX > - && ((TARGET_VZEROUPPER > - && flag_expensive_optimizations > - && !optimize_size) > - || cfun->machine->has_explicit_vzeroupper); > + return TARGET_AVX && TARGET_VZEROUPPER > + && flag_expensive_optimizations && !optimize_size; > } > > virtual unsigned int execute (function *) > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index 7782cf1163f..e6ac9390777 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -216,6 +216,7 @@ extern rtx ix86_split_stack_guard (void); > extern void ix86_move_vector_high_sse_to_mmx (rtx); > extern void ix86_split_mmx_pack (rtx[], enum rtx_code); > extern void ix86_split_mmx_punpck (rtx[], bool); > +extern void ix86_expand_avx_vzeroupper (void); > > #ifdef TREE_CODE > extern void init_cumulative_args (CUMULATIVE_ARGS *, tree, rtx, tree, int); > diff --git a/gcc/config/i386/i386.c b/gcc/config/i386/i386.c > index 743d8a25fe3..f0b66dd0d56 100644 > --- a/gcc/config/i386/i386.c > +++ b/gcc/config/i386/i386.c > @@ -14426,7 +14426,7 @@ ix86_emit_mode_set (int entity, int mode, int > prev_mode ATTRIBUTE_UNUSED, > break; > case AVX_U128: > if (mode == AVX_U128_CLEAN) > - emit_insn (gen_avx_vzeroupper ()); > + ix86_expand_avx_vzeroupper (); > break; > case I387_ROUNDEVEN: > case I387_TRUNC: > @@ -19494,15 +19494,63 @@ ix86_hard_regno_mode_ok (unsigned int regno, > machine_mode mode) > return false; > } > > +/* Implement TARGET_INSN_CALLEE_ABI. 
*/ > + > +const predefined_function_abi & > +ix86_insn_callee_abi (const rtx_insn *insn) > +{ > + unsigned int abi_id = 0; > + rtx pat = PATTERN (insn); > + if (vzeroupper_pattern (pat, VOIDmode)) > + abi_id = I386_VZEROUPPER; > + > + return function_abis[abi_id]; > +} > + > +/* Initialize function_abis with corresponding abi_id, > + currently only handle vzeroupper. */ > +void > +ix86_initialize_callee_abi (unsigned int abi_id) > +{ > + gcc_assert (abi_id == I386_VZEROUPPER); > + predefined_function_abi &vzeroupper_abi = function_abis[abi_id]; > + if (!vzeroupper_abi.initialized_p ()) > + { > + HARD_REG_SET full_reg_clobbers; > + CLEAR_HARD_REG_SET (full_reg_clobbers); > + vzeroupper_abi.initialize (I386_VZEROUPPER, full_reg_clobbers); > + } > +} > + > +void > +ix86_expand_avx_vzeroupper (void) > +{ > + /* Initialize vzeroupper_abi here. */ > + ix86_initialize_callee_abi (I386_VZEROUPPER); > + rtx_insn *insn = emit_call_insn (gen_avx_vzeroupper_callee_abi ()); > + /* Return false for non-local goto in can_nonlocal_goto. */ > + make_reg_eh_region_note (insn, 0, INT_MIN); > + /* Flag used for call_insn indicates it's a fake call. */ > + RTX_FLAG (insn, used) = 1; > +} > + > + > /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The only ABI that > saves SSE registers across calls is Win64 (thus no need to check the > current ABI here), and with AVX enabled Win64 only guarantees that > the low 16 bytes are saved. */ > > static bool > -ix86_hard_regno_call_part_clobbered (unsigned int, unsigned int regno, > +ix86_hard_regno_call_part_clobbered (unsigned int abi_id, unsigned int regno, > machine_mode mode) > { > + /* Special ABI for vzeroupper which only clobber higher part of sse regs. > */ > + if (abi_id == I386_VZEROUPPER) > + return (GET_MODE_SIZE (mode) > 16 > + && ((TARGET_64BIT > + && (IN_RANGE (regno, FIRST_REX_SSE_REG, LAST_REX_SSE_REG))) > + || (IN_RANGE (regno, FIRST_SSE_REG, LAST_SSE_REG)))); > + > return SSE_REGNO_P (regno) && GET_MODE_SIZE (mode) > 16; > } > > @@ -23916,6 +23964,9 @@ ix86_run_selftests (void) > #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \ > ix86_hard_regno_call_part_clobbered > > +#undef TARGET_INSN_CALLEE_ABI > +#define TARGET_INSN_CALLEE_ABI ix86_insn_callee_abi > + > #undef TARGET_CAN_CHANGE_MODE_CLASS > #define TARGET_CAN_CHANGE_MODE_CLASS ix86_can_change_mode_class > > diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h > index 53d503fc6e0..919d0b2418a 100644 > --- a/gcc/config/i386/i386.h > +++ b/gcc/config/i386/i386.h > @@ -2659,10 +2659,6 @@ struct GTY(()) machine_function { > /* True if the function needs a stack frame. */ > BOOL_BITFIELD stack_frame_required : 1; > > - /* True if __builtin_ia32_vzeroupper () has been expanded in current > - function. */ > - BOOL_BITFIELD has_explicit_vzeroupper : 1; > - > /* True if we should act silently, rather than raise an error for > invalid calls. */ > BOOL_BITFIELD silent_p : 1; > diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md > index 2fc8fae30f3..5d9f5aa39ac 100644 > --- a/gcc/config/i386/i386.md > +++ b/gcc/config/i386/i386.md > @@ -191,6 +191,10 @@ (define_c_enum "unspec" [ > ;; For MOVDIRI and MOVDIR64B support > UNSPEC_MOVDIRI > UNSPEC_MOVDIR64B > + > + ;; For insn_callee_abi: > + UNSPEC_CALLEE_ABI > + > ]) > > (define_c_enum "unspecv" [ > @@ -447,6 +451,12 @@ (define_constants > (FIRST_PSEUDO_REG 76) > ]) > > +;; Insn callee abi index. 
> +(define_constants > + [(I386_DEFAULT 0) > + (I386_VZEROUPPER 1) > + (I386_UNKNOWN 2)]) Please name these ABI_DEFAULT, ABI_VZEROUPPER and ABI_UNKNOWN. > + > ;; Insns whose names begin with "x86_" are emitted by gen_FOO calls > ;; from i386.c. > > diff --git a/gcc/config/i386/predicates.md b/gcc/config/i386/predicates.md > index abd307ebdb8..8b787553f32 100644 > --- a/gcc/config/i386/predicates.md > +++ b/gcc/config/i386/predicates.md > @@ -1599,8 +1599,9 @@ (define_predicate "vzeroall_pattern" > ;; return true if OP is a vzeroupper pattern. > (define_predicate "vzeroupper_pattern" > (and (match_code "parallel") > - (match_code "unspec_volatile" "a") > - (match_test "XINT (XVECEXP (op, 0, 0), 1) == UNSPECV_VZEROUPPER"))) > + (match_code "unspec" "b") > + (match_test "XINT (XVECEXP (op, 0, 1), 1) == UNSPEC_CALLEE_ABI") > + (match_test "INTVAL (XVECEXP (XVECEXP (op, 0, 1), 0, 0)) == > I386_VZEROUPPER"))) > > ;; Return true if OP is an addsub vec_merge operation > (define_predicate "addsub_vm_operator" > diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md > index a4503ddcb73..949347a3247 100644 > --- a/gcc/config/i386/sse.md > +++ b/gcc/config/i386/sse.md > @@ -205,7 +205,6 @@ (define_c_enum "unspecv" [ > UNSPECV_MONITOR > UNSPECV_MWAIT > UNSPECV_VZEROALL > - UNSPECV_VZEROUPPER > > ;; For KEYLOCKER > UNSPECV_LOADIWKEY > @@ -20857,14 +20856,22 @@ (define_insn "*avx_vzeroall" > ;; if the upper 128bits are unused. Initially we expand the instructions > ;; as though they had no effect on the SSE registers, but later add SETs and > ;; CLOBBERs to the PARALLEL to model the real effect. > + > (define_expand "avx_vzeroupper" > - [(parallel [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] > - "TARGET_AVX") > + [(parallel [(call (mem:QI (const_int 0)) > + (const_int 0)) > + (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)])] > + "TARGET_AVX" > +{ > + ix86_expand_avx_vzeroupper (); > + DONE; > +}) > > -(define_insn "*avx_vzeroupper" > - [(match_parallel 0 "vzeroupper_pattern" > - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] > - "TARGET_AVX && XVECLEN (operands[0], 0) == (TARGET_64BIT ? 16 : 8) + 1" > +(define_insn "avx_vzeroupper_callee_abi" > + [(call (mem:QI (const_int 0)) > + (const_int 0)) > + (unspec [(const_int I386_VZEROUPPER)] UNSPEC_CALLEE_ABI)] > + "TARGET_AVX" > "vzeroupper" > [(set_attr "type" "sse") > (set_attr "modrm" "0") > @@ -20873,44 +20880,6 @@ (define_insn "*avx_vzeroupper" > (set_attr "btver2_decode" "vector") > (set_attr "mode" "OI")]) > > -(define_insn_and_split "*avx_vzeroupper_1" > - [(match_parallel 0 "vzeroupper_pattern" > - [(unspec_volatile [(const_int 0)] UNSPECV_VZEROUPPER)])] > - "TARGET_AVX && XVECLEN (operands[0], 0) != (TARGET_64BIT ? 16 : 8) + 1" > - "#" > - "&& epilogue_completed" > - [(match_dup 0)] > -{ > - /* For IPA-RA purposes, make it clear the instruction clobbers > - even XMM registers not mentioned explicitly in the pattern. */ > - unsigned int nregs = TARGET_64BIT ? 
16 : 8; > - unsigned int npats = XVECLEN (operands[0], 0); > - rtvec vec = rtvec_alloc (nregs + 1); > - RTVEC_ELT (vec, 0) = XVECEXP (operands[0], 0, 0); > - for (unsigned int i = 0, j = 1; i < nregs; ++i) > - { > - unsigned int regno = GET_SSE_REGNO (i); > - if (j < npats > - && REGNO (SET_DEST (XVECEXP (operands[0], 0, j))) == regno) > - { > - RTVEC_ELT (vec, i + 1) = XVECEXP (operands[0], 0, j); > - j++; > - } > - else > - { > - rtx reg = gen_rtx_REG (V2DImode, regno); > - RTVEC_ELT (vec, i + 1) = gen_rtx_CLOBBER (VOIDmode, reg); > - } > - } > - operands[0] = gen_rtx_PARALLEL (VOIDmode, vec); > -} > - [(set_attr "type" "sse") > - (set_attr "modrm" "0") > - (set_attr "memory" "none") > - (set_attr "prefix" "vex") > - (set_attr "btver2_decode" "vector") > - (set_attr "mode" "OI")]) > - > (define_mode_attr pbroadcast_evex_isa > [(V64QI "avx512bw") (V32QI "avx512bw") (V16QI "avx512bw") > (V32HI "avx512bw") (V16HI "avx512bw") (V8HI "avx512bw") > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-1.c > b/gcc/testsuite/gcc.target/i386/pr82735-1.c > new file mode 100644 > index 00000000000..1a63b9ae9c9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr82735-1.c > @@ -0,0 +1,29 @@ > +/* { dg-do run } */ > +/* { dg-options "-O2 -mavx" } */ > +/* { dg-require-effective-target avx } */ > + > +#include "avx-check.h" > + > +void > +__attribute__ ((noipa)) > +mtest(char *dest) > +{ > + __m256i ymm1 = _mm256_set1_epi8((char)0x1); > + _mm256_storeu_si256((__m256i *)(dest + 32), ymm1); > + _mm256_zeroupper(); > + __m256i ymm2 = _mm256_set1_epi8((char)0x1); > + _mm256_storeu_si256((__m256i *)dest, ymm2); > +} > + > +void > +avx_test () > +{ > + char buf[64]; > + for (int i = 0; i != 64; i++) > + buf[i] = 2; > + mtest (buf); > + > + for (int i = 0; i < 32; ++i) > + if (buf[i] != 1) > + __builtin_abort (); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-2.c > b/gcc/testsuite/gcc.target/i386/pr82735-2.c > new file mode 100644 > index 00000000000..ac9d006f794 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr82735-2.c > @@ -0,0 +1,22 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx -O2" } */ > + > +#include <immintrin.h> > + > +void test(char *dest) > +{ > + /* xmm1 can be propagated to xmm2 by CSE. */ > + __m128i xmm1 = _mm_set_epi8(0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, > + 0x9, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16); > + _mm_storeu_si128((__m128i *)(dest + 32), xmm1); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + __m128i xmm2 = xmm1; > + _mm_storeu_si128((__m128i *)dest, xmm2); > +} > + > +/* Darwin local constant symbol is "lC0", ELF targets ".LC0" */ > +/* { dg-final { scan-assembler-times {(?n)vmovdqa\t\.?[Ll]C0[^,]*, > %xmm[0-9]} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-3.c > b/gcc/testsuite/gcc.target/i386/pr82735-3.c > new file mode 100644 > index 00000000000..e3f801e6924 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr82735-3.c > @@ -0,0 +1,5 @@ > +/* { dg-do compile } */ > +/* { dg-options "-mavx -O2 -mabi=ms" } */ > +/* { dg-final { scan-assembler-not {(?n)xmm([6-9]|1[0-5])} } } */ > + > +#include "pr82735-2.c" > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-4.c > b/gcc/testsuite/gcc.target/i386/pr82735-4.c > new file mode 100644 > index 00000000000..78c0a6cb2c8 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr82735-4.c > @@ -0,0 +1,48 @@ > +/* { dg-do compile { target { ! 
ia32 } } } */ > +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */ > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t > ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */ > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t > ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */ > + > +#include <immintrin.h> > + > +void test(char *dest) > +{ > + __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, > ymm11, ymm12, ymm13, ymm14, ymm15; > + asm volatile ("vmovdqa\t%%ymm0, %0\n\t" > + "vmovdqa\t%%ymm0, %1\n\t" > + "vmovdqa\t%%ymm0, %2\n\t" > + "vmovdqa\t%%ymm0, %3\n\t" > + "vmovdqa\t%%ymm0, %4\n\t" > + "vmovdqa\t%%ymm0, %5\n\t" > + "vmovdqa\t%%ymm0, %6\n\t" > + "vmovdqa\t%%ymm0, %7\n\t" > + "vmovdqa\t%%ymm0, %8\n\t" > + "vmovdqa\t%%ymm0, %9\n\t" > + "vmovdqa\t%%ymm0, %10\n\t" > + "vmovdqa\t%%ymm0, %11\n\t" > + "vmovdqa\t%%ymm0, %12\n\t" > + "vmovdqa\t%%ymm0, %13\n\t" > + "vmovdqa\t%%ymm0, %14\n\t" > + "vmovdqa\t%%ymm0, %15\n\t" > + : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" > (ymm5), > + "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" > (ymm10), > + "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" > (ymm15), > + "=v"(ymm0) > + ::); > + _mm256_zeroupper(); > + _mm256_storeu_si256((__m256i *)dest, ymm1); > + _mm256_storeu_si256((__m256i *)(dest + 32), ymm2); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15); > +} > diff --git a/gcc/testsuite/gcc.target/i386/pr82735-5.c > b/gcc/testsuite/gcc.target/i386/pr82735-5.c > new file mode 100644 > index 00000000000..2a58cbe52d0 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr82735-5.c > @@ -0,0 +1,54 @@ > +/* { dg-do compile { target { ! 
ia32 } } } */ > +/* { dg-options "-mavx -O2 -mabi=ms -mno-avx512f -masm=att" } */ > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t > ]*%xmm[0-9]+, [0-9]*\(%rsp\)} 10 } } */ > +/* { dg-final { scan-assembler-times {(?n)(?:vmovdqa[1-9]*|vmovap[sd])[\t > ]*[0-9]*\(%rsp\), %xmm[0-9]+} 10 } } */ > + > +#include <immintrin.h> > + > +void test(char *dest) > +{ > + __m256i ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8, ymm9, ymm10, > ymm11, ymm12, ymm13, ymm14, ymm15; > + asm volatile ("vmovdqa\t%%ymm0, %0\n\t" > + "vmovdqa\t%%ymm0, %1\n\t" > + "vmovdqa\t%%ymm0, %2\n\t" > + "vmovdqa\t%%ymm0, %3\n\t" > + "vmovdqa\t%%ymm0, %4\n\t" > + "vmovdqa\t%%ymm0, %5\n\t" > + "vmovdqa\t%%ymm0, %6\n\t" > + "vmovdqa\t%%ymm0, %7\n\t" > + "vmovdqa\t%%ymm0, %8\n\t" > + "vmovdqa\t%%ymm0, %9\n\t" > + "vmovdqa\t%%ymm0, %10\n\t" > + "vmovdqa\t%%ymm0, %11\n\t" > + "vmovdqa\t%%ymm0, %12\n\t" > + "vmovdqa\t%%ymm0, %13\n\t" > + "vmovdqa\t%%ymm0, %14\n\t" > + "vmovdqa\t%%ymm0, %15\n\t" > + : "=v" (ymm1), "=v" (ymm2), "=v"(ymm3), "=v" (ymm4), "=v" > (ymm5), > + "=v" (ymm6), "=v" (ymm7), "=v"(ymm8), "=v" (ymm9), "=v" > (ymm10), > + "=v" (ymm11), "=v" (ymm12), "=v"(ymm13), "=v" (ymm14), "=v" > (ymm15), > + "=v"(ymm0) > + ::); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_zeroupper(); > + _mm256_storeu_si256((__m256i *)dest, ymm1); > + _mm256_storeu_si256((__m256i *)(dest + 32), ymm2); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 2), ymm3); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 3), ymm4); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 4), ymm5); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 5), ymm6); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 6), ymm7); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 7), ymm8); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 8), ymm9); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 9), ymm10); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 10), ymm11); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 11), ymm12); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 12), ymm13); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 13), ymm14); > + _mm256_storeu_si256((__m256i *)(dest + 32 * 14), ymm15); > +} > -- > 2.18.1 >
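
(An illustration for readers of the thread, not part of the review or the patch.) The wrong-code scenario described at the top of the patch can be seen with a reduced form of the new gcc.target/i386/pr82735-1.c test. Built with -mavx -O2 by a compiler without this fix, the value still held in a ymm register can be reused after the explicit _mm256_zeroupper (), so the second store may end up writing zeros in its upper 16 bytes:

    #include <immintrin.h>
    #include <string.h>
    #include <stdlib.h>

    /* Adapted from the patch's gcc.target/i386/pr82735-1.c; illustration
       only.  The noipa attribute keeps mtest from being inlined.  */
    void
    __attribute__ ((noipa))
    mtest (char *dest)
    {
      __m256i ymm1 = _mm256_set1_epi8 (1);
      _mm256_storeu_si256 ((__m256i *) (dest + 32), ymm1);
      _mm256_zeroupper ();                  /* Explicit vzeroupper.  */
      __m256i ymm2 = _mm256_set1_epi8 (1);  /* May be CSEd with ymm1.  */
      _mm256_storeu_si256 ((__m256i *) dest, ymm2);
    }

    int
    main (void)
    {
      char buf[64];
      memset (buf, 2, sizeof (buf));
      mtest (buf);
      for (int i = 0; i < 32; i++)
        if (buf[i] != 1)   /* Bytes 16..31 come out 0 when the bug triggers.  */
          abort ();
      return 0;
    }

With the patch, avx_vzeroupper expands to a call_insn using the special I386_VZEROUPPER callee ABI, so only the upper halves of the SSE registers are treated as clobbered and the stale 256-bit value is no longer reused across the vzeroupper.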