https://gcc.gnu.org/bugzilla/show_bug.cgi?id=125876
--- Comment #5 from Sarvesh Chandra <Sarvesh.Chandra at amd dot com> --- Comment on attachment 64772 --> https://gcc.gnu.org/bugzilla/attachment.cgi?id=64772 Patch >From c7e5a8224a83bfe0951840c62d40938ca00e56ba Mon Sep 17 00:00:00 2001 >From: Sarvesh Chandra <[email protected]> >Date: Thu, 18 Jun 2026 16:35:46 +0530 >Subject: [PATCH] i386: Match vunpcklpd before vmovddup for V4DF/V8DF > interleave > >An even-lane duplicate of a register operand matched the movddup >pattern, whose memory-only operand forced the register to be spilled >to the stack and reloaded. List the unpcklpd patterns before the >movddup patterns so a register interleave selects vunpcklpd, while a >genuine memory load still folds into vmovddup. > >PR target/107057 > >gcc/ChangeLog: > >* config/i386/sse.md (avx512f_unpcklpd512): Move before >avx512f_movddup512. >(avx_unpcklpd256): Move before avx_movddup256. > >gcc/testsuite/ChangeLog: > >* gcc.target/i386/avx512-movedup.c: New test. > >Co-authored-by: Ashwin Godbole <[email protected]> >Signed-off-by: Sarvesh Chandra <[email protected]> > >diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md >index 5d9ab8b14eb..f13232f0e7c 100644 >--- a/gcc/config/i386/sse.md >+++ b/gcc/config/i386/sse.md >@@ -13916,64 +13916,64 @@ > (set_attr "prefix" "orig,maybe_evex,orig,maybe_evex,maybe_vex") > (set_attr "mode" "V2DF,V2DF,V1DF,V1DF,V1DF")]) > >-(define_insn "avx512f_movddup512<mask_name>" >+(define_insn "avx512f_unpcklpd512<mask_name>" > [(set (match_operand:V8DF 0 "register_operand" "=v") > (vec_select:V8DF > (vec_concat:V16DF >- (match_operand:V8DF 1 "nonimmediate_operand" "m") >- (match_dup 1)) >+ (match_operand:V8DF 1 "register_operand" "v") >+ (match_operand:V8DF 2 "nonimmediate_operand" "vm")) > (parallel [(const_int 0) (const_int 8) > (const_int 2) (const_int 10) > (const_int 4) (const_int 12) > (const_int 6) (const_int 14)])))] > "TARGET_AVX512F" >- "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" >- [(set_attr "type" "ssemov") >+ "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" >+ [(set_attr "type" "sselog") > (set_attr "prefix" "evex") > (set_attr "mode" "V8DF")]) > >-(define_insn "avx512f_unpcklpd512<mask_name>" >+(define_insn "avx512f_movddup512<mask_name>" > [(set (match_operand:V8DF 0 "register_operand" "=v") > (vec_select:V8DF > (vec_concat:V16DF >- (match_operand:V8DF 1 "register_operand" "v") >- (match_operand:V8DF 2 "nonimmediate_operand" "vm")) >+ (match_operand:V8DF 1 "nonimmediate_operand" "m") >+ (match_dup 1)) > (parallel [(const_int 0) (const_int 8) > (const_int 2) (const_int 10) > (const_int 4) (const_int 12) > (const_int 6) (const_int 14)])))] > "TARGET_AVX512F" >- "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" >- [(set_attr "type" "sselog") >+ "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" >+ [(set_attr "type" "ssemov") > (set_attr "prefix" "evex") > (set_attr "mode" "V8DF")]) > > ;; Recall that the 256-bit unpck insns only shuffle within their lanes. >-(define_insn "avx_movddup256<mask_name>" >- [(set (match_operand:V4DF 0 "register_operand" "=v") >+(define_insn "avx_unpcklpd256<mask_name>" >+ [(set (match_operand:V4DF 0 "register_operand" "=v") > (vec_select:V4DF > (vec_concat:V8DF >- (match_operand:V4DF 1 "nonimmediate_operand" "m") >- (match_dup 1)) >+ (match_operand:V4DF 1 "register_operand" " v") >+ (match_operand:V4DF 2 "nonimmediate_operand" "vm")) > (parallel [(const_int 0) (const_int 4) > (const_int 2) (const_int 6)])))] > "TARGET_AVX && <mask_avx512vl_condition>" >- "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" >- [(set_attr "type" "ssemov") >+ "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" >+ [(set_attr "type" "sselog") > (set_attr "prefix" "<mask_prefix>") > (set_attr "mode" "V4DF")]) > >-(define_insn "avx_unpcklpd256<mask_name>" >- [(set (match_operand:V4DF 0 "register_operand" "=v") >+(define_insn "avx_movddup256<mask_name>" >+ [(set (match_operand:V4DF 0 "register_operand" "=v") > (vec_select:V4DF > (vec_concat:V8DF >- (match_operand:V4DF 1 "register_operand" " v") >- (match_operand:V4DF 2 "nonimmediate_operand" "vm")) >+ (match_operand:V4DF 1 "nonimmediate_operand" "m") >+ (match_dup 1)) > (parallel [(const_int 0) (const_int 4) > (const_int 2) (const_int 6)])))] > "TARGET_AVX && <mask_avx512vl_condition>" >- "vunpcklpd\t{%2, %1, %0<mask_operand3>|%0<mask_operand3>, %1, %2}" >- [(set_attr "type" "sselog") >+ "vmovddup\t{%1, %0<mask_operand2>|%0<mask_operand2>, %1}" >+ [(set_attr "type" "ssemov") > (set_attr "prefix" "<mask_prefix>") > (set_attr "mode" "V4DF")]) > >diff --git a/gcc/testsuite/gcc.target/i386/avx512-movedup.c >b/gcc/testsuite/gcc.target/i386/avx512-movedup.c >new file mode 100644 >index 00000000000..20e78dfcb16 >--- /dev/null >+++ b/gcc/testsuite/gcc.target/i386/avx512-movedup.c >@@ -0,0 +1,16 @@ >+/* { dg-do compile } */ >+/* { dg-options "-O2 -mavx512f" } */ >+/* { dg-final { scan-assembler "(?:vmovddup|vunpcklpd)\[ \\t\]+%zmm\[0-9\]+, >%zmm\[0-9\]+" } } */ >+/* { dg-final { scan-assembler "(?:vmovddup|vunpcklpd)\[ \\t\]+%ymm\[0-9\]+, >%ymm\[0-9\]+" } } */ >+/* { dg-final { scan-assembler "vmovddup\[ \\t\]+\\(" } } */ >+/* { dg-final { scan-assembler-not "vmovddup\[^\\n\]*\\(%\[er\]sp\\)" } } */ >+ >+#include <immintrin.h> >+ >+__m512d dup512 (__m512d x) { return _mm512_movedup_pd (x); } >+__m256d dup256 (__m256d x) { return _mm256_movedup_pd (x); } >+__m512d interleave512 (__m512d x) { return _mm512_unpacklo_pd (x, x); } >+__m256d interleave256 (__m256d x) { return _mm256_unpacklo_pd (x, x); } >+__m512d load512 (double *p) { return _mm512_movedup_pd (_mm512_loadu_pd (p)); >} >+ >+ >-- >2.34.1 >
