If a single instruction can store or move the whole block of memory, use a vector instruction and don't align the destination.
gcc/ PR target/121934 * config/i386/i386-expand.cc (ix86_expand_set_or_cpymem): If a single instruction can store or move the whole block of memory, use vector instruction and don't align destination. gcc/testsuite/ PR target/121934 * gcc.target/i386/pr121934-1a.c: New test. * gcc.target/i386/pr121934-1b.c: Likewise. * gcc.target/i386/pr121934-2a.c: Likewise. * gcc.target/i386/pr121934-2b.c: Likewise. * gcc.target/i386/pr121934-3a.c: Likewise. * gcc.target/i386/pr121934-3b.c: Likewise. * gcc.target/i386/pr121934-4a.c: Likewise. * gcc.target/i386/pr121934-4b.c: Likewise. * gcc.target/i386/pr121934-5a.c: Likewise. * gcc.target/i386/pr121934-5b.c: Likewise. Signed-off-by: H.J. Lu <hjl.to...@gmail.com> --- gcc/config/i386/i386-expand.cc | 62 +++++++++++++-------- gcc/testsuite/gcc.target/i386/pr121934-1a.c | 22 ++++++++ gcc/testsuite/gcc.target/i386/pr121934-1b.c | 7 +++ gcc/testsuite/gcc.target/i386/pr121934-2a.c | 23 ++++++++ gcc/testsuite/gcc.target/i386/pr121934-2b.c | 7 +++ gcc/testsuite/gcc.target/i386/pr121934-3a.c | 23 ++++++++ gcc/testsuite/gcc.target/i386/pr121934-3b.c | 7 +++ gcc/testsuite/gcc.target/i386/pr121934-4a.c | 23 ++++++++ gcc/testsuite/gcc.target/i386/pr121934-4b.c | 7 +++ gcc/testsuite/gcc.target/i386/pr121934-5a.c | 23 ++++++++ gcc/testsuite/gcc.target/i386/pr121934-5b.c | 7 +++ 11 files changed, 187 insertions(+), 24 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-1b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-2b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-3b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4a.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-4b.c create mode 100644 gcc/testsuite/gcc.target/i386/pr121934-5a.c create mode 100644 
gcc/testsuite/gcc.target/i386/pr121934-5b.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index dc26b3452cb..b0b9e6da946 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -9552,9 +9552,20 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, if (!issetmem) srcreg = ix86_copy_addr_to_reg (XEXP (src, 0)); + bool aligned_dstmem = false; + unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX; + bool single_insn_p = count && count <= nunits; + if (single_insn_p) + { + /* If it can be done with a single instruction, use vector + instruction and don't align destination. */ + alg = vector_loop; + noalign = true; + dynamic_check = -1; + } + unroll_factor = 1; move_mode = word_mode; - int nunits; switch (alg) { case libcall: @@ -9576,7 +9587,6 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, need_zero_guard = true; unroll_factor = 4; /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */ - nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX; nunits /= GET_MODE_SIZE (word_mode); if (nunits > 1) { @@ -9629,28 +9639,32 @@ ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp, } gcc_assert (desired_align >= 1 && align >= 1); - /* Misaligned move sequences handle both prologue and epilogue at once. - Default code generation results in a smaller code for large alignments - and also avoids redundant job when sizes are known precisely. */ - misaligned_prologue_used - = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES - && MAX (desired_align, epilogue_size_needed) <= 32 - && desired_align <= epilogue_size_needed - && ((desired_align > align && !align_bytes) - || (!count && epilogue_size_needed > 1))); - - /* Destination is aligned after the misaligned prologue. 
*/ - bool aligned_dstmem = misaligned_prologue_used; - - if (noalign && !misaligned_prologue_used) - { - /* Also use misaligned prologue if alignment isn't needed and - destination isn't aligned. Since alignment isn't needed, - the destination after prologue won't be aligned. */ - aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode) - <= MEM_ALIGN (dst)); - if (!aligned_dstmem) - misaligned_prologue_used = true; + if (!single_insn_p) + { + /* Misaligned move sequences handle both prologue and epilogue + at once. Default code generation results in a smaller code + for large alignments and also avoids redundant job when sizes + are known precisely. */ + misaligned_prologue_used + = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES + && MAX (desired_align, epilogue_size_needed) <= 32 + && desired_align <= epilogue_size_needed + && ((desired_align > align && !align_bytes) + || (!count && epilogue_size_needed > 1))); + + /* Destination is aligned after the misaligned prologue. */ + aligned_dstmem = misaligned_prologue_used; + + if (noalign && !misaligned_prologue_used) + { + /* Also use misaligned prologue if alignment isn't needed and + destination isn't aligned. Since alignment isn't needed, + the destination after prologue won't be aligned. 
*/ + aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode) + <= MEM_ALIGN (dst)); + if (!aligned_dstmem) + misaligned_prologue_used = true; + } } /* Do the cheap promotion to allow better CSE across the diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1a.c b/gcc/testsuite/gcc.target/i386/pr121934-1a.c new file mode 100644 index 00000000000..6b6881367db --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-1a.c @@ -0,0 +1,22 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */ + +extern int f(); +int a, b, c, d[3]; +void g() { + int h; + if (f()) { + if (b) + i: + c > 0; + a = 0; + for (h = 0; h < 3; h++) { + if (a != 1) + __builtin_printf("0\n"); + d[h] = -1; + } + goto i; + } +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-1b.c b/gcc/testsuite/gcc.target/i386/pr121934-1b.c new file mode 100644 index 00000000000..47381ec3476 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-1b.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ + +#include "pr121934-1a.c" + +/* { dg-final { scan-assembler-not "rep stos" } } */ +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2a.c b/gcc/testsuite/gcc.target/i386/pr121934-2a.c new file mode 100644 index 00000000000..49def11aa4e --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-2a.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre" } */ + +extern int f(); +int a, b, c; +long long int d[3]; +void g() { + int h; + if (f()) { + if (b) + i: + c > 0; + a = 0; + for (h = 0; h < 3; h++) { + if (a != 1) + 
__builtin_printf("0\n"); + d[h] = (long long int) -1; + } + goto i; + } +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-2b.c b/gcc/testsuite/gcc.target/i386/pr121934-2b.c new file mode 100644 index 00000000000..1c634dfe420 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-2b.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ + +#include "pr121934-2a.c" + +/* { dg-final { scan-assembler-not "rep stos" } } */ +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3a.c b/gcc/testsuite/gcc.target/i386/pr121934-3a.c new file mode 100644 index 00000000000..0c04b69c0d4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-3a.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2" } */ + +extern int f(); +int a, b, c; +_BitInt(128) d[3]; +void g() { + int h; + if (f()) { + if (b) + i: + c > 0; + a = 0; + for (h = 0; h < 3; h++) { + if (a != 1) + __builtin_printf("0\n"); + d[h] = (_BitInt(128)) -1; + } + goto i; + } +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-3b.c b/gcc/testsuite/gcc.target/i386/pr121934-3b.c new file mode 100644 index 00000000000..ff4b0831cea --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-3b.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -msse2 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ + +#include "pr121934-3a.c" + +/* { dg-final { scan-assembler-not "rep stos" } } */ +/* { dg-final { scan-assembler-not "movb\[ 
\\t\]+\\\$-1" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4a.c b/gcc/testsuite/gcc.target/i386/pr121934-4a.c new file mode 100644 index 00000000000..5aa3e069cff --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-4a.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256" } */ + +extern int f(); +int a, b, c; +_BitInt(256) d[3]; +void g() { + int h; + if (f()) { + if (b) + i: + c > 0; + a = 0; + for (h = 0; h < 3; h++) { + if (a != 1) + __builtin_printf("0\n"); + d[h] = (_BitInt(256)) -1; + } + goto i; + } +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-4b.c b/gcc/testsuite/gcc.target/i386/pr121934-4b.c new file mode 100644 index 00000000000..5f8241dcad5 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-4b.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mno-avx512f -mavx -mprefer-vector-width=256 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ + +#include "pr121934-4a.c" + +/* { dg-final { scan-assembler-not "rep stos" } } */ +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5a.c b/gcc/testsuite/gcc.target/i386/pr121934-5a.c new file mode 100644 index 00000000000..10be0dd4343 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-5a.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512" } */ + +extern int f(); +int a, b, c; +_BitInt(512) d[3]; +void g() { + int h; + if (f()) { + if (b) + i: + c > 0; + a = 0; + for (h = 0; h < 3; h++) { + if (a != 1) + __builtin_printf("0\n"); 
+ d[h] = (_BitInt(512)) -1; + } + goto i; + } +} + +/* { dg-final { scan-assembler-not "rep stos" } } */ diff --git a/gcc/testsuite/gcc.target/i386/pr121934-5b.c b/gcc/testsuite/gcc.target/i386/pr121934-5b.c new file mode 100644 index 00000000000..6a45a8a7a8b --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr121934-5b.c @@ -0,0 +1,7 @@ +/* { dg-do compile } */ +/* { dg-options "-O3 -fno-tree-dominator-opts -fno-tree-vrp -fno-tree-ccp -fno-tree-forwprop -fno-tree-pre -fno-tree-fre -mavx512f -mprefer-vector-width=512 -mmemset-strategy=rep_byte:8192:align,libcall:-1:noalign" } */ + +#include "pr121934-5a.c" + +/* { dg-final { scan-assembler-not "rep stos" } } */ +/* { dg-final { scan-assembler-not "movb\[ \\t\]+\\\$-1" } } */ -- 2.51.0