Hi,

In PR88760, there was some discussion about improving or tuning the unroller
for targets, and we agreed to enable the unroller for small loops at -O2 first.
We can see a performance improvement (~10%) for the code below:
```
  subroutine foo (i, i1, block)
    integer :: i, i1
    integer :: block(9, 9, 9)
    block(i:9,1,i1) = block(i:9,1,i1) - 10
  end subroutine foo

```
This kind of code occurs a few times in the exchange2 benchmark.

Similar C code:
```
  for (i = 0; i < n; i++)
    arr[i] = arr[i] - 10;
```

On powerpc64le, at -O2, with -funroll-loops enabled and limited to
PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20, we can see a >2%
overall improvement for SPEC2017.

This patch is only for rs6000, where we see a visible performance improvement.

Bootstrapped on powerpc64le, and the affected test cases are updated. Is this
OK for trunk?

Jiufu Guo
BR


gcc/
2019-10-25  Jiufu Guo  <guoji...@linux.ibm.com>     

        PR tree-optimization/88760
        * common/config/rs6000/rs6000-common.c
        (rs6000_option_optimization_table): Enable -funroll-loops at -O2 and
        above.
        * config/rs6000/rs6000.c (rs6000_option_override_internal): If the
        unroller is enabled through -O2, set the constraints
        PARAM_MAX_UNROLL_TIMES=2 and PARAM_MAX_UNROLLED_INSNS=20 for small
        loops.
        
gcc/testsuite/
2019-10-25  Jiufu Guo  <guoji...@linux.ibm.com>

        PR tree-optimization/88760
        * gcc.target/powerpc/small-loop-unroll.c: New test.
        * c-c++-common/tsan/thread_leak2.c: Update test.
        * gcc.dg/pr59643.c: Update test.
        * gcc.target/powerpc/loop_align.c: Update test.
        * gcc.target/powerpc/ppc-fma-1.c: Update test.
        * gcc.target/powerpc/ppc-fma-2.c: Update test.
        * gcc.target/powerpc/ppc-fma-3.c: Update test.
        * gcc.target/powerpc/ppc-fma-4.c: Update test.
        * gcc.target/powerpc/pr78604.c: Update test.

---
 gcc/common/config/rs6000/rs6000-common.c             |  1 +
 gcc/config/rs6000/rs6000.c                           | 20 ++++++++++++++++++++
 gcc/testsuite/c-c++-common/tsan/thread_leak2.c       |  1 +
 gcc/testsuite/gcc.dg/pr59643.c                       |  1 +
 gcc/testsuite/gcc.target/powerpc/loop_align.c        |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c         |  2 +-
 gcc/testsuite/gcc.target/powerpc/pr78604.c           |  2 +-
 gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c | 13 +++++++++++++
 11 files changed, 42 insertions(+), 6 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c

diff --git a/gcc/common/config/rs6000/rs6000-common.c 
b/gcc/common/config/rs6000/rs6000-common.c
index 4b0c205..b947196 100644
--- a/gcc/common/config/rs6000/rs6000-common.c
+++ b/gcc/common/config/rs6000/rs6000-common.c
@@ -35,6 +35,7 @@ static const struct default_options 
rs6000_option_optimization_table[] =
     { OPT_LEVELS_ALL, OPT_fsplit_wide_types_early, NULL, 1 },
     /* Enable -fsched-pressure for first pass instruction scheduling.  */
     { OPT_LEVELS_1_PLUS, OPT_fsched_pressure, NULL, 1 },
+    { OPT_LEVELS_2_PLUS, OPT_funroll_loops, NULL, 1 },
     { OPT_LEVELS_NONE, 0, NULL, 0 }
   };
 
diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c
index a129137d..9a8ff9a 100644
--- a/gcc/config/rs6000/rs6000.c
+++ b/gcc/config/rs6000/rs6000.c
@@ -4540,6 +4540,26 @@ rs6000_option_override_internal (bool global_init_p)
                             global_options.x_param_values,
                             global_options_set.x_param_values);
 
+      /* Unroll very small loops 2 times if -funroll-loops is not given.  */
+      if (!global_options_set.x_flag_unroll_loops
+         && !global_options_set.x_flag_unroll_all_loops)
+       {
+         maybe_set_param_value (PARAM_MAX_UNROLL_TIMES, 2,
+                                global_options.x_param_values,
+                                global_options_set.x_param_values);
+
+         maybe_set_param_value (PARAM_MAX_UNROLLED_INSNS, 20,
+                                global_options.x_param_values,
+                                global_options_set.x_param_values);
+
+         /* If -fweb or -frename-registers is not specified on the command
+            line, do not turn it on implicitly.  */
+         if (!global_options_set.x_flag_web)
+           global_options.x_flag_web = 0;
+         if (!global_options_set.x_flag_rename_registers)
+           global_options.x_flag_rename_registers = 0;
+       }
+
       /* If using typedef char *va_list, signal that
         __builtin_va_start (&ap, 0) can be optimized to
         ap = __builtin_next_arg (0).  */
diff --git a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c 
b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
index c9b8046..17aa5c6 100644
--- a/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
+++ b/gcc/testsuite/c-c++-common/tsan/thread_leak2.c
@@ -1,4 +1,5 @@
 /* { dg-shouldfail "tsan" } */
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* 
powerpc64-*-* powerpc64le-*-* } } } */
 
 #include <pthread.h>
 #include <unistd.h>
diff --git a/gcc/testsuite/gcc.dg/pr59643.c b/gcc/testsuite/gcc.dg/pr59643.c
index de78d60..3aef439 100644
--- a/gcc/testsuite/gcc.dg/pr59643.c
+++ b/gcc/testsuite/gcc.dg/pr59643.c
@@ -1,6 +1,7 @@
 /* PR tree-optimization/59643 */
 /* { dg-do compile } */
 /* { dg-options "-O3 -fdump-tree-pcom-details" } */
+/* { dg-additional-options "-fno-unroll-loops" { target { powerpc-*-* 
powerpc64-*-* powerpc64le-*-* } } } */
 
 void
 foo (double *a, double *b, double *c, double d, double e, int n)
diff --git a/gcc/testsuite/gcc.target/powerpc/loop_align.c 
b/gcc/testsuite/gcc.target/powerpc/loop_align.c
index ebe3782..ef67f77 100644
--- a/gcc/testsuite/gcc.target/powerpc/loop_align.c
+++ b/gcc/testsuite/gcc.target/powerpc/loop_align.c
@@ -1,6 +1,6 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* powerpc-ibm-aix* } } */
-/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16" } */
+/* { dg-options "-O2 -mdejagnu-cpu=power7 -falign-functions=16 
-fno-unroll-loops" } */
 /* { dg-final { scan-assembler ".p2align 5" } } */
 
 void f(double *a, double *b, double *c, unsigned long n) {
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c 
b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
index b4945e6..2a5b92c 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-1.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math 
-fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 4 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c 
b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
index 5ed630a..bf2c67f 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-2.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_vsx_ok } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math 
-ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power7 -ffast-math 
-ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "xvmadd" 2 } } */
 /* { dg-final { scan-assembler-times "xsmadd\|fmadd\ " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c 
b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
index ef252b3..8608116 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-3.c
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec 
-ffast-math" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec 
-ffast-math -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 2 } } */
 /* { dg-final { scan-assembler-times "fmadd " 2 } } */
 /* { dg-final { scan-assembler-times "fmadds" 2 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c 
b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
index c2eaf1a..291c2ee 100644
--- a/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
+++ b/gcc/testsuite/gcc.target/powerpc/ppc-fma-4.c
@@ -2,7 +2,7 @@
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_altivec_ok } */
 /* { dg-require-effective-target powerpc_fprs } */
-/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec 
-ffast-math -ffp-contract=off" } */
+/* { dg-options "-O3 -ftree-vectorize -mdejagnu-cpu=power6 -maltivec 
-ffast-math -ffp-contract=off -fno-unroll-loops" } */
 /* { dg-final { scan-assembler-times "vmaddfp" 1 } } */
 /* { dg-final { scan-assembler-times "fmadd " 1 } } */
 /* { dg-final { scan-assembler-times "fmadds" 1 } } */
diff --git a/gcc/testsuite/gcc.target/powerpc/pr78604.c 
b/gcc/testsuite/gcc.target/powerpc/pr78604.c
index 76d8945..35bfdb3 100644
--- a/gcc/testsuite/gcc.target/powerpc/pr78604.c
+++ b/gcc/testsuite/gcc.target/powerpc/pr78604.c
@@ -1,7 +1,7 @@
 /* { dg-do compile { target { powerpc*-*-* } } } */
 /* { dg-skip-if "" { powerpc*-*-darwin* } } */
 /* { dg-require-effective-target powerpc_p8vector_ok } */
-/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize 
-fdump-tree-vect-details" } */
+/* { dg-options "-mdejagnu-cpu=power8 -O2 -ftree-vectorize 
-fdump-tree-vect-details -fno-unroll-loops" } */
 
 #ifndef SIZE
 #define SIZE 1024
diff --git a/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c 
b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
new file mode 100644
index 0000000..888ea50b
--- /dev/null
+++ b/gcc/testsuite/gcc.target/powerpc/small-loop-unroll.c
@@ -0,0 +1,13 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-loop2_unroll" } */
+
+void __attribute__ ((noinline)) foo(int n, int *arr)
+{
+  int i;
+  for (i = 0; i < n; i++)
+    arr[i] = arr[i] - 10;
+}
+/* { dg-final { scan-rtl-dump-times "Unrolled loop 1 times" 1 "loop2_unroll" } 
} */
+/* { dg-final { scan-assembler-times "lwz" 3 } } */
+/* { dg-final { scan-assembler-times "stw" 3 } } */
+
-- 
2.7.4

Reply via email to