Hi Bill, Thanks for your prompt review!
on 2021/8/12 上午12:34, Bill Schmidt wrote: > Hi Kewen, > > FWIW, it's easier on reviewers if you include the patch inline instead of as > an attachment. > > On 8/11/21 1:56 AM, Kewen.Lin wrote: >> Hi, >> >> This patch is to add the support to make vectorizer able to >> vectorize scalar version of some built-in functions with its >> corresponding vector version with Power10 support. >> >> Bootstrapped & regtested on powerpc64le-linux-gnu {P9,P10} >> and powerpc64-linux-gnu P8. >> >> Is it ok for trunk? >> >> BR, >> Kewen >> ----- >> gcc/ChangeLog: >> >> * config/rs6000/rs6000.c (rs6000_builtin_md_vectorized_function): Add >> support for some built-in functions vectorized on Power10. >> >> gcc/testsuite/ChangeLog: >> >> * gcc.target/powerpc/dive-vectorize-1.c: New test. >> * gcc.target/powerpc/dive-vectorize-1.h: New test. >> * gcc.target/powerpc/dive-vectorize-2.c: New test. >> * gcc.target/powerpc/dive-vectorize-2.h: New test. >> * gcc.target/powerpc/dive-vectorize-run-1.c: New test. >> * gcc.target/powerpc/dive-vectorize-run-2.c: New test. >> * gcc.target/powerpc/p10-bifs-vectorize-1.c: New test. >> * gcc.target/powerpc/p10-bifs-vectorize-1.h: New test. >> * gcc.target/powerpc/p10-bifs-vectorize-run-1.c: New test. > > --- > gcc/config/rs6000/rs6000.c | 55 +++++++++++++++++++ > .../gcc.target/powerpc/dive-vectorize-1.c | 11 ++++ > .../gcc.target/powerpc/dive-vectorize-1.h | 22 ++++++++ > .../gcc.target/powerpc/dive-vectorize-2.c | 12 ++++ > .../gcc.target/powerpc/dive-vectorize-2.h | 22 ++++++++ > .../gcc.target/powerpc/dive-vectorize-run-1.c | 52 ++++++++++++++++++ > .../gcc.target/powerpc/dive-vectorize-run-2.c | 53 ++++++++++++++++++ > .../gcc.target/powerpc/p10-bifs-vectorize-1.c | 15 +++++ > .../gcc.target/powerpc/p10-bifs-vectorize-1.h | 40 ++++++++++++++ > .../powerpc/p10-bifs-vectorize-run-1.c | 45 +++++++++++++++ > 10 files changed, 327 insertions(+) > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c > create mode 100644 gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h > create mode 100644 > gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c > > diff --git a/gcc/config/rs6000/rs6000.c b/gcc/config/rs6000/rs6000.c > index 279f00cc648..3eac1d05101 100644 > --- a/gcc/config/rs6000/rs6000.c > +++ b/gcc/config/rs6000/rs6000.c > @@ -5785,6 +5785,61 @@ rs6000_builtin_md_vectorized_function (tree fndecl, > tree type_out, > default: > break; > } > + > + machine_mode in_vmode = TYPE_MODE (type_in); > + machine_mode out_vmode = TYPE_MODE (type_out); > + > + /* Power10 supported vectorized built-in functions. */ > + if (TARGET_POWER10 > + && in_vmode == out_vmode > + && VECTOR_UNIT_ALTIVEC_OR_VSX_P (in_vmode)) > + { > + machine_mode exp_mode = DImode; > + machine_mode exp_vmode = V2DImode; > + enum rs6000_builtins vname = RS6000_BUILTIN_COUNT; > > Using this as a flag value looks unnecessary. Is this just being done to > silence a warning? > Good question! I didn't notice there is a warning or not, just get used to initializing variable with one suitable value if possible. If you don't mind, may I still keep it? Since if some future codes use vname in a path where it's not assigned, one explicitly wrong enum (bif) seems better than a random one. Or will this mentioned possibility definitely never happen since the current uninitialized variables detection and warning scheme is robust and should not worry about that completely? > + switch (fn) > + { > + case MISC_BUILTIN_DIVWE: > + case MISC_BUILTIN_DIVWEU: > + exp_mode = SImode; > + exp_vmode = V4SImode; > + if (fn == MISC_BUILTIN_DIVWE) > + vname = P10V_BUILTIN_DIVES_V4SI; > + else > + vname = P10V_BUILTIN_DIVEU_V4SI; > + break; > + case MISC_BUILTIN_DIVDE: > + case MISC_BUILTIN_DIVDEU: > + if (fn == MISC_BUILTIN_DIVDE) > + vname = P10V_BUILTIN_DIVES_V2DI; > + else > + vname = P10V_BUILTIN_DIVEU_V2DI; > + break; > + case P10_BUILTIN_CFUGED: > + vname = P10V_BUILTIN_VCFUGED; > + break; > + case P10_BUILTIN_CNTLZDM: > + vname = P10V_BUILTIN_VCLZDM; > + break; > + case P10_BUILTIN_CNTTZDM: > + vname = P10V_BUILTIN_VCTZDM; > + break; > + case P10_BUILTIN_PDEPD: > + vname = P10V_BUILTIN_VPDEPD; > + break; > + case P10_BUILTIN_PEXTD: > + vname = P10V_BUILTIN_VPEXTD; > + break; > + default: > + return NULL_TREE; > + } > + > + if (vname != RS6000_BUILTIN_COUNT > > Check is not necessary, as you will have returned by now in that case. > Thanks for catching, I put break for "default" initially, didn't noticed the following condition need an adjustment after updating it to early return. Will fix it. BR, Kewen > Otherwise this patch LGTM. Thanks! Still needs maintainer approval, of > course. > > Bill > > + && in_mode == exp_mode > + && in_vmode == exp_vmode) > + return rs6000_builtin_decls[vname]; > + } > + > return NULL_TREE; > } > > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c > new file mode 100644 > index 00000000000..84f1b0a88f2 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.c > @@ -0,0 +1,11 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ > + > +/* Test if signed/unsigned int extended divisions get vectorized. */ > + > +#include "dive-vectorize-1.h" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > +/* { dg-final { scan-assembler-times {\mvdivesw\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvdiveuw\M} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h > new file mode 100644 > index 00000000000..119f637b46b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-1.h > @@ -0,0 +1,22 @@ > +#define N 128 > + > +typedef signed int si; > +typedef unsigned int ui; > + > +si si_a[N], si_b[N], si_c[N]; > +ui ui_a[N], ui_b[N], ui_c[N]; > + > +__attribute__ ((noipa)) void > +test_divwe () > +{ > + for (int i = 0; i < N; i++) > + si_c[i] = __builtin_divwe (si_a[i], si_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_divweu () > +{ > + for (int i = 0; i < N; i++) > + ui_c[i] = __builtin_divweu (ui_a[i], ui_b[i]); > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c > new file mode 100644 > index 00000000000..4db0085562f > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.c > @@ -0,0 +1,12 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ > + > +/* Test if signed/unsigned long long extended divisions get vectorized. */ > + > +#include "dive-vectorize-2.h" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 2 "vect" } } */ > +/* { dg-final { scan-assembler-times {\mvdivesd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvdiveud\M} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h > new file mode 100644 > index 00000000000..1cab56b2e0b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-2.h > @@ -0,0 +1,22 @@ > +#define N 128 > + > +typedef signed long long sLL; > +typedef unsigned long long uLL; > + > +sLL sll_a[N], sll_b[N], sll_c[N]; > +uLL ull_a[N], ull_b[N], ull_c[N]; > + > +__attribute__ ((noipa)) void > +test_divde () > +{ > + for (int i = 0; i < N; i++) > + sll_c[i] = __builtin_divde (sll_a[i], sll_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_divdeu () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_divdeu (ull_a[i], ull_b[i]); > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c > new file mode 100644 > index 00000000000..1d5cbaa9f9b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-1.c > @@ -0,0 +1,52 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target power10_hw } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model" } */ > + > +#include "dive-vectorize-1.h" > + > +/* Check if test cases with signed/unsigned int extended division > + vectorization run successfully. */ > + > +__attribute__ ((optimize (1))) void > +check_divwe () > +{ > + test_divwe (); > + for (int i = 0; i < N; i++) > + { > + si exp = __builtin_divwe (si_a[i], si_b[i]); > + if (exp != si_c[i]) > + __builtin_abort (); > + } > +} > + > +__attribute__ ((optimize (1))) void > +check_divweu () > +{ > + test_divweu (); > + for (int i = 0; i < N; i++) > + { > + ui exp = __builtin_divweu (ui_a[i], ui_b[i]); > + if (exp != ui_c[i]) > + __builtin_abort (); > + } > +} > + > +int > +main () > +{ > + for (int i = 0; i < N; i++) > + { > + si_a[i] = 0x10 * (i * 3 + 2); > + si_b[i] = 0x7890 * (i * 3 + 1); > + ui_a[i] = 0x234 * (i * 11 + 3) - 0xcd * (i * 5 - 7); > + ui_b[i] = 0x6078 * (i * 7 + 3) + 0xef * (i * 7 - 11); > + if (si_b[i] == 0 || ui_b[i] == 0) > + __builtin_abort (); > + } > + > + check_divwe (); > + check_divweu (); > + > + return 0; > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c > b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c > new file mode 100644 > index 00000000000..921b07b3f1b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/dive-vectorize-run-2.c > @@ -0,0 +1,53 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-require-effective-target power10_hw } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model" } */ > + > +#include "dive-vectorize-2.h" > + > +/* Check if test cases with signed/unsigned int extended division > + vectorization run successfully. */ > + > +__attribute__ ((optimize (1))) void > +check_divde () > +{ > + test_divde (); > + for (int i = 0; i < N; i++) > + { > + sLL exp = __builtin_divde (sll_a[i], sll_b[i]); > + if (exp != sll_c[i]) > + __builtin_abort (); > + } > +} > + > +__attribute__ ((optimize (1))) void > +check_divdeu () > +{ > + test_divdeu (); > + for (int i = 0; i < N; i++) > + { > + uLL exp = __builtin_divdeu (ull_a[i], ull_b[i]); > + if (exp != ull_c[i]) > + __builtin_abort (); > + } > +} > + > +int > +main () > +{ > + for (int i = 0; i < N; i++) > + { > + sll_a[i] = 0x102 * (i * 3 + 2); > + sll_b[i] = 0x789ab * (i * 3 + 1); > + ull_a[i] = 0x2345 * (i * 11 + 3) - 0xcd1 * (i * 5 - 7); > + ull_b[i] = 0x6078e * (i * 7 + 3) + 0xefa * (i * 7 - 11); > + if (sll_b[i] == 0 || ull_b[i] == 0) > + __builtin_abort (); > + } > + > + check_divde (); > + check_divdeu (); > + > + return 0; > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c > b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c > new file mode 100644 > index 00000000000..9b8b3642ead > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.c > @@ -0,0 +1,15 @@ > +/* { dg-do compile } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-require-effective-target power10_ok } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model -fno-unroll-loops -fdump-tree-vect-details" } */ > + > +/* Test if some Power10 built-in functions get vectorized. */ > + > +#include "p10-bifs-vectorize-1.h" > + > +/* { dg-final { scan-tree-dump-times "vectorized 1 loops" 5 "vect" } } */ > +/* { dg-final { scan-assembler-times {\mvcfuged\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvclzdm\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvctzdm\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvpdepd\M} 1 } } */ > +/* { dg-final { scan-assembler-times {\mvpextd\M} 1 } } */ > diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h > b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h > new file mode 100644 > index 00000000000..80b7aacf810 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-1.h > @@ -0,0 +1,40 @@ > +#define N 32 > + > +typedef unsigned long long uLL; > +uLL ull_a[N], ull_b[N], ull_c[N]; > + > +__attribute__ ((noipa)) void > +test_cfuged () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_cfuged (ull_a[i], ull_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_cntlzdm () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_cntlzdm (ull_a[i], ull_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_cnttzdm () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_cnttzdm (ull_a[i], ull_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_pdepd () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_pdepd (ull_a[i], ull_b[i]); > +} > + > +__attribute__ ((noipa)) void > +test_pextd () > +{ > + for (int i = 0; i < N; i++) > + ull_c[i] = __builtin_pextd (ull_a[i], ull_b[i]); > +} > + > diff --git a/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c > b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c > new file mode 100644 > index 00000000000..4b3b1165663 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/powerpc/p10-bifs-vectorize-run-1.c > @@ -0,0 +1,45 @@ > +/* { dg-do run } */ > +/* { dg-require-effective-target lp64 } */ > +/* { dg-require-effective-target power10_hw } */ > +/* { dg-options "-mdejagnu-cpu=power10 -O2 -ftree-vectorize > -fno-vect-cost-model" } */ > + > +#include "p10-bifs-vectorize-1.h" > + > +/* Check if vectorized built-in functions run expectedly. */ > + > +#define CHECK(name) > \ > + __attribute__ ((optimize (1))) void check_##name () > \ > + { > \ > + test_##name (); > \ > + for (int i = 0; i < N; i++) > \ > + { > \ > + uLL exp = __builtin_##name (ull_a[i], ull_b[i]); \ > + if (exp != ull_c[i]) \ > + __builtin_abort (); \ > + } > \ > + } > + > +CHECK (cfuged) > +CHECK (cntlzdm) > +CHECK (cnttzdm) > +CHECK (pdepd) > +CHECK (pextd) > + > +int > +main () > +{ > + for (int i = 0; i < N; i++) > + { > + ull_a[i] = 0x789a * (i * 11 - 5) - 0xcd1 * (i * 5 - 7); > + ull_b[i] = 0xfedc * (i * 7 + 3) + 0x467 * (i * 7 - 11); > + } > + > + check_cfuged (); > + check_cntlzdm (); > + check_cnttzdm (); > + check_pdepd (); > + check_pextd (); > + > + return 0; > +} > + > -- > 2.17.1 > > BR, Kewen