arch12 provides vector population count instructions for element sizes larger than a byte (vpopcth, vpopctf, vpopctg), in addition to the byte variant (vpopctb).
gcc/testsuite/ChangeLog: 2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> * gcc.target/s390/vxe/popcount-1.c: New test. gcc/ChangeLog: 2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> * config/s390/vector.md ("popcountv16qi2", "popcountv8hi2") ("popcountv4si2", "popcountv2di2"): Rename to ... ("popcount<mode>2", "popcountv8hi2_vx", "popcountv4si2_vx") ("popcountv2di2_vx"): ... these and add !TARGET_VXE to the condition. ("popcount<mode>2_vxe"): New pattern. --- gcc/ChangeLog | 9 +++ gcc/config/s390/vector.md | 38 ++++++++--- gcc/testsuite/ChangeLog | 4 ++ gcc/testsuite/gcc.target/s390/vxe/popcount-1.c | 88 ++++++++++++++++++++++++++ 4 files changed, 131 insertions(+), 8 deletions(-) create mode 100644 gcc/testsuite/gcc.target/s390/vxe/popcount-1.c diff --git a/gcc/ChangeLog b/gcc/ChangeLog index 89e7906..d516b4d 100644 --- a/gcc/ChangeLog +++ b/gcc/ChangeLog @@ -1,5 +1,14 @@ 2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> + * config/s390/vector.md ("popcountv16qi2", "popcountv8hi2") + ("popcountv4si2", "popcountv2di2"): Rename to ... + ("popcount<mode>2", "popcountv8hi2_vx", "popcountv4si2_vx") + ("popcountv2di2_vx"): ... these and add !TARGET_VXE to the + condition. + ("popcount<mode>2_vxe"): New pattern. + +2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> + * common/config/s390/s390-common.c (processor_flags_table): Add arch12. * config.gcc: Add arch12. 
diff --git a/gcc/config/s390/vector.md b/gcc/config/s390/vector.md index 68a8ed0..d4c0e95 100644 --- a/gcc/config/s390/vector.md +++ b/gcc/config/s390/vector.md @@ -715,11 +715,33 @@ ; Vector population count -(define_insn "popcountv16qi2" +(define_expand "popcount<mode>2" + [(set (match_operand:VI_HW 0 "register_operand" "=v") + (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")] + UNSPEC_POPCNT))] + "TARGET_VX" +{ + if (TARGET_VXE) + emit_insn (gen_popcount<mode>2_vxe (operands[0], operands[1])); + else + emit_insn (gen_popcount<mode>2_vx (operands[0], operands[1])); + DONE; +}) + +; vpopctb, vpopcth, vpopctf, vpopctg +(define_insn "popcount<mode>2_vxe" + [(set (match_operand:VI_HW 0 "register_operand" "=v") + (unspec:VI_HW [(match_operand:VI_HW 1 "register_operand" "v")] + UNSPEC_POPCNT))] + "TARGET_VXE" + "vpopct<bhfgq>\t%v0,%v1" + [(set_attr "op_type" "VRR")]) + +(define_insn "popcountv16qi2_vx" [(set (match_operand:V16QI 0 "register_operand" "=v") (unspec:V16QI [(match_operand:V16QI 1 "register_operand" "v")] UNSPEC_POPCNT))] - "TARGET_VX" + "TARGET_VX && !TARGET_VXE" "vpopct\t%v0,%v1,0" [(set_attr "op_type" "VRR")]) @@ -729,7 +751,7 @@ ; of the result, add it to the result and extend it to halfword ; element size (unpack). 
-(define_expand "popcountv8hi2" +(define_expand "popcountv8hi2_vx" [(set (match_dup 2) (unspec:V16QI [(subreg:V16QI (match_operand:V8HI 1 "register_operand" "v") 0)] UNSPEC_POPCNT)) @@ -761,7 +783,7 @@ (and:V8HI (subreg:V8HI (match_dup 2) 0) (subreg:V8HI (match_dup 3) 0))) ] - "TARGET_VX" + "TARGET_VX && !TARGET_VXE" { operands[2] = gen_reg_rtx (V16QImode); operands[3] = gen_reg_rtx (V16QImode); @@ -769,20 +791,20 @@ operands[5] = CONST0_RTX (V16QImode); }) -(define_expand "popcountv4si2" +(define_expand "popcountv4si2_vx" [(set (match_dup 2) (unspec:V16QI [(subreg:V16QI (match_operand:V4SI 1 "register_operand" "v") 0)] UNSPEC_POPCNT)) (set (match_operand:V4SI 0 "register_operand" "=v") (unspec:V4SI [(match_dup 2) (match_dup 3)] UNSPEC_VEC_VSUM))] - "TARGET_VX" + "TARGET_VX && !TARGET_VXE" { operands[2] = gen_reg_rtx (V16QImode); operands[3] = force_reg (V16QImode, CONST0_RTX (V16QImode)); }) -(define_expand "popcountv2di2" +(define_expand "popcountv2di2_vx" [(set (match_dup 2) (unspec:V16QI [(subreg:V16QI (match_operand:V2DI 1 "register_operand" "v") 0)] UNSPEC_POPCNT)) @@ -792,7 +814,7 @@ (set (match_operand:V2DI 0 "register_operand" "=v") (unspec:V2DI [(match_dup 3) (match_dup 5)] UNSPEC_VEC_VSUMG))] - "TARGET_VX" + "TARGET_VX && !TARGET_VXE" { operands[2] = gen_reg_rtx (V16QImode); operands[3] = gen_reg_rtx (V4SImode); diff --git a/gcc/testsuite/ChangeLog b/gcc/testsuite/ChangeLog index bbdd3c8..6d178c5 100644 --- a/gcc/testsuite/ChangeLog +++ b/gcc/testsuite/ChangeLog @@ -1,5 +1,9 @@ 2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> + * gcc.target/s390/vxe/popcount-1.c: New test. + +2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> + * gcc.target/s390/vxe/bitops-1.c: New test. 
2017-03-24 Andreas Krebbel <kreb...@linux.vnet.ibm.com> diff --git a/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c new file mode 100644 index 0000000..9ea835a --- /dev/null +++ b/gcc/testsuite/gcc.target/s390/vxe/popcount-1.c @@ -0,0 +1,88 @@ +/* { dg-do run } */ +/* { dg-options "-O3 -mzarch -march=arch12 --save-temps" } */ +/* { dg-require-effective-target s390_vxe } */ + +/* Vectorization currently only works for v4si. v8hi at least uses 2x + vpopctf but no vpopcth. */ + +typedef unsigned char uv16qi __attribute__((vector_size(16))); +typedef unsigned short uv8hi __attribute__((vector_size(16))); +typedef unsigned int uv4si __attribute__((vector_size(16))); +typedef unsigned long long uv2di __attribute__((vector_size(16))); + +uv16qi __attribute__((noinline)) +vpopctb (uv16qi a) +{ + uv16qi r; + int i; + + for (i = 0; i < 16; i++) + r[i] = __builtin_popcount (a[i]); + + return r; +} +/* { dg-final { scan-assembler "vpopctb\t%v24,%v24" { xfail *-*-* } } } */ + +uv8hi __attribute__((noinline)) +vpopcth (uv8hi a) +{ + uv8hi r; + int i; + + for (i = 0; i < 8; i++) + r[i] = __builtin_popcount (a[i]); + + return r; +} +/* { dg-final { scan-assembler "vpopcth\t%v24,%v24" { xfail *-*-* } } } */ + +uv4si __attribute__((noinline)) +vpopctf (uv4si a) +{ + uv4si r; + int i; + + for (i = 0; i < 4; i++) + r[i] = __builtin_popcount (a[i]); + + return r; +} +/* { dg-final { scan-assembler "vpopctf\t%v24,%v24" } } */ + +uv2di __attribute__((noinline)) +vpopctg (uv2di a) +{ + uv2di r; + int i; + + for (i = 0; i < 2; i++) + r[i] = __builtin_popcount (a[i]); + + return r; +} +/* { dg-final { scan-assembler "vpopctg\t%v24,%v24" { xfail *-*-* } } } */ + +int +main () +{ + uv16qi a = (uv16qi){ 42, 1, ~0, 2, 42, 1, ~0, 2, 42, 1, ~0, 2, 42, 1, ~0, 2 }; + if (__builtin_s390_vec_any_ne (vpopctb (a), + (uv16qi){ 3, 1, 8, 1, 3, 1, 8, 1, + 3, 1, 8, 1, 3, 1, 8, 1 })) + __builtin_abort (); + + if (__builtin_s390_vec_any_ne (vpopcth ((uv8hi){ 
42, 1, ~0, 2, 42, 1, ~0, 2 }), + (uv8hi){ 3, 1, 16, 1, 3, 1, 16, 1 })) + __builtin_abort (); + + if (__builtin_s390_vec_any_ne (vpopctf ((uv4si){ 42, 1, ~0, 2 }), + (uv4si){ 3, 1, 32, 1 })) + __builtin_abort (); + + if (__builtin_s390_vec_any_ne (vpopctg ((uv2di){ 42, 1 }), + (uv2di){ 3, 1 })) + __builtin_abort (); + + + return 0; +} -- 2.9.1