Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
On Fri, Sep 23, 2022 at 11:07 AM Hu, Lin1 wrote: > > Hi, Hongtao > > I have modified this patch and regtested on x86_64-pc-linux-gnu. > Ok. > BRs. > Lin > > -Original Message- > From: Hongtao Liu > Sent: Friday, September 23, 2022 9:48 AM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao > Subject: Re: [PATCH] i386: Optimize code generation of > __mm256_zextsi128_si256(__mm_set1_epi8(-1)) > > On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches > wrote: > > > > Hi all, > > > > This patch aims to optimize code generation of > > __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of > > instructions required to achieve the final result. > > > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > > > BRs, > > Lin > > > > gcc/ChangeLog: > > > > PR target/94962 > > * config/i386/constraints.md (BH): New define_constraint. > > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 > > when operand matches new predicate. > > (standard_sse_constant_opcode): Add new alternative branch to > > return "vpcmpeqd". > > * config/i386/predicates.md > > (vector_all_ones_zero_extend_half_operand): New define_predicate. > > (vector_all_ones_zero_extend_quarter_operand): Ditto. > > * config/i386/sse.md: Add constraint to insn "mov_internal". > (mov_internal): Add new constraint BH. > Put the insn name at first. > > > > gcc/testsuite/ChangeLog: > > > > PR target/94962 > > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > > * gcc.target/i386/pr94962-1.c: New test. > > * gcc.target/i386/pr94962-2.c: Ditto. > > * gcc.target/i386/pr94962-3.c: Ditto. > > * gcc.target/i386/pr94962-4.c: Ditto. 
> > --- > > gcc/config/i386/constraints.md| 8 +++ > > gcc/config/i386/i386.cc | 26 +++- > > gcc/config/i386/predicates.md | 49 ++ > > gcc/config/i386/sse.md| 8 +-- > > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > > .../i386/avx256-unaligned-store-1.c | 4 +- > > .../i386/avx256-unaligned-store-2.c | 4 +- > > .../i386/avx256-unaligned-store-3.c | 4 +- > > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 > > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + > > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ > > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ > > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > > > diff --git a/gcc/config/i386/constraints.md > > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > > --- a/gcc/config/i386/constraints.md > > +++ b/gcc/config/i386/constraints.md > > @@ -168,6 +168,9 @@ > > ;; z Constant call address operand. > > ;; C Integer SSE constant with all bits set operand. > > ;; F Floating-point SSE constant with all bits set operand. > > +;; H Integer SSE constant that is 128/256bit all ones > > +;; and zero-extend to 256/512bit, or 128bit all ones > > +;; and zero-extend to 512bit. > > ;; M x86-64 memory operand. > > > > (define_constraint "Bf" > > @@ -233,6 +236,11 @@ > >(and (match_test "TARGET_SSE") > > (match_operand 0 "float_vector_all_ones_operand"))) > > > > +(define_constraint "BH" > > + "@internal integer constant with last half/quarter bits set operand." 
> > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > > + (match_operand 0 > > +"vector_all_ones_zero_extend_quarter_operand"))) > > + > > ;; NB: Similar to 'm', but don't use define_memory_constraint on > > x86-64 ;; to prevent LRA from converting the operand to the form '(mem > > (reg X))' > > ;; where X is a base register. > > diff --git a/gcc/config/i386/i386.cc b/gcc/conf
RE: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
Hi, Hongtao I have modified this patch and regtested on x86_64-pc-linux-gnu. BRs. Lin -Original Message- From: Hongtao Liu Sent: Friday, September 23, 2022 9:48 AM To: Hu, Lin1 Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao Subject: Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1)) On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to optimize code generation of > __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of > instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when > operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return > "vpcmpeqd". > * config/i386/predicates.md > (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov_internal". (mov_internal): Add new constraint BH. Put the insn name at first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. 
> --- > gcc/config/i386/constraints.md| 8 +++ > gcc/config/i386/i386.cc | 26 +++- > gcc/config/i386/predicates.md | 49 ++ > gcc/config/i386/sse.md| 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ > 12 files changed, 235 insertions(+), 13 deletions(-) create mode > 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md > b/gcc/config/i386/constraints.md index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extend to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ >(and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 > +"vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on > x86-64 ;; to prevent LRA from converting the operand to the form '(mem (reg > X))' > ;; where X is a base register. 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc index > dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) >XFmode); } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) >
Re: [PATCH] i386: Optimize code generation of __mm256_zextsi128_si256(__mm_set1_epi8(-1))
On Thu, Sep 22, 2022 at 3:20 PM Hu, Lin1 via Gcc-patches wrote: > > Hi all, > > This patch aims to optimize code generation of > __mm256_zextsi128_si256(__mm_set1_epi8(-1)). Reduce the number of > instructions required to achieve the final result. > > Regtested on x86_64-pc-linux-gnu. Ok for trunk? > > BRs, > Lin > > gcc/ChangeLog: > > PR target/94962 > * config/i386/constraints.md (BH): New define_constraint. > * config/i386/i386.cc (standard_sse_constant_p): Add return 3/4 when > operand matches new predicate. > (standard_sse_constant_opcode): Add new alternative branch to return > "vpcmpeqd". > * config/i386/predicates.md > (vector_all_ones_zero_extend_half_operand): New define_predicate. > (vector_all_ones_zero_extend_quarter_operand): Ditto. > * config/i386/sse.md: Add constraint to insn "mov_internal". (mov_internal): Add new constraint BH. Put the insn name at first. > > gcc/testsuite/ChangeLog: > > PR target/94962 > * gcc.target/i386/avx256-unaligned-load-1.c: Modify test. > * gcc.target/i386/avx256-unaligned-store-1.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-2.c: Ditto. > * gcc.target/i386/avx256-unaligned-store-3.c: Ditto. > * gcc.target/i386/pr94962-1.c: New test. > * gcc.target/i386/pr94962-2.c: Ditto. > * gcc.target/i386/pr94962-3.c: Ditto. > * gcc.target/i386/pr94962-4.c: Ditto. 
> --- > gcc/config/i386/constraints.md| 8 +++ > gcc/config/i386/i386.cc | 26 +++- > gcc/config/i386/predicates.md | 49 ++ > gcc/config/i386/sse.md| 8 +-- > .../gcc.target/i386/avx256-unaligned-load-1.c | 4 +- > .../i386/avx256-unaligned-store-1.c | 4 +- > .../i386/avx256-unaligned-store-2.c | 4 +- > .../i386/avx256-unaligned-store-3.c | 4 +- > gcc/testsuite/gcc.target/i386/pr94962-1.c | 11 > gcc/testsuite/gcc.target/i386/pr94962-2.c | 17 + > gcc/testsuite/gcc.target/i386/pr94962-3.c | 64 +++ > gcc/testsuite/gcc.target/i386/pr94962-4.c | 49 ++ > 12 files changed, 235 insertions(+), 13 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr94962-4.c > > diff --git a/gcc/config/i386/constraints.md b/gcc/config/i386/constraints.md > index 7361687632f..95b2b142d41 100644 > --- a/gcc/config/i386/constraints.md > +++ b/gcc/config/i386/constraints.md > @@ -168,6 +168,9 @@ > ;; z Constant call address operand. > ;; C Integer SSE constant with all bits set operand. > ;; F Floating-point SSE constant with all bits set operand. > +;; H Integer SSE constant that is 128/256bit all ones > +;; and zero-extend to 256/512bit, or 128bit all ones > +;; and zero-extend to 512bit. > ;; M x86-64 memory operand. > > (define_constraint "Bf" > @@ -233,6 +236,11 @@ >(and (match_test "TARGET_SSE") > (match_operand 0 "float_vector_all_ones_operand"))) > > +(define_constraint "BH" > + "@internal integer constant with last half/quarter bits set operand." > + (ior (match_operand 0 "vector_all_ones_zero_extend_half_operand") > + (match_operand 0 "vector_all_ones_zero_extend_quarter_operand"))) > + > ;; NB: Similar to 'm', but don't use define_memory_constraint on x86-64 > ;; to prevent LRA from converting the operand to the form '(mem (reg X))' > ;; where X is a base register. 
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc > index dadf453d6c0..ca799da5d7e 100644 > --- a/gcc/config/i386/i386.cc > +++ b/gcc/config/i386/i386.cc > @@ -5186,7 +5186,8 @@ standard_80387_constant_rtx (int idx) >XFmode); > } > > -/* Return 1 if X is all bits 0 and 2 if X is all bits 1 > +/* Return 1 if X is all bits 0, 2 if X is all bits 1 > + and 3 if X is all bits 1 with zero extend > in supported SSE/AVX vector mode. */ > > int > @@ -5234,6 +5235,10 @@ standard_sse_constant_p (rtx x, machine_mode pred_mode) > } > } > > + if (vector_all_ones_zero_extend_half_operand (x, mode) > + || vector_all_ones_zero_extend_quarter_operand (x, mode)) > +return 3; > + >return 0; > } > > @@ -5341,6 +5346,25 @@ standard_sse_constant_opcode (rtx_insn *insn, rtx > *operands) > gcc_unreachable (); > } > } > + else if (vector_all_ones_zero_extend_half_operand (x, mode)) > +{ > + if (GET_MODE_SIZE (mode) == 64) > + { > + gcc_assert (TARGET_AVX512F); > + return "vpcmpeqd \t %t0, %t0, %t0"; > + } > + else if (GET_MODE_SIZE (mode) == 32) > + { > + gcc_assert (TARGET_AVX); > + return "vpcmpeqd \t %x0, %x0, %x0"; > + } > +