On Thu, Oct 12, 2017 at 9:11 PM, Jakub Jelinek <ja...@redhat.com> wrote:
> On Thu, Oct 12, 2017 at 10:40:22AM +0200, Uros Bizjak wrote:
>> > So, if you aren't against it, I can extend the patch to handle the 4
>> > other mask patterns; as for other modes, SImode is what is being handled
>> > already, DImode is not a problem, because the FEs truncate the shift counts
>> > to integer_type_node already, and for HImode I haven't seen a problem,
>> > probably because most tunings avoid HImode math and so it isn't worth
>> > optimizing.
>>
>> OK, I think that we can live with 4 new patterns.  Since these are all
>> written in the same way (as in the patch you posted), the amended
>> patch is pre-approved for mainline.
>
> Thanks, here is what I've committed to trunk after another bootstrap/regtest
> on x86_64-linux and i686-linux:
>
> 2017-10-12  Jakub Jelinek  <ja...@redhat.com>
>
>       PR target/82498
>       * config/i386/i386.md (*ashl<mode>3_mask_1,
>       *<shift_insn><mode>3_mask_1, *<rotate_insn><mode>3_mask_1,
>       *<btsc><mode>_mask_1, *btr<mode>_mask_1): New define_insn_and_split
>       patterns.
>
>       * gcc.target/i386/pr82498-1.c: New test.
>       * gcc.target/i386/pr82498-2.c: New test.
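For context: the transformation these patterns perform is only valid because
ANDing a shift or rotate count with an all-ones low-bit mask cannot change the
count modulo the operand width, which is all the hardware instruction looks
at.  A minimal sketch checking that for the rotate idiom used in the new tests
(illustrative only; rotl_ref and rotl_masked are made-up names, not part of
the patch):

/* Illustrative sketch only: for every 8-bit count, the masked rotate idiom
   from the new tests agrees with a reference rotate, i.e. the "y &= 31"
   step never changes the result and can be dropped once the hardware shift
   truncates the count itself.  */
#include <assert.h>

static unsigned
rotl_ref (unsigned x, unsigned n)
{
  unsigned bits = __CHAR_BIT__ * __SIZEOF_INT__;
  n %= bits;
  return n ? (x << n) | (x >> (bits - n)) : x;
}

static unsigned
rotl_masked (unsigned x, unsigned char y)
{
  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
}

int
main (void)
{
  for (unsigned y = 0; y < 256; y++)
    assert (rotl_masked (0x12345678U, (unsigned char) y)
            == rotl_ref (0x12345678U, y));
  return 0;
}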
OK for mainline.

Thanks,
Uros.

> --- gcc/config/i386/i386.md.jj  2017-10-11 22:37:55.933863355 +0200
> +++ gcc/config/i386/i386.md     2017-10-12 11:30:38.191535974 +0200
> @@ -10228,6 +10228,26 @@ (define_insn_and_split "*ashl<mode>3_mas
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*ashl<mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (ashift:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (ASHIFT, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (ashift:SWI48 (match_dup 1)
> +                        (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "*bmi2_ashl<mode>3_1"
>    [(set (match_operand:SWI48 0 "register_operand" "=r")
>        (ashift:SWI48 (match_operand:SWI48 1 "nonimmediate_operand" "rm")
> @@ -10728,6 +10748,26 @@ (define_insn_and_split "*<shift_insn><mo
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*<shift_insn><mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (any_shiftrt:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_shiftrt:SWI48 (match_dup 1)
> +                             (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn_and_split "*<shift_insn><mode>3_doubleword"
>    [(set (match_operand:DWI 0 "register_operand" "=&r")
>        (any_shiftrt:DWI (match_operand:DWI 1 "register_operand" "0")
> @@ -11187,6 +11227,26 @@ (define_insn_and_split "*<rotate_insn><m
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[2] = gen_lowpart (QImode, operands[2]);")
>
> +(define_insn_and_split "*<rotate_insn><mode>3_mask_1"
> +  [(set (match_operand:SWI48 0 "nonimmediate_operand")
> +       (any_rotate:SWI48
> +         (match_operand:SWI48 1 "nonimmediate_operand")
> +         (and:QI
> +           (match_operand:QI 2 "register_operand")
> +           (match_operand:QI 3 "const_int_operand"))))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "ix86_binary_operator_ok (<CODE>, <MODE>mode, operands)
> +   && (INTVAL (operands[3]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_rotate:SWI48 (match_dup 1)
> +                            (match_dup 2)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  ;; Implement rotation using two double-precision
>  ;; shift instructions and a scratch register.
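The insn condition is the same in all five new patterns, just spelled with
INTVAL and GET_MODE_BITSIZE; transcribed to plain C for 32-bit (SImode)
operands it simply asks whether the low 5 bits of the mask are all ones.  A
small sketch of that check (illustrative only; mask_is_redundant_si is a
made-up name, not part of the patch):

/* Illustrative only: the mask test the new *_mask_1 patterns use, written
   out for SImode.  A mask qualifies when ANDing the shift count with it
   cannot change the count modulo 32, i.e. its low 5 bits are all set.  */
#include <stdio.h>

static int
mask_is_redundant_si (int mask)
{
  /* Mirrors (INTVAL (operands[N]) & (GET_MODE_BITSIZE (SImode)-1))
     == GET_MODE_BITSIZE (SImode)-1.  */
  return (mask & 31) == 31;
}

int
main (void)
{
  int masks[] = { 31, 63, 0xff, -1, 15, 30 };
  for (unsigned i = 0; i < sizeof masks / sizeof masks[0]; i++)
    printf ("mask %#x: %s\n", (unsigned) masks[i],
            mask_is_redundant_si (masks[i]) ? "AND can be dropped" : "AND kept");
  return 0;
}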
>
> @@ -11494,6 +11554,30 @@ (define_insn_and_split "*<btsc><mode>_ma
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[1] = gen_lowpart (QImode, operands[1]);")
>
> +(define_insn_and_split "*<btsc><mode>_mask_1"
> +  [(set (match_operand:SWI48 0 "register_operand")
> +       (any_or:SWI48
> +         (ashift:SWI48
> +           (const_int 1)
> +           (and:QI
> +             (match_operand:QI 1 "register_operand")
> +             (match_operand:QI 2 "const_int_operand")))
> +         (match_operand:SWI48 3 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT
> +   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (any_or:SWI48
> +            (ashift:SWI48 (const_int 1)
> +                          (match_dup 1))
> +            (match_dup 3)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  (define_insn "*btr<mode>"
>    [(set (match_operand:SWI48 0 "register_operand" "=r")
>        (and:SWI48
> @@ -11535,6 +11619,30 @@ (define_insn_and_split "*btr<mode>_mask"
>        (clobber (reg:CC FLAGS_REG))])]
>    "operands[1] = gen_lowpart (QImode, operands[1]);")
>
> +(define_insn_and_split "*btr<mode>_mask_1"
> +  [(set (match_operand:SWI48 0 "register_operand")
> +       (and:SWI48
> +         (rotate:SWI48
> +           (const_int -2)
> +           (and:QI
> +             (match_operand:QI 1 "register_operand")
> +             (match_operand:QI 2 "const_int_operand")))
> +         (match_operand:SWI48 3 "register_operand")))
> +   (clobber (reg:CC FLAGS_REG))]
> +  "TARGET_USE_BT
> +   && (INTVAL (operands[2]) & (GET_MODE_BITSIZE (<MODE>mode)-1))
> +      == GET_MODE_BITSIZE (<MODE>mode)-1
> +   && can_create_pseudo_p ()"
> +  "#"
> +  "&& 1"
> +  [(parallel
> +     [(set (match_dup 0)
> +          (and:SWI48
> +            (rotate:SWI48 (const_int -2)
> +                          (match_dup 1))
> +            (match_dup 3)))
> +      (clobber (reg:CC FLAGS_REG))])])
> +
>  ;; These instructions are never faster than the corresponding
>  ;; and/ior/xor operations when using immediate operand, so with
>  ;; 32-bit there's no point.  But in 64-bit, we can't hold the
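Why the btr patterns match (and (rotate (const_int -2) count) x): rotating
~1 left by count moves its single zero bit to position count, so the result
is ~(1 << count), exactly the mask behind "x & ~(1U << y)" that btr
implements.  A quick check of that identity (illustrative only; rotl32 is a
made-up helper, not from the patch):

/* Illustrative only: rotating -2 (~1) left by n equals ~(1 << n), the mask
   used by "x & ~(1U << y)", which is what btr computes directly.  */
#include <assert.h>
#include <stdint.h>

static uint32_t
rotl32 (uint32_t x, unsigned n)
{
  n &= 31;
  return n ? (x << n) | (x >> (32 - n)) : x;
}

int
main (void)
{
  for (unsigned n = 0; n < 32; n++)
    assert (rotl32 ((uint32_t) -2, n) == ~(UINT32_C (1) << n));
  return 0;
}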
> --- gcc/testsuite/gcc.target/i386/pr82498-1.c.jj       2017-10-12 11:18:48.905128703 +0200
> +++ gcc/testsuite/gcc.target/i386/pr82498-1.c  2017-10-12 11:18:48.905128703 +0200
> @@ -0,0 +1,52 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
> +
> +unsigned
> +f1 (unsigned x, unsigned char y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f2 (unsigned x, unsigned y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f3 (unsigned x, unsigned short y)
> +{
> +  if (y == 0)
> +    return x;
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (__CHAR_BIT__ * __SIZEOF_INT__ - y));
> +}
> +
> +unsigned
> +f4 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f5 (unsigned x, unsigned int y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> +
> +unsigned
> +f6 (unsigned x, unsigned short y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x << y) | (x >> (-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)));
> +}
> --- gcc/testsuite/gcc.target/i386/pr82498-2.c.jj       2017-10-12 12:14:53.452321121 +0200
> +++ gcc/testsuite/gcc.target/i386/pr82498-2.c  2017-10-12 12:08:53.000000000 +0200
> @@ -0,0 +1,46 @@
> +/* PR target/82498 */
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -mtune=generic -masm=att" } */
> +/* { dg-final { scan-assembler-not {\mand[bwlq]\M} } } */
> +
> +int
> +f1 (int x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x >> y;
> +}
> +
> +unsigned
> +f2 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x >> y;
> +}
> +
> +unsigned
> +f3 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x << y;
> +}
> +
> +unsigned
> +f4 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x | (1U << y);
> +}
> +
> +unsigned
> +f5 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return x ^ (1U << y);
> +}
> +
> +unsigned
> +f6 (unsigned x, unsigned char y)
> +{
> +  y &= __CHAR_BIT__ * __SIZEOF_INT__ - 1;
> +  return (x + 2) & ~(1U << y);
> +}
>
>       Jakub
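One detail of pr82498-1.c worth spelling out: f1-f3 keep the
"if (y == 0) return x;" guard because their fallback right-shift count would
be 32 for y == 0, which is undefined for a 32-bit int, whereas the "-y & 31"
form in f4-f6 already folds that case to a shift by zero.  A tiny
illustration (not part of the tests; assumes the usual 32-bit int of the x86
targets):

/* Illustrative only: the y == 0 corner case that separates f1-f3 from
   f4-f6 in pr82498-1.c.  Assumes int is 32 bits wide, as on x86.  */
#include <assert.h>

int
main (void)
{
  unsigned char y = 0;

  /* Guarded form: without the early return, the right-shift count below
     would be 32, out of range for a 32-bit int.  */
  assert (__CHAR_BIT__ * __SIZEOF_INT__ - y == 32);

  /* Unguarded form: -0 & 31 is 0, a valid shift count, so no guard needed.  */
  assert ((-y & (__CHAR_BIT__ * __SIZEOF_INT__ - 1)) == 0);
  return 0;
}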