Hi Marc,

Here's version #2 of the patch to recognize bswap32 and bswap64, incorporating your suggestions and feedback.  The test cases now confirm the transformation is applied when int is 32 bits and long is 64 bits, and should pass otherwise; the patterns now reuse (more) capturing groups; and the patterns have been made more generic, allowing the ultimate type to be signed or unsigned (hence there are now two new gcc.dg tests).
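For reference, the 64-bit idiom the new patterns canonicalize looks like this (a minimal sketch mirroring the swap64 test case below; the bswap16/bswap32 case is analogous):

    unsigned long swap64(unsigned long x)
    {
      unsigned long a = __builtin_bswap32(x);  /* swap the low 32 bits */
      x >>= 32;
      a <<= 32;
      return __builtin_bswap32(x) | a;         /* folds to __builtin_bswap64(x) */
    }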
Alas, my efforts to allow the input argument to be signed, using fold_convert to coerce it to the correct type before calling __builtin_bswap, failed with the following error messages:

> fold-bswap-2.c: In function 'swap64':
> fold-bswap-2.c:22:1: error: invalid argument to gimple call
> (long unsigned int) x_6(D)
> _12 = __builtin_bswap64 ((long unsigned int) x_6(D));
> during GIMPLE pass: forwprop
> fold-bswap-2.c:22:1: internal compiler error: verify_gimple failed

So I require the arguments to be of the expected type for now.  If anyone's sufficiently motivated to support these cases, this can be done as a follow-up patch.

This revised patch has been tested on x86_64-pc-linux-gnu with a "make bootstrap" and "make -k check" with no new failures.  Ok for mainline?

Thanks in advance,
Roger
--

-----Original Message-----
From: Marc Glisse <marc.gli...@inria.fr>
Sent: 12 August 2020 10:43
To: Roger Sayle <ro...@nextmovesoftware.com>
Cc: 'GCC Patches' <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH] middle-end: Recognize idioms for bswap32 and bswap64 in match.pd.

On Wed, 12 Aug 2020, Roger Sayle wrote:

> This patch is inspired by a small code fragment in comment #3 of
> bugzilla PR rtl-optimization/94804.  That snippet appears almost
> unrelated to the topic of the PR, but recognizing __builtin_bswap64
> from two __builtin_bswap32 calls seems like a clever/useful trick.
> GCC's optabs.c contains the inverse logic to expand bswap64 by IORing
> two bswap32 calls, so this transformation/canonicalization is safe,
> even on targets without suitable optab support.  But on x86_64, the
> swap64 of the test case becomes a single instruction.
>
> This patch has been tested on x86_64-pc-linux-gnu with a "make
> bootstrap" and a "make -k check" with no new failures.
> Ok for mainline?

Your tests seem to assume that int has 32 bits and long 64.

+   (if (operand_equal_p (@0, @2, 0)

Why not reuse @0 instead of introducing @2 in the pattern?  Similarly,
it may be a bit shorter to reuse @1 instead of a new @3 (I don't think
the tricks with @@ will be needed here).

+	&& types_match (TREE_TYPE (@0), uint64_type_node)

That seems very specific.  What goes wrong with a signed type, for
instance?

+(simplify
+  (bit_ior:c
+    (lshift
+      (convert (BUILT_IN_BSWAP16 (convert (bit_and @0 INTEGER_CST@1))))
+      (INTEGER_CST@2))
+    (convert (BUILT_IN_BSWAP16 (convert (rshift @3 INTEGER_CST@4)))))

I didn't realize we kept this useless bit_and when casting to a smaller
type.  We probably get a different pattern on 16-bit targets, but a
pattern they do not match won't hurt them.

--
Marc Glisse
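For reference, the unsupported signed-input case mentioned above would look something like the following hypothetical variant of the new tests (not handled by this patch; the fold would need a valid conversion of x to unsigned long first):

    unsigned long swap64(long x)  /* signed input argument: currently rejected */
    {
      unsigned long a = __builtin_bswap32(x);
      x >>= 32;                   /* arithmetic, not logical, shift on a signed type */
      a <<= 32;
      return __builtin_bswap32(x) | a;
    }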
diff --git a/gcc/match.pd b/gcc/match.pd
index c3b8816..c682d3d 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -3410,6 +3410,33 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (bswap (bitop:c (bswap @0) @1))
   (bitop @0 (bswap @1)))))
 
+/* Recognize ((T)bswap32(x)<<32)|bswap32(x>>32) as bswap64(x).  */
+(simplify
+  (bit_ior:c
+    (lshift (convert (BUILT_IN_BSWAP32 (convert@0 @1)))
+	    INTEGER_CST@2)
+    (convert (BUILT_IN_BSWAP32 (convert@3 (rshift @1 @2)))))
+  (if (INTEGRAL_TYPE_P (type)
+       && TYPE_PRECISION (type) == 64
+       && types_match (TREE_TYPE (@1), uint64_type_node)
+       && types_match (TREE_TYPE (@0), uint32_type_node)
+       && types_match (TREE_TYPE (@3), uint32_type_node)
+       && wi::to_widest (@2) == 32)
+    (convert (BUILT_IN_BSWAP64 @1))))
+
+/* Recognize ((T)bswap16(x)<<16)|bswap16(x>>16) as bswap32(x).  */
+(simplify
+  (bit_ior:c
+    (lshift
+      (convert (BUILT_IN_BSWAP16 (convert (bit_and @0 INTEGER_CST@1))))
+      (INTEGER_CST@2))
+    (convert (BUILT_IN_BSWAP16 (convert (rshift @0 @2)))))
+  (if (INTEGRAL_TYPE_P (type)
+       && TYPE_PRECISION (type) == 32
+       && types_match (TREE_TYPE (@0), uint32_type_node)
+       && wi::to_widest (@1) == 65535
+       && wi::to_widest (@2) == 16)
+    (convert (BUILT_IN_BSWAP32 @0))))
+
 /* Combine COND_EXPRs and VEC_COND_EXPRs.  */
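As a quick sanity check of the identity the first pattern relies on, the following standalone program (not part of the patch; it assumes 32-bit int and 64-bit long, as the tests do) asserts that the idiom and __builtin_bswap64 agree:

    #include <assert.h>

    int main(void)
    {
      unsigned long x = 0x0102030405060708UL;
      unsigned long a = (unsigned long) __builtin_bswap32(x) << 32;
      unsigned long b = __builtin_bswap32(x >> 32);
      assert ((a | b) == __builtin_bswap64(x));  /* both 0x0807060504030201 */
      return 0;
    }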
diff --git a/gcc/testsuite/gcc.dg/fold-bswap-1.c b/gcc/testsuite/gcc.dg/fold-bswap-1.c
new file mode 100644
index 0000000..3abb862
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bswap-1.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+unsigned int swap32(unsigned int x)
+{
+  if (sizeof(unsigned int)==4 && sizeof(unsigned short)==2) {
+    unsigned int a = __builtin_bswap16(x);
+    x >>= 16;
+    a <<= 16;
+    return __builtin_bswap16(x) | a;
+  } else return __builtin_bswap32(x);
+}
+
+unsigned long swap64(unsigned long x)
+{
+  if (sizeof(unsigned long)==8 && sizeof(unsigned int)==4) {
+    unsigned long a = __builtin_bswap32(x);
+    x >>= 32;
+    a <<= 32;
+    return __builtin_bswap32(x) | a;
+  } else return __builtin_bswap64(x);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 1 "optimized" } } */
+
diff --git a/gcc/testsuite/gcc.dg/fold-bswap-2.c b/gcc/testsuite/gcc.dg/fold-bswap-2.c
new file mode 100644
index 0000000..a581fd6
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/fold-bswap-2.c
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int swap32(unsigned int x)
+{
+  if (sizeof(int)==4 && sizeof(short)==2) {
+    int a = __builtin_bswap16(x);
+    x >>= 16;
+    a <<= 16;
+    return __builtin_bswap16(x) | a;
+  } else return __builtin_bswap32(x);
+}
+
+long swap64(unsigned long x)
+{
+  if (sizeof(long)==8 && sizeof(int)==4) {
+    long a = __builtin_bswap32(x);
+    x >>= 32;
+    a <<= 32;
+    return __builtin_bswap32(x) | a;
+  } else return __builtin_bswap64(x);
+}
+
+/* { dg-final { scan-tree-dump-times "__builtin_bswap32" 1 "optimized" } } */
+/* { dg-final { scan-tree-dump-times "__builtin_bswap64" 1 "optimized" } } */
+
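If it's useful for review, the two new tests can be run on their own from the build directory with the usual dejagnu invocation (adjust to your setup):

    make check-gcc RUNTESTFLAGS="dg.exp=fold-bswap-1.c fold-bswap-2.c"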