I am quoting my analysis from the PR. Could an aarch64 expert pontificate here?

This test is checking the final assembly for a specific sequence. I don't speak aarch64 assembly, but the IL is different coming out of evrp.

The first culprit is this difference in the mergephi1 dump:

   _9 = .CTZ (x_6(D));
-  _10 = _9 & 31;
+  _10 = _9;

These are unsigned ints, so assuming they are 32 bits on aarch64, the result of __builtin_ctz is always less than 32: the only input that could produce 32 is 0, and a CTZ of 0 is undefined according to the GCC manual:

[[
Built-in Function: int __builtin_ctz (unsigned int x)

Returns the number of trailing 0-bits in x, starting at the least significant bit position. If x is 0, the result is undefined.
]]

So a bitwise AND of anything less than 32 with 0x1f (31) is a no-op.
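For what it's worth, here is a minimal standalone sketch of that reasoning (my own example, not taken from the testcase; ctz_masked is a made-up name): for any nonzero 32-bit unsigned int, __builtin_ctz returns a value in [0, 31], so ANDing it with 31 cannot change it.

  #include <assert.h>

  /* Hypothetical helper, not part of the testcase: mirrors the
     .CTZ + mask pattern that ranger folds away.  */
  static int
  ctz_masked (unsigned int x)
  {
    int c = __builtin_ctz (x);	/* Undefined for x == 0.  */
    return c & 31;		/* Redundant: c is already in [0, 31].  */
  }

  int
  main (void)
  {
    /* Every nonzero power of two exercises every possible ctz value
       0..31; the mask never changes the result.  */
    for (unsigned int x = 1; x != 0; x <<= 1)
      assert (ctz_masked (x) == __builtin_ctz (x));
    return 0;
  }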

Here are the full IL differences:

--- legacy-evrp/pr90838.c.038t.mergephi1 2020-10-07 08:44:12.152358885 -0400
+++ ranger/pr90838.c.038t.mergephi1     2020-10-07 08:39:12.339296502 -0400
@@ -1,41 +1,41 @@

;; Function ctz1 (ctz1, funcdef_no=0, decl_uid=3587, cgraph_uid=1, symbol_order=0)

 ctz1 (unsigned int x)
 {
static const char table[32] = "\x00\x01\x1c\x02\x1d\x0e\x18\x03\x1e\x16\x14\x0f\x19\x11\x04\b\x1f\x1b\r\x17\x15\x13\x10\x07\x1a\f\x12\x06\v\x05\n\t";
   unsigned int _1;
   unsigned int _2;
   unsigned int _3;
   unsigned int _4;
   char _5;
   int _9;
   int _10;

   <bb 2> :
   _1 = -x_6(D);
   _2 = _1 & x_6(D);
   _3 = _2 * 125613361;
   _4 = _3 >> 27;
   _9 = .CTZ (x_6(D));
-  _10 = _9 & 31;
+  _10 = _9;
   _5 = (char) _10;
   return _10;

 }



;; Function ctz2 (ctz2, funcdef_no=1, decl_uid=3591, cgraph_uid=2, symbol_order=1)

 ctz2 (unsigned int x)
 {
static short int table[64] = {32, 0, 1, 12, 2, 6, 0, 13, 3, 0, 7, 0, 0, 0, 0, 14, 10, 4, 0, 0, 8, 0, 0, 25, 0, 0, 0, 0, 0, 21, 27, 15, 31, 11, 5, 0, 0, 0, 0, 0, 9, 0, 0,
24, 0, 0, 20, 26, 30, 0, 0, 0, 0, 23, 0, 19, 29, 0, 22, 18, 28, 17, 16, 0};
   unsigned int _1;
   unsigned int _2;
   unsigned int _3;
   short int _4;
   int _8;

   <bb 2> :
   _1 = -x_5(D);
@@ -87,27 +87,27 @@


;; Function ctz4 (ctz4, funcdef_no=3, decl_uid=3601, cgraph_uid=4, symbol_order=5)

 ctz4 (long unsigned int x)
 {
   long unsigned int lsb;
   long unsigned int _1;
   long long unsigned int _2;
   long long unsigned int _3;
   char _4;
   int _9;
   int _10;

   <bb 2> :
   _1 = -x_5(D);
   lsb_6 = _1 & x_5(D);
   _2 = lsb_6 * 283881067100198605;
   _3 = _2 >> 58;
   _9 = .CTZ (x_5(D));
-  _10 = _9 & 63;
+  _10 = _9;
   _4 = (char) _10;
   return _10;

 }

The difference in assembly matches: the final output has two fewer ANDs:

$ diff -u legacy.s ranger.s
--- legacy.s    2020-10-07 09:06:13.420446783 -0400
+++ ranger.s    2020-10-07 09:06:42.646646949 -0400
@@ -8,7 +8,6 @@
 ctz1:
        rbit    w0, w0
        clz     w0, w0
-       and     w0, w0, 31
        ret
        .size   ctz1, .-ctz1
        .align  2
@@ -36,7 +35,6 @@
 ctz4:
        rbit    x0, x0
        clz     x0, x0
-       and     w0, w0, 63
        ret
        .size   ctz4, .-ctz4

If my analysis is correct, those two ANDs are exactly the masks ranger removed in the IL, so we could just remove the line checking for "and", or perhaps check that no ANDs remain at all (see the sketch below).
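For the latter, something along these lines should work (untested sketch, reusing the regex from the directive being removed):

  /* { dg-final { scan-assembler-not "and\t" } } */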

OK for trunk?
Aldy

    gcc/testsuite/ChangeLog:

            PR target/97312
            * gcc.target/aarch64/pr90838.c: Remove scan for AND.

diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c b/gcc/testsuite/gcc.target/aarch64/pr90838.c
index e1e19ac6a61..76cd5e18d2e 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr90838.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c
@@ -60,5 +60,4 @@ int ctz4 (unsigned long x)
 }

 /* { dg-final { scan-assembler-times "clz\t" 4 } } */
-/* { dg-final { scan-assembler-times "and\t" 2 } } */
 /* { dg-final { scan-assembler-not "cmp\t.*0" } } */
