I am quoting my analysis from the PR. Could an aarch64 expert
pontificate here?
This test is checking the final assembly for a specific sequence. I
don't speak aarch64 assembly, but the IL is different coming out of evrp.
The first culprit is this difference in the mergephi1 dump:
_9 = .CTZ (x_6(D));
- _10 = _9 & 31;
+ _10 = _9;
These are unsigned ints, so assuming they are 32 bits on aarch64,
__builtin_ctz always returns a value less than 32. This is because a CTZ
of 0 is undefined according to the GCC manual, so the optimizer may
assume the argument is nonzero:
[[
Built-in Function: int __builtin_ctz (unsigned int x)
Returns the number of trailing 0-bits in x, starting at the least
significant bit position. If x is 0, the result is undefined.
]]
So a bitwise AND of a value known to be less than 32 with 0x1f (31) is a no-op.
Here are the full IL differences:
--- legacy-evrp/pr90838.c.038t.mergephi1 2020-10-07
08:44:12.152358885 -0400
+++ ranger/pr90838.c.038t.mergephi1 2020-10-07 08:39:12.339296502 -0400
@@ -1,41 +1,41 @@
;; Function ctz1 (ctz1, funcdef_no=0, decl_uid=3587, cgraph_uid=1,
symbol_order=0)
ctz1 (unsigned int x)
{
static const char table[32] =
"\x00\x01\x1c\x02\x1d\x0e\x18\x03\x1e\x16\x14\x0f\x19\x11\x04\b\x1f\x1b\r\x17\x15\x13\x10\x07\x1a\f\x12\x06\v\x05\n\t";
unsigned int _1;
unsigned int _2;
unsigned int _3;
unsigned int _4;
char _5;
int _9;
int _10;
<bb 2> :
_1 = -x_6(D);
_2 = _1 & x_6(D);
_3 = _2 * 125613361;
_4 = _3 >> 27;
_9 = .CTZ (x_6(D));
- _10 = _9 & 31;
+ _10 = _9;
_5 = (char) _10;
return _10;
}
;; Function ctz2 (ctz2, funcdef_no=1, decl_uid=3591, cgraph_uid=2,
symbol_order=1)
ctz2 (unsigned int x)
{
static short int table[64] = {32, 0, 1, 12, 2, 6, 0, 13, 3, 0, 7, 0,
0, 0, 0, 14, 10, 4, 0, 0, 8, 0, 0, 25, 0, 0, 0, 0, 0, 21, 27, 15, 31,
11, 5, 0, 0, 0, 0, 0, 9, 0, 0,
24, 0, 0, 20, 26, 30, 0, 0, 0, 0, 23, 0, 19, 29, 0, 22, 18, 28, 17, 16, 0};
unsigned int _1;
unsigned int _2;
unsigned int _3;
short int _4;
int _8;
<bb 2> :
_1 = -x_5(D);
@@ -87,27 +87,27 @@
;; Function ctz4 (ctz4, funcdef_no=3, decl_uid=3601, cgraph_uid=4,
symbol_order=5)
ctz4 (long unsigned int x)
{
long unsigned int lsb;
long unsigned int _1;
long long unsigned int _2;
long long unsigned int _3;
char _4;
int _9;
int _10;
<bb 2> :
_1 = -x_5(D);
lsb_6 = _1 & x_5(D);
_2 = lsb_6 * 283881067100198605;
_3 = _2 >> 58;
_9 = .CTZ (x_5(D));
- _10 = _9 & 63;
+ _10 = _9;
_4 = (char) _10;
return _10;
}
The difference in assembly matches. We have two fewer ANDs in the final
output:
$ diff -u legacy.s ranger.s
--- legacy.s 2020-10-07 09:06:13.420446783 -0400
+++ ranger.s 2020-10-07 09:06:42.646646949 -0400
@@ -8,7 +8,6 @@
ctz1:
rbit w0, w0
clz w0, w0
- and w0, w0, 31
ret
.size ctz1, .-ctz1
.align 2
@@ -36,7 +35,6 @@
ctz4:
rbit x0, x0
clz x0, x0
- and w0, w0, 63
ret
.size ctz4, .-ctz4
If my analysis is correct, we could simply remove the line checking for
"and", or perhaps check that no ANDs appear at all.
OK for trunk?
Aldy
gcc/testsuite/ChangeLog:
PR target/97312
* gcc.target/aarch64/pr90838.c: Remove scan for AND.
diff --git a/gcc/testsuite/gcc.target/aarch64/pr90838.c
b/gcc/testsuite/gcc.target/aarch64/pr90838.c
index e1e19ac6a61..76cd5e18d2e 100644
--- a/gcc/testsuite/gcc.target/aarch64/pr90838.c
+++ b/gcc/testsuite/gcc.target/aarch64/pr90838.c
@@ -60,5 +60,4 @@ int ctz4 (unsigned long x)
}
/* { dg-final { scan-assembler-times "clz\t" 4 } } */
-/* { dg-final { scan-assembler-times "and\t" 2 } } */
/* { dg-final { scan-assembler-not "cmp\t.*0" } } */