(I'll cross-post this to gcc and keep it on gcc-help after that.)
On Thu, Oct 6, 2011 at 4:46 PM, Andrew Haley <[email protected]> wrote:
>
> inline int8_t as_signed_8 (unsigned int a) {
>     a &= 0xff;
>     return a & 0x80 ? (int)a - 0x100 : a;
> }
>
> int overflow(unsigned int a, unsigned int b) {
>     int sum = as_signed_8(a) + as_signed_8(b);
>     return as_signed_8(sum) != sum;
> }
>
> Andrew.
>
That's a really neat trick, and it seems to generate identical code. Thanks!
It would be interesting to know whether this version produces equally
efficient code with MSVC.
To summarize what we have so far, here are four different methods along
with the code generated for X86 and ARM (GCC 4.5.2):
#include <inttypes.h>

// Reinterpret the low 8 bits of a as a signed value without relying on
// an out-of-range cast.
inline int8_t as_signed_8(unsigned int a) {
    a &= 0xff;
    return a & 0x80 ? (int)a - 0x100 : a;
}

// Overflow iff the exact sum falls outside the int8_t range.
bool overflow_range(unsigned int a, unsigned int b) {
    const int sum = as_signed_8(a) + as_signed_8(b);
    return sum < -128 || sum > 127;
}

// Overflow iff a and b agree on the sign bit but the sum's sign bit differs.
bool overflow_bit(unsigned int a, unsigned int b) {
    const unsigned int sum = a + b;
    return ~(a ^ b) & (a ^ sum) & 0x80;
}

// Relies on implementation-defined conversion of out-of-range values to int8_t.
bool overflow_unsafe(unsigned int a, unsigned int b) {
    const unsigned int sum = (int8_t)a + (int8_t)b;
    return (int8_t)sum != sum;
}

// Overflow iff the sum does not survive a round trip through as_signed_8.
bool overflow_safe(unsigned int a, unsigned int b) {
    const int sum = as_signed_8(a) + as_signed_8(b);
    return as_signed_8(sum) != sum;
}
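As a quick sanity check (my own addition, not part of the listings below), a
small brute-force harness dropped into the same translation unit can confirm
that all four methods agree for every pair of byte inputs:

#include <cstdio>

int main() {
    // Exhaustively cross-check all 256 * 256 input pairs; the four
    // predicates should agree on whether the signed 8-bit sum overflows.
    for (unsigned int a = 0; a < 256; ++a) {
        for (unsigned int b = 0; b < 256; ++b) {
            const bool r = overflow_range(a, b);
            if (overflow_bit(a, b) != r ||
                overflow_unsafe(a, b) != r ||
                overflow_safe(a, b) != r) {
                std::printf("mismatch at a=%u b=%u\n", a, b);
                return 1;
            }
        }
    }
    std::printf("all four methods agree\n");
    return 0;
}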
Output for X86 with -O3 -fomit-frame-pointer:
00000000 <_Z14overflow_rangejj>:
0: 0f be 54 24 04 movsbl 0x4(%esp),%edx
5: 0f be 44 24 08 movsbl 0x8(%esp),%eax
a: 8d 84 02 80 00 00 00 lea 0x80(%edx,%eax,1),%eax
11: 3d ff 00 00 00 cmp $0xff,%eax
16: 0f 97 c0 seta %al
19: c3 ret
1a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
00000020 <_Z12overflow_bitjj>:
20: 8b 54 24 08 mov 0x8(%esp),%edx
24: 8b 4c 24 04 mov 0x4(%esp),%ecx
28: 89 d0 mov %edx,%eax
2a: 31 c8 xor %ecx,%eax
2c: 01 ca add %ecx,%edx
2e: 31 ca xor %ecx,%edx
30: f7 d0 not %eax
32: 21 d0 and %edx,%eax
34: a8 80 test $0x80,%al
36: 0f 95 c0 setne %al
39: c3 ret
3a: 8d b6 00 00 00 00 lea 0x0(%esi),%esi
00000040 <_Z15overflow_unsafejj>:
40: 0f be 54 24 08 movsbl 0x8(%esp),%edx
45: 0f be 44 24 04 movsbl 0x4(%esp),%eax
4a: 8d 04 02 lea (%edx,%eax,1),%eax
4d: 0f be d0 movsbl %al,%edx
50: 39 c2 cmp %eax,%edx
52: 0f 95 c0 setne %al
55: c3 ret
56: 8d 76 00 lea 0x0(%esi),%esi
59: 8d bc 27 00 00 00 00 lea 0x0(%edi,%eiz,1),%edi
00000060 <_Z13overflow_safejj>:
60: 0f be 54 24 08 movsbl 0x8(%esp),%edx
65: 0f be 44 24 04 movsbl 0x4(%esp),%eax
6a: 8d 04 02 lea (%edx,%eax,1),%eax
6d: 0f be d0 movsbl %al,%edx
70: 39 c2 cmp %eax,%edx
72: 0f 95 c0 setne %al
75: c3 ret
Output for ARM with -O3 -fomit-frame-pointer -mthumb -march=armv7:
00000000 <_Z14overflow_rangejj>:
0: b249 sxtb r1, r1
2: b240 sxtb r0, r0
4: 1808 adds r0, r1, r0
6: 3080 adds r0, #128 ; 0x80
8: 28ff cmp r0, #255 ; 0xff
a: bf94 ite ls
c: 2000 movls r0, #0
e: 2001 movhi r0, #1
10: 4770 bx lr
12: bf00 nop
14: f3af 8000 nop.w
18: f3af 8000 nop.w
1c: f3af 8000 nop.w
00000020 <_Z12overflow_bitjj>:
20: 180b adds r3, r1, r0
22: 4041 eors r1, r0
24: ea83 0200 eor.w r2, r3, r0
28: ea22 0001 bic.w r0, r2, r1
2c: f3c0 10c0 ubfx r0, r0, #7, #1
30: 4770 bx lr
32: bf00 nop
34: f3af 8000 nop.w
38: f3af 8000 nop.w
3c: f3af 8000 nop.w
00000040 <_Z15overflow_unsafejj>:
40: b242 sxtb r2, r0
42: b249 sxtb r1, r1
44: 1888 adds r0, r1, r2
46: b243 sxtb r3, r0
48: 1a18 subs r0, r3, r0
4a: bf18 it ne
4c: 2001 movne r0, #1
4e: 4770 bx lr
00000050 <_Z13overflow_safejj>:
50: b242 sxtb r2, r0
52: b249 sxtb r1, r1
54: 1888 adds r0, r1, r2
56: b243 sxtb r3, r0
58: 1a18 subs r0, r3, r0
5a: bf18 it ne
5c: 2001 movne r0, #1
5e: 4770 bx lr
Not sure which version would be fastest on ARM (I don't have a device handy
to benchmark on).
By the way, what's a nice way to benchmark snippets like this with
optimization on? If you call each function in a loop from a different
compilation unit, the call overhead tends to dominate. If you instead
put it in the same compilation unit and let it inline, the compiler might
do things you do not expect that render the benchmark useless.
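One idea (just an untested sketch, assuming GCC-style extended asm is
acceptable; the launder() helper and bench_overflow_bit() are illustrative
names of my own) would be to keep the loop in the same compilation unit but
pass the inputs and the result through an empty asm statement, so the
optimizer can neither constant-fold the arguments nor discard the result:

// Empty asm with x as an in/out register operand: the compiler must
// materialize x in a register and assume the asm may change it.
static inline unsigned int launder(unsigned int x) {
    asm volatile("" : "+r"(x));
    return x;
}

void bench_overflow_bit(unsigned int iterations) {
    unsigned int acc = 0;
    for (unsigned int i = 0; i < iterations; ++i) {
        // Opaque inputs keep the compiler from precomputing the result;
        // accumulating keeps the call from being removed as dead code.
        acc += overflow_bit(launder(i), launder(i >> 8));
    }
    launder(acc); // treat the accumulated result as used
}

Timing a call to that (against an otherwise identical loop with the overflow
test removed) still only gives a rough number, but at least the work can't
be folded away or hoisted out of the loop.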
/Ulf