From: Simon Guo
Currently the powerpc 64-bit memcmp() will fall back to .Lshort
(byte-by-byte compare mode) if either the src or dst address is not
8-byte aligned. It can be optimized in 2 situations:
1) If both addresses have the same offset from an 8-byte boundary:
memcmp() can first compare the unaligned bytes up to the 8-byte
boundary and then compare the remaining 8-byte-aligned content in
.Llong mode.
2) If the src/dst addresses have different offsets from an 8-byte
boundary: memcmp() can align the src address to 8 bytes, adjust the
dst address accordingly, then load src with aligned loads and dst with
unaligned loads.
This patch optimizes memcmp() behavior in the above 2 situations.
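As a rough illustration (not the patch itself, which is in assembly),
the two strategies look like this in C; same_offset() and
same_offset_cmp() are hypothetical helper names:
---
#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Dispatch test: the offsets match iff the XOR of the two addresses
 * has none of the low 3 bits set.
 */
static int same_offset(const void *s1, const void *s2)
{
	return (((uintptr_t)s1 ^ (uintptr_t)s2) & 7) == 0;
}

/* Situation 1: compare the unaligned leading bytes one by one, then
 * compare the remaining 8-byte-aligned content a double word at a
 * time; plain memcmp() stands in for that .Llong-style loop here.
 */
static int same_offset_cmp(const unsigned char *s1,
			   const unsigned char *s2, size_t n)
{
	size_t head = (8 - ((uintptr_t)s1 & 7)) & 7;

	if (head > n)
		head = n;
	for (size_t i = 0; i < head; i++)
		if (s1[i] != s2[i])
			return s1[i] < s2[i] ? -1 : 1;
	return memcmp(s1 + head, s2 + head, n - head);
}
---
In situation 2 only src can be aligned this way; dst is then accessed
with unaligned 8-byte loads, which the hardware handles at far lower
cost than the per-byte fallback.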
Tested with both little and big endian; the performance results below
are from little endian.
The following is the test result for the case where src/dst have the
same offset (a similar result was observed when src/dst have different
offsets):
(1) 256 bytes
Test with the existing tools/testing/selftests/powerpc/stringloops/memcmp:
- without patch
  29.773018302 seconds time elapsed ( +- 0.09% )
- with patch
  16.485568173 seconds time elapsed ( +- 0.02% )
-> ~80% improvement (29.773s down to 16.486s, i.e. ~1.81x faster)
(2) 32 bytes
To observe the performance impact on < 32 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include <malloc.h>
 #include "utils.h"
-#define SIZE 256
+#define SIZE 32
 #define ITERATIONS 1
 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- without patch
  0.244746482 seconds time elapsed ( +- 0.36% )
- with patch
  0.215069477 seconds time elapsed ( +- 0.51% )
-> ~13% improvement (0.2447s down to 0.2151s, i.e. ~1.14x faster)
(3) 0~8 bytes
To observe the performance impact for < 8 bytes, modify
tools/testing/selftests/powerpc/stringloops/memcmp.c as follows:
---
 #include <malloc.h>
 #include "utils.h"
-#define SIZE 256
-#define ITERATIONS 1
+#define SIZE 8
+#define ITERATIONS 100
 int test_memcmp(const void *s1, const void *s2, size_t n);
---
- without patch
  1.845642503 seconds time elapsed ( +- 0.12% )
- with patch
  1.849767135 seconds time elapsed ( +- 0.26% )
-> They are nearly the same (-0.2%).
Signed-off-by: Simon Guo
---
 arch/powerpc/lib/memcmp_64.S | 140 +++++++++++++++++++++++++++++++++++++++---
1 file changed, 133 insertions(+), 7 deletions(-)
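A note on the endian handling visible at the top of the diff: memcmp()
must order results by the lowest-addressed byte, i.e. as if each word
were big endian, so on little endian the word-size loads use the
byte-reversed forms (lhbrx/lwbrx/ldbrx). A minimal user-space sketch of
why that works, assuming a little-endian host (load_be64() is an
illustrative helper, not kernel code):
---
#include <stdint.h>
#include <string.h>

/* Load 8 bytes in memory order as a big-endian value. On a
 * little-endian host this is a plain load plus a byte swap, which is
 * what ldbrx provides in a single instruction.
 */
static uint64_t load_be64(const void *p)
{
	uint64_t x;

	memcpy(&x, p, sizeof(x));
	return __builtin_bswap64(x);
}

/* With big-endian-ordered words, the first differing byte in memory
 * occupies the more significant bits, so an unsigned word compare
 * matches memcmp() semantics.
 */
static int word_cmp(const void *a, const void *b)
{
	uint64_t wa = load_be64(a);
	uint64_t wb = load_be64(b);

	return (wa > wb) - (wa < wb);
}
---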
diff --git a/arch/powerpc/lib/memcmp_64.S b/arch/powerpc/lib/memcmp_64.S
index d75d18b..5776f91 100644
--- a/arch/powerpc/lib/memcmp_64.S
+++ b/arch/powerpc/lib/memcmp_64.S
@@ -24,28 +24,41 @@
#define rH r31
#ifdef __LITTLE_ENDIAN__
+#define LH lhbrx
+#define LW lwbrx
#define LD ldbrx
#else
+#define LH lhzx
+#define LW lwzx
#define LD ldx
#endif
+/*
+ * There are 2 categories for memcmp:
+ * 1) src/dst have the same offset from the 8-byte boundary. The
+ *    handlers are named like .Lsameoffset_*.
+ * 2) src/dst have different offsets from the 8-byte boundary. The
+ *    handlers are named like .Ldiffoffset_*.
+ */
_GLOBAL(memcmp)
cmpdi cr1,r5,0
- /* Use the short loop if both strings are not 8B aligned */
- or r6,r3,r4
+ /* Use the short loop if the src/dst addresses do not have
+  * the same offset from an 8-byte boundary.
+  */
+ xor r6,r3,r4
andi. r6,r6,7
- /* Use the short loop if length is less than 32B */
- cmpdi cr6,r5,31
+ /* Fall back to the short loop if comparing fewer than 8 bytes
+  * at aligned addresses.
+  */
+ cmpdi cr6,r5,7
beq cr1,.Lzero
- bne .Lshort
- bgt cr6,.Llong
+ bgt cr6,.Lno_short
.Lshort:
mtctr r5
-
1: lbz rA,0(r3)
lbz rB,0(r4)
subf. rC,rB,rA
@@ -78,11 +91,89 @@ _GLOBAL(memcmp)
li r3,0
blr
+.Lno_short:
+ dcbt 0,r3
+ dcbt 0,r4
+ bne .Ldiffoffset_8bytes_make_align_start
+
+
+.Lsameoffset_8bytes_make_align_start:
+ /* Attempt to compare the bytes not aligned to 8 bytes so that
+  * the rest of the comparison can run on 8-byte alignment.
+  */
+ andi. r6,r3,7
+
+ /* Try to compare the first double word, which is not 8-byte aligned:
+  * load the first double word at (src & ~7UL) and shift left the
+  * appropriate number of bits before the comparison.
+  */
+ rlwinm r6,r3,3,26,28
+ beq .Lsameoffset_8bytes_aligned
+ clrrdi r3,r3,3
+ clrrdi r4,r4,3
+ LD rA,0,r3
+ LD rB,0,r4
+ sld rA,rA,r6
+ sld rB,rB,r6
+ cmpld cr0,rA,rB
+ srwi r6,r6,3
+ bne cr0,.LcmpAB_lightweight
+ subfic r6,r6,8
+ subf. r5,r