When copying more than 128 bytes, src/dst is advanced after every ldp/stp instruction, which costs extra time. To improve this, only advance src/dst once per 64 bytes loaded or stored.
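Note: copy_template.S relies on its users (memcpy.S and the uaccess copy routines) to define the ldp1/stp1 wrappers, so this change also needs matching ldp2/stp2 definitions there. As a rough sketch only, inferred from how the template uses them and not part of this diff, the plain-memory variants for memcpy.S could look like:

	/*
	 * Sketch, not from this patch: access a pair of registers at a
	 * fixed offset from the base register, without writeback.  The
	 * second offset is redundant for plain ldp/stp (the pair covers
	 * both slots); it is presumably there for uaccess variants that
	 * emit two single-register accesses at off1 and off2.
	 */
	.macro ldp2 reg1, reg2, ptr, off1, off2
	ldp	\reg1, \reg2, [\ptr, \off1]
	.endm

	.macro stp2 reg1, reg2, ptr, off1, off2
	stp	\reg1, \reg2, [\ptr, \off1]
	.endm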
Cost of copying 4096 bytes on Kunpeng920 (ms):

Without this patch:
  memcpy:         143.85
  copy_from_user: 172.69
  copy_to_user:   199.23

With this patch:
  memcpy:         107.12
  copy_from_user: 157.50
  copy_to_user:   198.85

That is about a 25% improvement in memcpy().

Signed-off-by: Yang Yingliang <yangyingli...@huawei.com>
---
 arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..c3cd6f84c9c0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -152,29 +152,33 @@ D_h	.req	x14
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
 	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
+	ldp2	A_l, A_h, src, #0, #8
+	ldp2	B_l, B_h, src, #16, #24
+	ldp2	C_l, C_h, src, #32, #40
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
 1:
 	/*
 	 * interlace the load of next 64 bytes data block with store of the last
 	 * loaded 64 bytes data.
 	 */
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
+	stp2	A_l, A_h, dst, #0, #8
+	ldp2	A_l, A_h, src, #0, #8
+	stp2	B_l, B_h, dst, #16, #24
+	ldp2	B_l, B_h, src, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	ldp2	C_l, C_h, src, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
+	add	dst, dst, #64
 	subs	count, count, #64
 	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
+	stp2	A_l, A_h, dst, #0, #8
+	stp2	B_l, B_h, dst, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
-- 
2.25.1