When copying more than 128 bytes, src/dst is advanced after every ldp/stp instruction, which costs extra time. To improve this, only advance src/dst once per 64 bytes loaded or stored.
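Note: copy_template.S relies on its users (memcpy.S and the uaccess copy routines) to define the ldp1/stp1 wrappers, so this change also needs matching ldp2/stp2 definitions there. As a rough sketch only, inferred from how the template uses them and not part of this diff, the plain-memory variants for memcpy.S could look like:

	/*
	 * Sketch, not from this patch: access a pair of registers at a
	 * fixed offset from the base register, without writeback.  The
	 * second offset is redundant for plain ldp/stp (the pair covers
	 * both slots); it is presumably there for uaccess variants that
	 * emit two single-register accesses at off1 and off2.
	 */
	.macro ldp2 reg1, reg2, ptr, off1, off2
	ldp	\reg1, \reg2, [\ptr, \off1]
	.endm

	.macro stp2 reg1, reg2, ptr, off1, off2
	stp	\reg1, \reg2, [\ptr, \off1]
	.endm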
Cost of copying 4096 bytes on Kunpeng920 (ms):

Without this patch:
  memcpy:         143.85
  copy_from_user: 172.69
  copy_to_user:   199.23

With this patch:
  memcpy:         107.12
  copy_from_user: 157.50
  copy_to_user:   198.85

That is about a 25% improvement in memcpy().

Signed-off-by: Yang Yingliang <yangyingli...@huawei.com>
---
 arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..c3cd6f84c9c0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -152,29 +152,33 @@ D_h	.req	x14
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
 	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
+	ldp2	A_l, A_h, src, #0, #8
+	ldp2	B_l, B_h, src, #16, #24
+	ldp2	C_l, C_h, src, #32, #40
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
 1:
 	/*
 	 * interlace the load of next 64 bytes data block with store of the last
 	 * loaded 64 bytes data.
 	 */
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
+	stp2	A_l, A_h, dst, #0, #8
+	ldp2	A_l, A_h, src, #0, #8
+	stp2	B_l, B_h, dst, #16, #24
+	ldp2	B_l, B_h, src, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	ldp2	C_l, C_h, src, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
+	add	dst, dst, #64
 	subs	count, count, #64
 	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
+	stp2	A_l, A_h, dst, #0, #8
+	stp2	B_l, B_h, dst, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
-- 
2.25.1