[PATCH v2 3/4] arch/sparc: Optimized memcpy, memset, copy_to_user, copy_from_user for M7/M8

2017-08-07 Thread Babu Moger
New algorithm that takes advantage of the M7/M8 block init store
ASI, i.e., overlapping pipelines and miss buffer filling.
Full details in code comments.

Signed-off-by: Babu Moger 
---
 arch/sparc/kernel/head_64.S   |   16 +-
 arch/sparc/lib/M7copy_from_user.S |   41 ++
 arch/sparc/lib/M7copy_to_user.S   |   51 ++
 arch/sparc/lib/M7memcpy.S |  923 +
 arch/sparc/lib/M7memset.S |  352 ++
 arch/sparc/lib/M7patch.S  |   51 ++
 arch/sparc/lib/Makefile   |3 +
 7 files changed, 1435 insertions(+), 2 deletions(-)
 create mode 100644 arch/sparc/lib/M7copy_from_user.S
 create mode 100644 arch/sparc/lib/M7copy_to_user.S
 create mode 100644 arch/sparc/lib/M7memcpy.S
 create mode 100644 arch/sparc/lib/M7memset.S
 create mode 100644 arch/sparc/lib/M7patch.S

diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 78e0211..bf9a5ac 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -603,10 +603,10 @@ niagara_tlb_fixup:
    be,pt   %xcc, niagara4_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_M7
-   be,pt   %xcc, niagara4_patch
+   be,pt   %xcc, sparc_m7_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_M8
-   be,pt   %xcc, niagara4_patch
+   be,pt   %xcc, sparc_m7_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_SN
    be,pt   %xcc, niagara4_patch
@@ -621,6 +621,18 @@ niagara_tlb_fixup:
 
    ba,a,pt %xcc, 80f
     nop
+
+sparc_m7_patch:
+   call    m7_patch_copyops
+    nop
+   call    m7_patch_bzero
+    nop
+   call    m7_patch_pageops
+    nop
+
+   ba,a,pt %xcc, 80f
+    nop
+
 niagara4_patch:
    call    niagara4_patch_copyops
     nop
diff --git a/arch/sparc/lib/M7copy_from_user.S b/arch/sparc/lib/M7copy_from_user.S
new file mode 100644
index 0000000..d0689d7
--- /dev/null
+++ b/arch/sparc/lib/M7copy_from_user.S
@@ -0,0 +1,41 @@
+/*
+ * M7copy_from_user.S: SPARC M7 optimized copy from userspace.
+ *
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define EX_LD(x)   \
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi;   \
+   .text;  \
+   .align 4;
+
+#define EX_LD_FP(x)\
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi_fp;\
+   .text;  \
+   .align 4;
+
+
+#ifndef ASI_AIUS
+#define ASI_AIUS   0x11
+#endif
+
+#define FUNC_NAME  M7copy_from_user
+#define LOAD(type,addr,dest)   type##a [addr] %asi, dest
+#define EX_RETVAL(x)   0
+
+#ifdef __KERNEL__
+#define PREAMBLE   \
+   rd  %asi, %g1;  \
+   cmp %g1, ASI_AIUS;  \
+   bne,pn  %icc, raw_copy_in_user; \
+   nop
+#endif
+
+#include "M7memcpy.S"
diff --git a/arch/sparc/lib/M7copy_to_user.S b/arch/sparc/lib/M7copy_to_user.S
new file mode 100644
index 0000000..d3be132
--- /dev/null
+++ b/arch/sparc/lib/M7copy_to_user.S
@@ -0,0 +1,51 @@
+/*
+ * M7copy_to_user.S: SPARC M7 optimized copy to userspace.
+ *
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define EX_ST(x)   \
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi;   \
+   .text;  \
+   .align 4;
+
+#define EX_ST_FP(x)\
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi_fp;\
+   .text;  \
+   .align 4;
+
+
+#ifndef ASI_AIUS
+#define ASI_AIUS   0x11
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME  M7copy_to_user
+#define STORE(type,src,addr)   type##a src, [addr] %asi
+#define STORE_ASI  ASI_BLK_INIT_QUAD_LDD_AIUS
+#define STORE_MRU_ASI   ASI_ST_BLKINIT_MRU_S
+#define EX_RETVAL(x)   0
+
+#ifdef __KERNEL__
+   /* Writing to %asi is _expensive_ so we hardcode it.
+* Reading %asi to check for KERNEL_DS is comparatively
+* cheap.
+*/
+#define PREAMBLE   \
+   rd  %asi, %g1;  \
+   cmp %g1, ASI_AIUS;  \
+   bne,pn  %icc, raw_copy_in_user; \
+   nop
+#endif
+
+#include "M7memcpy.S"
diff --git a/arch/sparc/lib/M7memcpy.S 

[PATCH v2 3/4] arch/sparc: Optimized memcpy, memset, copy_to_user, copy_from_user for M7/M8

2017-08-07 Thread Babu Moger
New algorithm that takes advantage of the M7/M8 block init store
ASI, i.e., overlapping pipelines and miss buffer filling.
Full details in code comments.

Signed-off-by: Babu Moger 
---
 arch/sparc/kernel/head_64.S   |   16 +-
 arch/sparc/lib/M7copy_from_user.S |   41 ++
 arch/sparc/lib/M7copy_to_user.S   |   51 ++
 arch/sparc/lib/M7memcpy.S |  923 +
 arch/sparc/lib/M7memset.S |  352 ++
 arch/sparc/lib/M7patch.S  |   51 ++
 arch/sparc/lib/Makefile   |3 +
 7 files changed, 1435 insertions(+), 2 deletions(-)
 create mode 100644 arch/sparc/lib/M7copy_from_user.S
 create mode 100644 arch/sparc/lib/M7copy_to_user.S
 create mode 100644 arch/sparc/lib/M7memcpy.S
 create mode 100644 arch/sparc/lib/M7memset.S
 create mode 100644 arch/sparc/lib/M7patch.S

diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index 78e0211..bf9a5ac 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -603,10 +603,10 @@ niagara_tlb_fixup:
    be,pt   %xcc, niagara4_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_M7
-   be,pt   %xcc, niagara4_patch
+   be,pt   %xcc, sparc_m7_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_M8
-   be,pt   %xcc, niagara4_patch
+   be,pt   %xcc, sparc_m7_patch
     nop
    cmp %g1, SUN4V_CHIP_SPARC_SN
    be,pt   %xcc, niagara4_patch
@@ -621,6 +621,18 @@ niagara_tlb_fixup:
 
    ba,a,pt %xcc, 80f
     nop
+
+sparc_m7_patch:
+   call    m7_patch_copyops
+    nop
+   call    m7_patch_bzero
+    nop
+   call    m7_patch_pageops
+    nop
+
+   ba,a,pt %xcc, 80f
+    nop
+
 niagara4_patch:
    call    niagara4_patch_copyops
     nop
diff --git a/arch/sparc/lib/M7copy_from_user.S b/arch/sparc/lib/M7copy_from_user.S
new file mode 100644
index 0000000..d0689d7
--- /dev/null
+++ b/arch/sparc/lib/M7copy_from_user.S
@@ -0,0 +1,41 @@
+/*
+ * M7copy_from_user.S: SPARC M7 optimized copy from userspace.
+ *
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define EX_LD(x)   \
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi;   \
+   .text;  \
+   .align 4;
+
+#define EX_LD_FP(x)\
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi_fp;\
+   .text;  \
+   .align 4;
+
+
+#ifndef ASI_AIUS
+#define ASI_AIUS   0x11
+#endif
+
+#define FUNC_NAME  M7copy_from_user
+#define LOAD(type,addr,dest)   type##a [addr] %asi, dest
+#define EX_RETVAL(x)   0
+
+#ifdef __KERNEL__
+#define PREAMBLE   \
+   rd  %asi, %g1;  \
+   cmp %g1, ASI_AIUS;  \
+   bne,pn  %icc, raw_copy_in_user; \
+   nop
+#endif
+
+#include "M7memcpy.S"
diff --git a/arch/sparc/lib/M7copy_to_user.S b/arch/sparc/lib/M7copy_to_user.S
new file mode 100644
index 0000000..d3be132
--- /dev/null
+++ b/arch/sparc/lib/M7copy_to_user.S
@@ -0,0 +1,51 @@
+/*
+ * M7copy_to_user.S: SPARC M7 optimized copy to userspace.
+ *
+ * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
+ */
+
+
+#define EX_ST(x)   \
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi;   \
+   .text;  \
+   .align 4;
+
+#define EX_ST_FP(x)\
+98:x;  \
+   .section __ex_table,"a";\
+   .align 4;   \
+   .word 98b, __restore_asi_fp;\
+   .text;  \
+   .align 4;
+
+
+#ifndef ASI_AIUS
+#define ASI_AIUS   0x11
+#endif
+
+#ifndef ASI_BLK_INIT_QUAD_LDD_AIUS
+#define ASI_BLK_INIT_QUAD_LDD_AIUS 0x23
+#endif
+
+#define FUNC_NAME  M7copy_to_user
+#define STORE(type,src,addr)   type##a src, [addr] %asi
+#define STORE_ASI  ASI_BLK_INIT_QUAD_LDD_AIUS
+#define STORE_MRU_ASI   ASI_ST_BLKINIT_MRU_S
+#define EX_RETVAL(x)   0
+
+#ifdef __KERNEL__
+   /* Writing to %asi is _expensive_ so we hardcode it.
+* Reading %asi to check for KERNEL_DS is comparatively
+* cheap.
+*/
+#define PREAMBLE   \
+   rd  %asi, %g1;  \
+   cmp %g1, ASI_AIUS;  \
+   bne,pn  %icc, raw_copy_in_user; \
+   nop
+#endif
+
+#include "M7memcpy.S"
diff --git a/arch/sparc/lib/M7memcpy.S b/arch/sparc/lib/M7memcpy.S
new