Commit-ID:  f317820cb6ee3fb173319bf76e0e62437be78ad2
Gitweb:     http://git.kernel.org/tip/f317820cb6ee3fb173319bf76e0e62437be78ad2
Author:     Jan Beulich <jbeul...@suse.com>
AuthorDate: Fri, 2 Nov 2012 14:20:24 +0000
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Fri, 25 Jan 2013 09:23:50 +0100

x86/xor: Add alternative SSE implementation only prefetching once per 64-byte line

On CPUs with 64-byte last level cache lines, this yields roughly
10% better performance, independent of CPU vendor or specific
model (as far as I was able to test).
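
For readers who do not want to decode the inline-assembly macros in the diff below: the new *_pf64 routines rely on the fact that with 64-byte last-level cache lines a single prefetch per line suffices, so each BLK64(pf, op, i) expansion pairs one prefetch with the four 16-byte SSE operations that cover that line, and each loop iteration handles 256 bytes (hence lines = bytes >> 8 and [inc] = 256) while cycling through only %xmm0-%xmm3. A rough C analogue of the two-source case (plain C with __builtin_prefetch instead of the actual movaps/xorps assembly; the 256-byte prefetch distance is an illustrative choice, not taken from the patch) might look like:

#include <stddef.h>
#include <stdint.h>

#define LINE_BYTES	64	/* cache-line size the pf64 variants assume */
#define PF_DIST		256	/* prefetch distance in bytes; illustrative only */

static void xor_2_pf64_sketch(size_t bytes, uint64_t *p1, const uint64_t *p2)
{
	size_t lines = bytes / LINE_BYTES;
	const size_t words = LINE_BYTES / sizeof(*p1);	/* 8 x 64-bit words */

	while (lines--) {
		/* Exactly one prefetch per 64-byte line of each operand ... */
		__builtin_prefetch((const char *)p1 + PF_DIST);
		__builtin_prefetch((const char *)p2 + PF_DIST);

		/* ... then XOR the whole line; the assembly does this as four
		   16-byte movaps/xorps/movaps groups per line. */
		for (size_t i = 0; i < words; i++)
			p1[i] ^= p2[i];

		p1 += words;
		p2 += words;
	}
}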

Signed-off-by: Jan Beulich <jbeul...@suse.com>
Acked-by: H. Peter Anvin <h...@zytor.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Link: http://lkml.kernel.org/r/5093e4b802000078000a6...@nat28.tlf.novell.com
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 arch/x86/include/asm/xor.h    | 172 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/include/asm/xor_32.h |  23 +++---
 arch/x86/include/asm/xor_64.h |  10 +--
 3 files changed, 187 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
index c661571..d882975 100644
--- a/arch/x86/include/asm/xor.h
+++ b/arch/x86/include/asm/xor.h
@@ -58,6 +58,14 @@
 #define XO2(x, y)      "       xorps "OFFS(x)"(%[p3]), %%xmm"#y"       ;\n"
 #define XO3(x, y)      "       xorps "OFFS(x)"(%[p4]), %%xmm"#y"       ;\n"
 #define XO4(x, y)      "       xorps "OFFS(x)"(%[p5]), %%xmm"#y"       ;\n"
+#define NOP(x)
+
+#define BLK64(pf, op, i)                               \
+               pf(i)                                   \
+               op(i, 0)                                \
+                       op(i + 1, 1)                    \
+                               op(i + 2, 2)            \
+                                       op(i + 3, 3)
 
 static void
 xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
@@ -111,6 +119,40 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
 }
 
 static void
+xor_sse_2_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2)
+{
+       unsigned long lines = bytes >> 8;
+
+       kernel_fpu_begin();
+
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i)                       \
+               BLK64(PF0, LD, i)       \
+               BLK64(PF1, XO1, i)      \
+               BLK64(NOP, ST, i)       \
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       add %[inc], %[p1]       ;\n"
+       "       add %[inc], %[p2]       ;\n"
+       "       dec %[cnt]              ;\n"
+       "       jnz 1b                  ;\n"
+       : [cnt] "+r" (lines),
+         [p1] "+r" (p1), [p2] "+r" (p2)
+       : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+       : "memory");
+
+       kernel_fpu_end();
+}
+
+static void
 xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3)
 {
@@ -170,6 +212,43 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_3_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+              unsigned long *p3)
+{
+       unsigned long lines = bytes >> 8;
+
+       kernel_fpu_begin();
+
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i)                       \
+               BLK64(PF0, LD, i)       \
+               BLK64(PF1, XO1, i)      \
+               BLK64(PF2, XO2, i)      \
+               BLK64(NOP, ST, i)       \
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       add %[inc], %[p1]       ;\n"
+       "       add %[inc], %[p2]       ;\n"
+       "       add %[inc], %[p3]       ;\n"
+       "       dec %[cnt]              ;\n"
+       "       jnz 1b                  ;\n"
+       : [cnt] "+r" (lines),
+         [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
+       : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+       : "memory");
+
+       kernel_fpu_end();
+}
+
+static void
 xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4)
 {
@@ -236,6 +315,45 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 }
 
 static void
+xor_sse_4_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+              unsigned long *p3, unsigned long *p4)
+{
+       unsigned long lines = bytes >> 8;
+
+       kernel_fpu_begin();
+
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i)                       \
+               BLK64(PF0, LD, i)       \
+               BLK64(PF1, XO1, i)      \
+               BLK64(PF2, XO2, i)      \
+               BLK64(PF3, XO3, i)      \
+               BLK64(NOP, ST, i)       \
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       add %[inc], %[p1]       ;\n"
+       "       add %[inc], %[p2]       ;\n"
+       "       add %[inc], %[p3]       ;\n"
+       "       add %[inc], %[p4]       ;\n"
+       "       dec %[cnt]              ;\n"
+       "       jnz 1b                  ;\n"
+       : [cnt] "+r" (lines), [p1] "+r" (p1),
+         [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
+       : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+       : "memory");
+
+       kernel_fpu_end();
+}
+
+static void
 xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
          unsigned long *p3, unsigned long *p4, unsigned long *p5)
 {
@@ -308,12 +426,63 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
        kernel_fpu_end();
 }
 
+static void
+xor_sse_5_pf64(unsigned long bytes, unsigned long *p1, unsigned long *p2,
+              unsigned long *p3, unsigned long *p4, unsigned long *p5)
+{
+       unsigned long lines = bytes >> 8;
+
+       kernel_fpu_begin();
+
+       asm volatile(
+#undef BLOCK
+#define BLOCK(i)                       \
+               BLK64(PF0, LD, i)       \
+               BLK64(PF1, XO1, i)      \
+               BLK64(PF2, XO2, i)      \
+               BLK64(PF3, XO3, i)      \
+               BLK64(PF4, XO4, i)      \
+               BLK64(NOP, ST, i)       \
+
+       " .align 32                     ;\n"
+       " 1:                            ;\n"
+
+               BLOCK(0)
+               BLOCK(4)
+               BLOCK(8)
+               BLOCK(12)
+
+       "       add %[inc], %[p1]       ;\n"
+       "       add %[inc], %[p2]       ;\n"
+       "       add %[inc], %[p3]       ;\n"
+       "       add %[inc], %[p4]       ;\n"
+       "       add %[inc], %[p5]       ;\n"
+       "       dec %[cnt]              ;\n"
+       "       jnz 1b                  ;\n"
+       : [cnt] "+r" (lines), [p1] "+r" (p1), [p2] "+r" (p2),
+         [p3] "+r" (p3), [p4] "+r" (p4), [p5] "+r" (p5)
+       : [inc] XOR_CONSTANT_CONSTRAINT (256UL)
+       : "memory");
+
+       kernel_fpu_end();
+}
+
+static struct xor_block_template xor_block_sse_pf64 = {
+       .name = "prefetch64-sse",
+       .do_2 = xor_sse_2_pf64,
+       .do_3 = xor_sse_3_pf64,
+       .do_4 = xor_sse_4_pf64,
+       .do_5 = xor_sse_5_pf64,
+};
+
 #undef LD
 #undef XO1
 #undef XO2
 #undef XO3
 #undef XO4
 #undef ST
+#undef NOP
+#undef BLK64
 #undef BLOCK
 
 #undef XOR_CONSTANT_CONSTRAINT
@@ -324,4 +493,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
 # include <asm/xor_64.h>
 #endif
 
+#define XOR_SELECT_TEMPLATE(FASTEST) \
+       AVX_SELECT(FASTEST)
+
 #endif /* _ASM_X86_XOR_H */
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
index b85dc87..ce05722 100644
--- a/arch/x86/include/asm/xor_32.h
+++ b/arch/x86/include/asm/xor_32.h
@@ -543,26 +543,25 @@ static struct xor_block_template xor_block_pIII_sse = {
 /* Also try the generic routines.  */
 #include <asm-generic/xor.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES                              \
 do {                                                   \
-       xor_speed(&xor_block_8regs);                    \
-       xor_speed(&xor_block_8regs_p);                  \
-       xor_speed(&xor_block_32regs);                   \
-       xor_speed(&xor_block_32regs_p);                 \
        AVX_XOR_SPEED;                                  \
-       if (cpu_has_xmm)                                \
+       if (cpu_has_xmm) {                              \
                xor_speed(&xor_block_pIII_sse);         \
-       if (cpu_has_mmx) {                              \
+               xor_speed(&xor_block_sse_pf64);         \
+       } else if (cpu_has_mmx) {                       \
                xor_speed(&xor_block_pII_mmx);          \
                xor_speed(&xor_block_p5_mmx);           \
+       } else {                                        \
+               xor_speed(&xor_block_8regs);            \
+               xor_speed(&xor_block_8regs_p);          \
+               xor_speed(&xor_block_32regs);           \
+               xor_speed(&xor_block_32regs_p);         \
        }                                               \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST)                   \
-       AVX_SELECT(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
-
 #endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
index 1baf89d..546f1e3 100644
--- a/arch/x86/include/asm/xor_64.h
+++ b/arch/x86/include/asm/xor_64.h
@@ -13,17 +13,15 @@ static struct xor_block_template xor_block_sse = {
 /* Also try the AVX routines */
 #include <asm/xor_avx.h>
 
+/* We force the use of the SSE xor block because it can write around L2.
+   We may also be able to load into the L1 only depending on how the cpu
+   deals with a load to a line that is being prefetched.  */
 #undef XOR_TRY_TEMPLATES
 #define XOR_TRY_TEMPLATES                      \
 do {                                           \
        AVX_XOR_SPEED;                          \
+       xor_speed(&xor_block_sse_pf64);         \
        xor_speed(&xor_block_sse);              \
 } while (0)
 
-/* We force the use of the SSE xor block because it can write around L2.
-   We may also be able to load into the L1 only depending on how the cpu
-   deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
-       AVX_SELECT(&xor_block_sse)
-
 #endif /* _ASM_X86_XOR_64_H */
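
Note on template selection: the XOR_TRY_TEMPLATES / XOR_SELECT_TEMPLATE hunks above drop the per-arch override that unconditionally forced the SSE block and instead let the new "prefetch64-sse" template compete with the existing ones in the kernel's boot-time xor calibration, which times each registered candidate and keeps the fastest (with AVX_SELECT() still substituting the AVX routines when AVX is usable). The following userspace-style sketch only illustrates that "time every candidate, keep the fastest" idea; it is not the kernel's actual calibration code, and names such as struct xor_tmpl, measure_speed() and pick_fastest() are invented for the example.

#include <stdio.h>
#include <time.h>

struct xor_tmpl {
	const char *name;
	void (*do_2)(unsigned long bytes, unsigned long *p1, unsigned long *p2);
	unsigned long speed;	/* iterations completed in the test interval */
};

/* Run one template's 2-source routine for roughly 10 ms and count iterations. */
static unsigned long measure_speed(const struct xor_tmpl *t,
				   unsigned long *b1, unsigned long *b2)
{
	struct timespec start, now;
	unsigned long iters = 0;
	long elapsed_ns;

	clock_gettime(CLOCK_MONOTONIC, &start);
	do {
		t->do_2(4096, b1, b2);
		iters++;
		clock_gettime(CLOCK_MONOTONIC, &now);
		elapsed_ns = (now.tv_sec - start.tv_sec) * 1000000000L +
			     (now.tv_nsec - start.tv_nsec);
	} while (elapsed_ns < 10 * 1000 * 1000);

	return iters;
}

/* Benchmark every registered template and return the fastest one. */
static struct xor_tmpl *pick_fastest(struct xor_tmpl *tmpls, int n,
				     unsigned long *b1, unsigned long *b2)
{
	struct xor_tmpl *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		tmpls[i].speed = measure_speed(&tmpls[i], b1, b2);
		printf("%-16s %lu iterations\n", tmpls[i].name, tmpls[i].speed);
		if (!best || tmpls[i].speed > best->speed)
			best = &tmpls[i];
	}
	return best;
}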