[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-30 Thread Ananyev, Konstantin
Hey Zhihong,

> -Original Message-
> From: Wang, Zhihong
> Sent: Friday, January 30, 2015 5:57 AM
> To: Ananyev, Konstantin; dev at dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in 
> arch/x86/rte_memcpy.h for both SSE and AVX platforms
> 
> Hey Konstantin,
> 
> This method does reduce code size but leads to a significant performance drop.
> I think we need to keep the original code.

Sure, no point in making it slower.
Thanks for trying it anyway.
Konstantin

> 
> 
> Thanks
> Zhihong (John)
> 
> 
> > -Original Message-
> > From: Ananyev, Konstantin
> > Sent: Thursday, January 29, 2015 11:18 PM
> > To: Wang, Zhihong; dev at dpdk.org
> > Subject: RE: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in
> > arch/x86/rte_memcpy.h for both SSE and AVX platforms
> >
> > Hi Zhihong,
> >
> > > -Original Message-
> > > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Zhihong Wang
> > > Sent: Thursday, January 29, 2015 2:39 AM
> > > To: dev at dpdk.org
> > > Subject: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in
> > > arch/x86/rte_memcpy.h for both SSE and AVX platforms
> > >
> > > Main code changes:
> > >
> > > 1. Differentiate architectural features based on CPU flags
> > >
> > > a. Implement separate move functions for SSE/AVX/AVX2 to make
> > > full use of cache bandwidth
> > >
> > > b. Implement a separate copy flow specifically optimized for the target
> > > architecture
> > >
> > > 2. Rewrite the memcpy function "rte_memcpy"
> > >
> > > a. Add store aligning
> > >
> > > b. Add load aligning based on architectural features
> > >
> > > c. Put block copy loop into inline move functions for better
> > > control of instruction order
> > >
> > > d. Eliminate unnecessary MOVs
> > >
> > > 3. Rewrite the inline move functions
> > >
> > > a. Add move functions for unaligned load cases
> > >
> > > b. Change instruction order in copy loops for better pipeline
> > > utilization
> > >
> > > c. Use intrinsics instead of assembly code
> > >
> > > 4. Remove slow glibc call for constant copies
> > >
> > > Signed-off-by: Zhihong Wang 
> > > ---
> > >  .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
> > >  1 file changed, 509 insertions(+), 171 deletions(-)
> > >
> > > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > > index fb9eba8..7b2d382 100644
> > > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > > @@ -34,166 +34,189 @@
> > >  #ifndef _RTE_MEMCPY_X86_64_H_
> > >  #define _RTE_MEMCPY_X86_64_H_
> > >
> > > +/**
> > > + * @file
> > > + *
> > > + * Functions for SSE/AVX/AVX2 implementation of memcpy().
> > > + */
> > > +
> > > +#include <stdio.h>
> > >  #include <stdint.h>
> > >  #include <string.h>
> > > -#include <emmintrin.h>
> > > +#include <x86intrin.h>
> > >
> > >  #ifdef __cplusplus
> > >  extern "C" {
> > >  #endif
> > >
> > > -#include "generic/rte_memcpy.h"
> > > +/**
> > > + * Copy bytes from one location to another. The locations must not overlap.
> > > + *
> > > + * @note This is implemented as a macro, so its address should not be taken
> > > + * and care is needed as parameter expressions may be evaluated multiple times.
> > > + *
> > > + * @param dst
> > > + *   Pointer to the destination of the data.
> > > + * @param src
> > > + *   Pointer to the source data.
> > > + * @param n
> > > + *   Number of bytes to copy.
> > > + * @return
> > > + *   Pointer to the destination data.
> > > + */
> > > +static inline void *
> > > +rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
> > >
> > > -#ifdef __INTEL_COMPILER
> > > -#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
> > > -#endif
> > > +#ifdef RTE_MACHINE_CPUFLAG_AVX2
> > >
> > > +/**
> > > + * AVX2 implement

[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-30 Thread Wang, Zhihong
Hey Konstantin,

This method does reduce code size but leads to a significant performance drop.
I think we need to keep the original code.


Thanks
Zhihong (John)


> -Original Message-
> From: Ananyev, Konstantin
> Sent: Thursday, January 29, 2015 11:18 PM
> To: Wang, Zhihong; dev at dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in
> arch/x86/rte_memcpy.h for both SSE and AVX platforms
> 
> Hi Zhihong,
> 
> > -Original Message-
> > From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Zhihong Wang
> > Sent: Thursday, January 29, 2015 2:39 AM
> > To: dev at dpdk.org
> > Subject: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in
> > arch/x86/rte_memcpy.h for both SSE and AVX platforms
> >
> > Main code changes:
> >
> > 1. Differentiate architectural features based on CPU flags
> >
> > a. Implement separate move functions for SSE/AVX/AVX2 to make
> > full use of cache bandwidth
> >
> > b. Implement a separate copy flow specifically optimized for the target
> > architecture
> >
> > 2. Rewrite the memcpy function "rte_memcpy"
> >
> > a. Add store aligning
> >
> > b. Add load aligning based on architectural features
> >
> > c. Put block copy loop into inline move functions for better
> > control of instruction order
> >
> > d. Eliminate unnecessary MOVs
> >
> > 3. Rewrite the inline move functions
> >
> > a. Add move functions for unaligned load cases
> >
> > b. Change instruction order in copy loops for better pipeline
> > utilization
> >
> > c. Use intrinsics instead of assembly code
> >
> > 4. Remove slow glibc call for constant copies
> >
> > Signed-off-by: Zhihong Wang 
> > ---
> >  .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
> >  1 file changed, 509 insertions(+), 171 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > index fb9eba8..7b2d382 100644
> > --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> > @@ -34,166 +34,189 @@
> >  #ifndef _RTE_MEMCPY_X86_64_H_
> >  #define _RTE_MEMCPY_X86_64_H_
> >
> > +/**
> > + * @file
> > + *
> > + * Functions for SSE/AVX/AVX2 implementation of memcpy().
> > + */
> > +
> > +#include <stdio.h>
> >  #include <stdint.h>
> >  #include <string.h>
> > -#include <emmintrin.h>
> > +#include <x86intrin.h>
> >
> >  #ifdef __cplusplus
> >  extern "C" {
> >  #endif
> >
> > -#include "generic/rte_memcpy.h"
> > +/**
> > + * Copy bytes from one location to another. The locations must not overlap.
> > + *
> > + * @note This is implemented as a macro, so its address should not be taken
> > + * and care is needed as parameter expressions may be evaluated multiple times.
> > + *
> > + * @param dst
> > + *   Pointer to the destination of the data.
> > + * @param src
> > + *   Pointer to the source data.
> > + * @param n
> > + *   Number of bytes to copy.
> > + * @return
> > + *   Pointer to the destination data.
> > + */
> > +static inline void *
> > +rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
> >
> > -#ifdef __INTEL_COMPILER
> > -#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
> > -#endif
> > +#ifdef RTE_MACHINE_CPUFLAG_AVX2
> >
> > +/**
> > + * AVX2 implementation below
> > + */
> > +
> > +/**
> > + * Copy 16 bytes from one location to another,
> > + * locations should not overlap.
> > + */
> >  static inline void
> >  rte_mov16(uint8_t *dst, const uint8_t *src)
> >  {
> > -   __m128i reg_a;
> > -   asm volatile (
> > -   "movdqu (%[src]), %[reg_a]\n\t"
> > -   "movdqu %[reg_a], (%[dst])\n\t"
> > -   : [reg_a] "=x" (reg_a)
> > -   : [src] "r" (src),
> > - [dst] "r"(dst)
> > -   : "memory"
> > -   );
> > +   __m128i xmm0;
> > +
> > +   xmm0 = _mm_loadu_si128((const __m128i *)src);
> > +   _mm_storeu_si128((__m128i *)dst, xmm0);
> >  }
> >
> > +/**
> > + * Copy 32 bytes from one location to another,
> > + * locations should not overlap.
> > + *

[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-29 Thread Ananyev, Konstantin
Hi Zhihong,

> -Original Message-
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Zhihong Wang
> Sent: Thursday, January 29, 2015 2:39 AM
> To: dev at dpdk.org
> Subject: [dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in 
> arch/x86/rte_memcpy.h for both SSE and AVX platforms
> 
> Main code changes:
> 
> 1. Differentiate architectural features based on CPU flags
> 
> a. Implement separate move functions for SSE/AVX/AVX2 to make full
> use of cache bandwidth
> 
> b. Implement a separate copy flow specifically optimized for the target
> architecture
> 
> 2. Rewrite the memcpy function "rte_memcpy"
> 
> a. Add store aligning
> 
> b. Add load aligning based on architectural features
> 
> c. Put block copy loop into inline move functions for better control of 
> instruction order
> 
> d. Eliminate unnecessary MOVs
> 
> 3. Rewrite the inline move functions
> 
> a. Add move functions for unaligned load cases
> 
> b. Change instruction order in copy loops for better pipeline utilization
> 
> c. Use intrinsics instead of assembly code
> 
> 4. Remove slow glibc call for constant copies
> 
> Signed-off-by: Zhihong Wang 
> ---
>  .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
>  1 file changed, 509 insertions(+), 171 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
> b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> index fb9eba8..7b2d382 100644
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> @@ -34,166 +34,189 @@
>  #ifndef _RTE_MEMCPY_X86_64_H_
>  #define _RTE_MEMCPY_X86_64_H_
> 
> +/**
> + * @file
> + *
> + * Functions for SSE/AVX/AVX2 implementation of memcpy().
> + */
> +
> +#include <stdio.h>
>  #include <stdint.h>
>  #include <string.h>
> -#include <emmintrin.h>
> +#include <x86intrin.h>
> 
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> 
> -#include "generic/rte_memcpy.h"
> +/**
> + * Copy bytes from one location to another. The locations must not overlap.
> + *
> + * @note This is implemented as a macro, so its address should not be taken
> + * and care is needed as parameter expressions may be evaluated multiple times.
> + *
> + * @param dst
> + *   Pointer to the destination of the data.
> + * @param src
> + *   Pointer to the source data.
> + * @param n
> + *   Number of bytes to copy.
> + * @return
> + *   Pointer to the destination data.
> + */
> +static inline void *
> +rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));
> 
> -#ifdef __INTEL_COMPILER
> -#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
> -#endif
> +#ifdef RTE_MACHINE_CPUFLAG_AVX2
> 
> +/**
> + * AVX2 implementation below
> + */
> +
> +/**
> + * Copy 16 bytes from one location to another,
> + * locations should not overlap.
> + */
>  static inline void
>  rte_mov16(uint8_t *dst, const uint8_t *src)
>  {
> - __m128i reg_a;
> - asm volatile (
> - "movdqu (%[src]), %[reg_a]\n\t"
> - "movdqu %[reg_a], (%[dst])\n\t"
> - : [reg_a] "=x" (reg_a)
> - : [src] "r" (src),
> -   [dst] "r"(dst)
> - : "memory"
> - );
> + __m128i xmm0;
> +
> + xmm0 = _mm_loadu_si128((const __m128i *)src);
> + _mm_storeu_si128((__m128i *)dst, xmm0);
>  }
> 
> +/**
> + * Copy 32 bytes from one location to another,
> + * locations should not overlap.
> + */
>  static inline void
>  rte_mov32(uint8_t *dst, const uint8_t *src)
>  {
> - __m128i reg_a, reg_b;
> - asm volatile (
> - "movdqu (%[src]), %[reg_a]\n\t"
> - "movdqu 16(%[src]), %[reg_b]\n\t"
> - "movdqu %[reg_a], (%[dst])\n\t"
> - "movdqu %[reg_b], 16(%[dst])\n\t"
> - : [reg_a] "=x" (reg_a),
> -   [reg_b] "=x" (reg_b)
> - : [src] "r" (src),
> -   [dst] "r"(dst)
> - : "memory"
> - );
> -}
> + __m256i ymm0;
> 
> -static inline void
> -rte_mov48(uint8_t *dst, const uint8_t *src)
> -{
> - __m128i reg_a, reg_b, reg_c;
> - asm volatile (
> - "movdqu (%[src]), %[reg_a]\n\t"
> - "movdqu 16(%[src]), %[reg_b]\n\t"
> - "movdqu 32(%[src]), %[reg_c]\n\t"
> - "movdq

[dpdk-dev] [PATCH v2 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-29 Thread Zhihong Wang
Main code changes:

1. Differentiate architectural features based on CPU flags

a. Implement separate move functions for SSE/AVX/AVX2 to make full
use of cache bandwidth

b. Implement a separate copy flow specifically optimized for the target
architecture

2. Rewrite the memcpy function "rte_memcpy"

a. Add store aligning

b. Add load aligning based on architectural features

c. Put block copy loop into inline move functions for better control of 
instruction order

d. Eliminate unnecessary MOVs

3. Rewrite the inline move functions

a. Add move functions for unaligned load cases

b. Change instruction order in copy loops for better pipeline utilization

c. Use intrinsics instead of assembly code

4. Remove slow glibc call for constant copies

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 680 +++--
 1 file changed, 509 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h 
b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..7b2d382 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_

+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include <stdio.h>
 #include <stdint.h>
 #include <string.h>
-#include <emmintrin.h>
+#include <x86intrin.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));

-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2

+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   : [reg_a] "=x" (reg_a)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
 }

+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
-}
+   __m256i ymm0;

-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-   __m128i reg_a, reg_b, reg_c;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
 }

+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b, reg_c, reg_d;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu 48(%[src]), %[reg_d]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   "movdqu %[reg_d], 48(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c),
-