[dpdk-dev] [PATCH 0/4] DPDK memcpy optimization

2015-01-19 Thread zhihong.w...@intel.com
This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
It also extends memcpy test coverage with unaligned cases and more test points.

Optimization techniques are summarized below:

1. Utilize full cache bandwidth

2. Enforce aligned stores (see the sketch after this list)

3. Apply load address alignment based on architecture features

4. Make load/store address available as early as possible

5. General optimization techniques such as inlining, branch reduction, and tuned prefetch access patterns
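
To illustrate technique 2, here is a minimal sketch (not the patch code;
copy_store_aligned is a hypothetical helper): copy an unaligned head first so
that every store in the bulk loop hits a 16-byte-aligned destination address,
since on x86 store alignment is what matters most for cache bandwidth.

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical sketch of store aligning: align the destination first,
 * then issue only aligned 16-byte stores in the main loop. */
static void
copy_store_aligned(uint8_t *dst, const uint8_t *src, size_t n)
{
	/* Bytes needed to reach the next 16-byte boundary of dst. */
	size_t head = (16 - ((uintptr_t)dst & 0x0F)) & 0x0F;

	if (head > n)
		head = n;
	memcpy(dst, src, head);                     /* unaligned head */
	dst += head;
	src += head;
	n -= head;

	while (n >= 16) {
		__m128i x = _mm_loadu_si128((const __m128i *)src);
		_mm_store_si128((__m128i *)dst, x); /* aligned store */
		dst += 16;
		src += 16;
		n -= 16;
	}
	memcpy(dst, src, n);                        /* unaligned tail */
}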

Zhihong Wang (4):
  Disabled VTA for memcpy test in app/test/Makefile
  Removed unnecessary test cases in test_memcpy.c
  Extended test coverage in test_memcpy_perf.c
  Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX
platforms

 app/test/Makefile  |   6 +
 app/test/test_memcpy.c |  52 +-
 app/test/test_memcpy_perf.c| 238 +---
 .../common/include/arch/x86/rte_memcpy.h   | 664 +++--
 4 files changed, 656 insertions(+), 304 deletions(-)

-- 
1.9.3



[dpdk-dev] [PATCH 1/4] app/test: Disabled VTA for memcpy test in app/test/Makefile

2015-01-19 Thread zhihong.w...@intel.com
VTA (variable tracking assignments) is for debugging only; it increases compile 
time and binary size, especially when there are many inline functions.
So disable it for the memcpy tests, which contain a large number of inline calls.

Signed-off-by: Zhihong Wang 
---
 app/test/Makefile | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/app/test/Makefile b/app/test/Makefile
index 4311f96..94dbadf 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -143,6 +143,12 @@ CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif
 CFLAGS += -D_GNU_SOURCE

+# Disable VTA for memcpy test
+ifeq ($(CC), gcc)
+CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
+CFLAGS_test_memcpy_perf.o += -fno-var-tracking-assignments
+endif
+
 # this application needs libraries first
 DEPDIRS-y += lib

-- 
1.9.3



[dpdk-dev] [PATCH 2/4] app/test: Removed unnecessary test cases in test_memcpy.c

2015-01-19 Thread zhihong.w...@intel.com
Removed the unnecessary test cases for the base move functions, since 
"func_test" already covers them all.

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy.c | 52 +-
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/app/test/test_memcpy.c b/app/test/test_memcpy.c
index 56b8e1e..b2bb4e0 100644
--- a/app/test/test_memcpy.c
+++ b/app/test/test_memcpy.c
@@ -78,56 +78,9 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT  16
+#define ALIGNMENT_UNIT  32


-
-/* Structure with base memcpy func pointer, and number of bytes it copies */
-struct base_memcpy_func {
-   void (*func)(uint8_t *dst, const uint8_t *src);
-   unsigned size;
-};
-
-/* To create base_memcpy_func structure entries */
-#define BASE_FUNC(n) {rte_mov##n, n}
-
-/* Max number of bytes that can be copies with a "base" memcpy functions */
-#define MAX_BASE_FUNC_SIZE 256
-
-/*
- * Test the "base" memcpy functions, that a copy fixed number of bytes.
- */
-static int
-base_func_test(void)
-{
-   const struct base_memcpy_func base_memcpy_funcs[6] = {
-   BASE_FUNC(16),
-   BASE_FUNC(32),
-   BASE_FUNC(48),
-   BASE_FUNC(64),
-   BASE_FUNC(128),
-   BASE_FUNC(256),
-   };
-   unsigned i, j;
-   unsigned num_funcs = sizeof(base_memcpy_funcs) / sizeof(base_memcpy_funcs[0]);
-   uint8_t dst[MAX_BASE_FUNC_SIZE];
-   uint8_t src[MAX_BASE_FUNC_SIZE];
-
-   for (i = 0; i < num_funcs; i++) {
-   unsigned size = base_memcpy_funcs[i].size;
-   for (j = 0; j < size; j++) {
-   dst[j] = 0;
-   src[j] = (uint8_t) rte_rand();
-   }
-   base_memcpy_funcs[i].func(dst, src);
-   for (j = 0; j < size; j++)
-   if (dst[j] != src[j])
-   return -1;
-   }
-
-   return 0;
-}
-
 /*
  * Create two buffers, and initialise one with random values. These are copied
  * to the second buffer and then compared to see if the copy was successful.
@@ -218,9 +171,6 @@ test_memcpy(void)
ret = func_test();
if (ret != 0)
return -1;
-   ret = base_func_test();
-   if (ret != 0)
-   return -1;
return 0;
 }

-- 
1.9.3



[dpdk-dev] [PATCH 3/4] app/test: Extended test coverage in test_memcpy_perf.c

2015-01-19 Thread zhihong.w...@intel.com
Main code changes:

1. Added more typical data points for a thorough performance test

2. Added unaligned test cases, since unaligned copies are common in DPDK usage (see the sketch below)
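
To make the new unaligned cases concrete, a minimal sketch (run_unaligned_case
is a hypothetical helper, not part of the patch): the buffers are allocated
ALIGNMENT_UNIT-aligned with ALIGNMENT_UNIT bytes of slack, so adding a
sub-alignment uoffset to an aligned base yields a deliberately misaligned
address without overrunning the buffer.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ALIGNMENT_UNIT 32

/* dst_base/src_base are assumed ALIGNMENT_UNIT-aligned and allocated with
 * ALIGNMENT_UNIT extra bytes, matching the enlarged rte_malloc calls in
 * this patch. A uoffset in [0, ALIGNMENT_UNIT) misaligns the copy
 * addresses on purpose. */
static void
run_unaligned_case(uint8_t *dst_base, const uint8_t *src_base,
		   size_t size, size_t dst_uoffset, size_t src_uoffset)
{
	memcpy(dst_base + dst_uoffset, src_base + src_uoffset, size);
}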

Signed-off-by: Zhihong Wang 
---
 app/test/test_memcpy_perf.c | 238 +---
 1 file changed, 156 insertions(+), 82 deletions(-)

diff --git a/app/test/test_memcpy_perf.c b/app/test/test_memcpy_perf.c
index 7809610..4875af1 100644
--- a/app/test/test_memcpy_perf.c
+++ b/app/test/test_memcpy_perf.c
@@ -54,9 +54,10 @@
 /* List of buffer sizes to test */
 #if TEST_VALUE_RANGE == 0
 static size_t buf_sizes[] = {
-   0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
-   256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
-   2048, 3072, 4096, 5120, 6144, 7168, 8192
+   1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+   129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
+   449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
+   2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
 };
 /* MUST be as large as largest packet size above */
 #define SMALL_BUFFER_SIZE   8192
@@ -78,7 +79,7 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT  16
+#define ALIGNMENT_UNIT  32

 /*
  * Pointers used in performance tests. The two large buffers are for uncached
@@ -94,19 +95,19 @@ init_buffers(void)
 {
unsigned i;

-   large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE, ALIGNMENT_UNIT);
+   large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_read == NULL)
goto error_large_buf_read;

-   large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE, ALIGNMENT_UNIT);
+   large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (large_buf_write == NULL)
goto error_large_buf_write;

-   small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE, ALIGNMENT_UNIT);
+   small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_read == NULL)
goto error_small_buf_read;

-   small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE, ALIGNMENT_UNIT);
+   small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
if (small_buf_write == NULL)
goto error_small_buf_write;

@@ -140,25 +141,25 @@ free_buffers(void)

 /*
  * Get a random offset into large array, with enough space needed to perform
- * max copy size. Offset is aligned.
+ * max copy size. Offset is aligned, uoffset is used for unalignment setting.
  */
 static inline size_t
-get_rand_offset(void)
+get_rand_offset(size_t uoffset)
 {
-   return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-   ~(ALIGNMENT_UNIT - 1));
+   return (((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
+   ~(ALIGNMENT_UNIT - 1)) + uoffset);
 }

 /* Fill in source and destination addresses. */
 static inline void
-fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
-   size_t *src_addr, int is_src_cached)
+fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
+                size_t *src_addr, int is_src_cached, size_t src_uoffset)
 {
unsigned int i;

for (i = 0; i < TEST_BATCH_SIZE; i++) {
-   dst_addr[i] = (is_dst_cached) ? 0 : get_rand_offset();
-   src_addr[i] = (is_src_cached) ? 0 : get_rand_offset();
+   dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
+   src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
}
 }

@@ -169,16 +170,17 @@ fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
  */
 static void
 do_uncached_write(uint8_t *dst, int is_dst_cached,
-   const uint8_t *src, int is_src_cached, size_t size)
+                 const uint8_t *src, int is_src_cached, size_t size)
 {
unsigned i, j;
size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];

for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
-   fill_addr_arrays(dst_addrs, is_dst_cached,
-                    src_addrs, is_src_cached);
-   for (j = 0; j < TEST_BATCH_SIZE; j++)
+   fill_addr_arrays(dst_addrs, is_dst_cached, 0,
+                    src_addrs, is_src_cached, 0);
+   for (j = 0; j < TEST_BATCH_SIZE; j++) {
rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
+   }
}
 }

@@ -186,52 +188,129 @@ do_uncached_write

[dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms

2015-01-19 Thread zhihong.w...@intel.com
Main code changes:

1. Differentiate architectural features based on CPU flags

a. Implement separate move functions for SSE/AVX/AVX2 to make full use of cache bandwidth

b. Implement a separate copy flow optimized specifically for each target architecture

2. Rewrite the memcpy function "rte_memcpy"

a. Add store aligning

b. Add load aligning based on architectural features

c. Put the block copy loop into the inline move functions for better control of instruction order

d. Eliminate unnecessary MOVs

3. Rewrite the inline move functions

a. Add move functions for unaligned load cases

b. Change instruction order in copy loops for better pipeline utilization

c. Use intrinsics instead of assembly code

4. Remove the slow glibc call for constant copies (see the sketch below)
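
For context on item 4, the header previously dispatched compile-time-constant
sizes to glibc's memcpy with a macro roughly like the sketch below (from
memory, not the exact removed code; rte_memcpy_func here stands in for the
non-constant path):

#include <stddef.h>
#include <string.h>

/* Stand-in for the old non-constant-size copy routine. */
static inline void *
rte_memcpy_func(void *dst, const void *src, size_t n)
{
	return memcpy(dst, src, n);
}

/* Sketch of the removed dispatch: constant sizes went to glibc memcpy(),
 * everything else to the DPDK routine. This patch drops the dispatch so
 * rte_memcpy() handles constant sizes itself. */
#define rte_memcpy_sketch(dst, src, n)        \
	(__builtin_constant_p(n) ?            \
	 memcpy((dst), (src), (n)) :          \
	 rte_memcpy_func((dst), (src), (n)))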

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 664 +++--
 1 file changed, 493 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..69a5c6f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_

+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include <stdio.h>
 #include <stdint.h>
 #include <string.h>
-#include <emmintrin.h>
+#include <x86intrin.h>

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so its address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));

-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2

+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   : [reg_a] "=x" (reg_a)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   __m128i xmm0;
+
+   xmm0 = _mm_loadu_si128((const __m128i *)src);
+   _mm_storeu_si128((__m128i *)dst, xmm0);
 }

+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
-}
+   __m256i ymm0;

-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-   __m128i reg_a, reg_b, reg_c;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c)
-   : [src] "r" (src),
- [dst] "r"(dst)
-   : "memory"
-   );
+   ymm0 = _mm256_loadu_si256((const __m256i *)src);
+   _mm256_storeu_si256((__m256i *)dst, ymm0);
 }

+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-   __m128i reg_a, reg_b, reg_c, reg_d;
-   asm volatile (
-   "movdqu (%[src]), %[reg_a]\n\t"
-   "movdqu 16(%[src]), %[reg_b]\n\t"
-   "movdqu 32(%[src]), %[reg_c]\n\t"
-   "movdqu 48(%[src]), %[reg_d]\n\t"
-   "movdqu %[reg_a], (%[dst])\n\t"
-   "movdqu %[reg_b], 16(%[dst])\n\t"
-   "movdqu %[reg_c], 32(%[dst])\n\t"
-   "movdqu %[reg_d], 48(%[dst])\n\t"
-   : [reg_a] "=x" (reg_a),
- [reg_b] "=x" (reg_b),
- [reg_c] "=x" (reg_c),
-  

[dpdk-dev] [PATCH] A fix to work around strict-aliasing rules breaking

2015-03-02 Thread zhihong.w...@intel.com
Fixed strict-aliasing rule violations that some GCC versions report as errors.
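
The pattern of the fix, as a minimal sketch (head_copy_old/head_copy_new are
hypothetical helpers): keep the running addresses in uintptr_t variables and
cast to a typed pointer only at each access, so the compiler never sees the
same pointer object re-accessed through incompatible types.

#include <stddef.h>
#include <stdint.h>

/* Before: stepping dst/src through differently typed pointer casts can
 * trigger -Werror=strict-aliasing on some GCC versions. */
static void
head_copy_old(void *dst, const void *src, size_t n)
{
	if (n & 0x01) {
		*(uint8_t *)dst = *(const uint8_t *)src;
		src = (const uint8_t *)src + 1;
		dst = (uint8_t *)dst + 1;
	}
	if (n & 0x02)
		*(uint16_t *)dst = *(const uint16_t *)src;
}

/* After: do the arithmetic on uintptr_t values; cast at the access only. */
static void
head_copy_new(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst;
	uintptr_t srcu = (uintptr_t)src;

	if (n & 0x01) {
		*(uint8_t *)dstu = *(const uint8_t *)srcu;
		srcu = (uintptr_t)((const uint8_t *)srcu + 1);
		dstu = (uintptr_t)((uint8_t *)dstu + 1);
	}
	if (n & 0x02)
		*(uint16_t *)dstu = *(const uint16_t *)srcu;
}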

Signed-off-by: Zhihong Wang 
---
 .../common/include/arch/x86/rte_memcpy.h   | 44 --
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 69a5c6f..f412099 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -195,6 +195,8 @@ rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+   uintptr_t dstu = (uintptr_t)dst;
+   uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
int dstofss;
int bits;
@@ -204,22 +206,22 @@ rte_memcpy(void *dst, const void *src, size_t n)
 */
if (n < 16) {
if (n & 0x01) {
-   *(uint8_t *)dst = *(const uint8_t *)src;
-   src = (const uint8_t *)src + 1;
-   dst = (uint8_t *)dst + 1;
+   *(uint8_t *)dstu = *(const uint8_t *)srcu;
+   srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+   dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
-   *(uint16_t *)dst = *(const uint16_t *)src;
-   src = (const uint16_t *)src + 1;
-   dst = (uint16_t *)dst + 1;
+   *(uint16_t *)dstu = *(const uint16_t *)srcu;
+   srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+   dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
-   *(uint32_t *)dst = *(const uint32_t *)src;
-   src = (const uint32_t *)src + 1;
-   dst = (uint32_t *)dst + 1;
+   *(uint32_t *)dstu = *(const uint32_t *)srcu;
+   srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+   dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
-   *(uint64_t *)dst = *(const uint64_t *)src;
+   *(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
@@ -458,6 +460,8 @@ static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+   uintptr_t dstu = (uintptr_t)dst;
+   uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
int dstofss;
int srcofs;
@@ -467,22 +471,22 @@ rte_memcpy(void *dst, const void *src, size_t n)
 */
if (n < 16) {
if (n & 0x01) {
-   *(uint8_t *)dst = *(const uint8_t *)src;
-   src = (const uint8_t *)src + 1;
-   dst = (uint8_t *)dst + 1;
+   *(uint8_t *)dstu = *(const uint8_t *)srcu;
+   srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+   dstu = (uintptr_t)((uint8_t *)dstu + 1);
}
if (n & 0x02) {
-   *(uint16_t *)dst = *(const uint16_t *)src;
-   src = (const uint16_t *)src + 1;
-   dst = (uint16_t *)dst + 1;
+   *(uint16_t *)dstu = *(const uint16_t *)srcu;
+   srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+   dstu = (uintptr_t)((uint16_t *)dstu + 1);
}
if (n & 0x04) {
-   *(uint32_t *)dst = *(const uint32_t *)src;
-   src = (const uint32_t *)src + 1;
-   dst = (uint32_t *)dst + 1;
+   *(uint32_t *)dstu = *(const uint32_t *)srcu;
+   srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+   dstu = (uintptr_t)((uint32_t *)dstu + 1);
}
if (n & 0x08) {
-   *(uint64_t *)dst = *(const uint64_t *)src;
+   *(uint64_t *)dstu = *(const uint64_t *)srcu;
}
return ret;
}
-- 
1.9.3