[dpdk-dev] [PATCH 0/4] DPDK memcpy optimization
This patch set optimizes memcpy for DPDK for both SSE and AVX platforms.
It also extends memcpy test coverage with unaligned cases and more test
points.

Optimization techniques are summarized below:

1. Utilize full cache bandwidth
2. Enforce aligned stores
3. Apply load address alignment based on architecture features
4. Make load/store address available as early as possible
5. General optimization techniques like inlining, branch reducing,
   prefetch pattern access

Zhihong Wang (4):
  Disabled VTA for memcpy test in app/test/Makefile
  Removed unnecessary test cases in test_memcpy.c
  Extended test coverage in test_memcpy_perf.c
  Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX
    platforms

 app/test/Makefile                        |   6 +
 app/test/test_memcpy.c                   |  52 +-
 app/test/test_memcpy_perf.c              | 238 +---
 .../common/include/arch/x86/rte_memcpy.h | 664 +++--
 4 files changed, 656 insertions(+), 304 deletions(-)

--
1.9.3
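To make technique 2 ("Enforce aligned stores") concrete: copy one unaligned 16-byte head, then round the destination up to a 16-byte boundary so every store in the bulk loop is aligned. A minimal SSE2 sketch, assuming n >= 16; demo_copy_aligned is a hypothetical name, not code from this patch set:

#include <stdint.h>
#include <string.h>
#include <emmintrin.h> /* SSE2 intrinsics */

static void
demo_copy_aligned(uint8_t *dst, const uint8_t *src, size_t n)
{
	/* Head: one unaligned 16-byte copy covers dst[0..15]. */
	_mm_storeu_si128((__m128i *)dst,
			_mm_loadu_si128((const __m128i *)src));

	/* Advance both cursors so dst lands on a 16-byte boundary; the
	 * skipped bytes were already written by the head copy. */
	size_t dstofs = 16 - ((uintptr_t)dst & 15);
	dst += dstofs;
	src += dstofs;
	n -= dstofs;

	/* Bulk: every store is now aligned; loads may stay unaligned. */
	while (n >= 16) {
		_mm_store_si128((__m128i *)dst,
				_mm_loadu_si128((const __m128i *)src));
		dst += 16;
		src += 16;
		n -= 16;
	}

	/* Tail: plain memcpy keeps the sketch short; the real code uses
	 * overlapping vector moves for the remainder instead. */
	memcpy(dst, src, n);
}

Aligning stores costs a few doubly written head bytes but avoids split stores across cache-line boundaries for the entire bulk loop, which is where techniques 1 and 2 pay off.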
[dpdk-dev] [PATCH 1/4] app/test: Disabled VTA for memcpy test in app/test/Makefile
VTA is for debugging only; it increases compile time and binary size,
especially when there are a lot of inlines. So disable it, since the
memcpy test contains a lot of inline calls.

Signed-off-by: Zhihong Wang
---
 app/test/Makefile | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/app/test/Makefile b/app/test/Makefile
index 4311f96..94dbadf 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -143,6 +143,12 @@ CFLAGS_test_kni.o += -Wno-deprecated-declarations
 endif

 CFLAGS += -D_GNU_SOURCE

+# Disable VTA for memcpy test
+ifeq ($(CC), gcc)
+CFLAGS_test_memcpy.o += -fno-var-tracking-assignments
+CFLAGS_test_memcpy_perf.o += -fno-var-tracking-assignments
+endif
+
 # this application needs libraries first
 DEPDIRS-y += lib
--
1.9.3
[dpdk-dev] [PATCH 2/4] app/test: Removed unnecessary test cases in test_memcpy.c
Removed unnecessary test cases for base move functions since the
function "func_test" covers them all.

Signed-off-by: Zhihong Wang
---
 app/test/test_memcpy.c | 52 +-
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/app/test/test_memcpy.c b/app/test/test_memcpy.c
index 56b8e1e..b2bb4e0 100644
--- a/app/test/test_memcpy.c
+++ b/app/test/test_memcpy.c
@@ -78,56 +78,9 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT 16
+#define ALIGNMENT_UNIT 32

-
-/* Structure with base memcpy func pointer, and number of bytes it copies */
-struct base_memcpy_func {
-	void (*func)(uint8_t *dst, const uint8_t *src);
-	unsigned size;
-};
-
-/* To create base_memcpy_func structure entries */
-#define BASE_FUNC(n) {rte_mov##n, n}
-
-/* Max number of bytes that can be copies with a "base" memcpy functions */
-#define MAX_BASE_FUNC_SIZE 256
-
-/*
- * Test the "base" memcpy functions, that a copy fixed number of bytes.
- */
-static int
-base_func_test(void)
-{
-	const struct base_memcpy_func base_memcpy_funcs[6] = {
-		BASE_FUNC(16),
-		BASE_FUNC(32),
-		BASE_FUNC(48),
-		BASE_FUNC(64),
-		BASE_FUNC(128),
-		BASE_FUNC(256),
-	};
-	unsigned i, j;
-	unsigned num_funcs = sizeof(base_memcpy_funcs) / sizeof(base_memcpy_funcs[0]);
-	uint8_t dst[MAX_BASE_FUNC_SIZE];
-	uint8_t src[MAX_BASE_FUNC_SIZE];
-
-	for (i = 0; i < num_funcs; i++) {
-		unsigned size = base_memcpy_funcs[i].size;
-		for (j = 0; j < size; j++) {
-			dst[j] = 0;
-			src[j] = (uint8_t) rte_rand();
-		}
-		base_memcpy_funcs[i].func(dst, src);
-		for (j = 0; j < size; j++)
-			if (dst[j] != src[j])
-				return -1;
-	}
-
-	return 0;
-}
-
 /*
  * Create two buffers, and initialise one with random values. These are copied
  * to the second buffer and then compared to see if the copy was successful.
@@ -218,9 +171,6 @@ test_memcpy(void)
 	ret = func_test();
 	if (ret != 0)
 		return -1;
-	ret = base_func_test();
-	if (ret != 0)
-		return -1;

 	return 0;
 }
--
1.9.3
[dpdk-dev] [PATCH 3/4] app/test: Extended test coverage in test_memcpy_perf.c
Main code changes:

1. Added more typical data points for a thorough performance test
2. Added unaligned test cases since they are common in DPDK usage

Signed-off-by: Zhihong Wang
---
 app/test/test_memcpy_perf.c | 238 +---
 1 file changed, 156 insertions(+), 82 deletions(-)

diff --git a/app/test/test_memcpy_perf.c b/app/test/test_memcpy_perf.c
index 7809610..4875af1 100644
--- a/app/test/test_memcpy_perf.c
+++ b/app/test/test_memcpy_perf.c
@@ -54,9 +54,10 @@
 /* List of buffer sizes to test */
 #if TEST_VALUE_RANGE == 0
 static size_t buf_sizes[] = {
-	0, 1, 7, 8, 9, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128, 129, 255,
-	256, 257, 320, 384, 511, 512, 513, 1023, 1024, 1025, 1518, 1522, 1600,
-	2048, 3072, 4096, 5120, 6144, 7168, 8192
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 31, 32, 33, 63, 64, 65, 127, 128,
+	129, 191, 192, 193, 255, 256, 257, 319, 320, 321, 383, 384, 385, 447, 448,
+	449, 511, 512, 513, 767, 768, 769, 1023, 1024, 1025, 1518, 1522, 1536, 1600,
+	2048, 2560, 3072, 3584, 4096, 4608, 5120, 5632, 6144, 6656, 7168, 7680, 8192
 };
 /* MUST be as large as largest packet size above */
 #define SMALL_BUFFER_SIZE 8192
@@ -78,7 +79,7 @@ static size_t buf_sizes[TEST_VALUE_RANGE];
 #define TEST_BATCH_SIZE 100

 /* Data is aligned on this many bytes (power of 2) */
-#define ALIGNMENT_UNIT 16
+#define ALIGNMENT_UNIT 32

 /*
  * Pointers used in performance tests. The two large buffers are for uncached
@@ -94,19 +95,19 @@ init_buffers(void)
 {
 	unsigned i;

-	large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE, ALIGNMENT_UNIT);
+	large_buf_read = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 	if (large_buf_read == NULL)
 		goto error_large_buf_read;

-	large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE, ALIGNMENT_UNIT);
+	large_buf_write = rte_malloc("memcpy", LARGE_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 	if (large_buf_write == NULL)
 		goto error_large_buf_write;

-	small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE, ALIGNMENT_UNIT);
+	small_buf_read = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 	if (small_buf_read == NULL)
 		goto error_small_buf_read;

-	small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE, ALIGNMENT_UNIT);
+	small_buf_write = rte_malloc("memcpy", SMALL_BUFFER_SIZE + ALIGNMENT_UNIT, ALIGNMENT_UNIT);
 	if (small_buf_write == NULL)
 		goto error_small_buf_write;

@@ -140,25 +141,25 @@ free_buffers(void)

 /*
  * Get a random offset into large array, with enough space needed to perform
- * max copy size. Offset is aligned.
+ * max copy size. Offset is aligned, uoffset is used for unalignment setting.
  */
 static inline size_t
-get_rand_offset(void)
+get_rand_offset(size_t uoffset)
 {
-	return ((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
-			~(ALIGNMENT_UNIT - 1));
+	return (((rte_rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE)) &
+			~(ALIGNMENT_UNIT - 1)) + uoffset);
 }

 /* Fill in source and destination addresses. */
 static inline void
-fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
-		size_t *src_addr, int is_src_cached)
+fill_addr_arrays(size_t *dst_addr, int is_dst_cached, size_t dst_uoffset,
+		size_t *src_addr, int is_src_cached, size_t src_uoffset)
 {
 	unsigned int i;

 	for (i = 0; i < TEST_BATCH_SIZE; i++) {
-		dst_addr[i] = (is_dst_cached) ? 0 : get_rand_offset();
-		src_addr[i] = (is_src_cached) ? 0 : get_rand_offset();
+		dst_addr[i] = (is_dst_cached) ? dst_uoffset : get_rand_offset(dst_uoffset);
+		src_addr[i] = (is_src_cached) ? src_uoffset : get_rand_offset(src_uoffset);
 	}
 }

@@ -169,16 +170,17 @@ fill_addr_arrays(size_t *dst_addr, int is_dst_cached,
  */
 static void
 do_uncached_write(uint8_t *dst, int is_dst_cached,
-		const uint8_t *src, int is_src_cached, size_t size)
+		const uint8_t *src, int is_src_cached, size_t size)
 {
 	unsigned i, j;
 	size_t dst_addrs[TEST_BATCH_SIZE], src_addrs[TEST_BATCH_SIZE];

 	for (i = 0; i < (TEST_ITERATIONS / TEST_BATCH_SIZE); i++) {
-		fill_addr_arrays(dst_addrs, is_dst_cached,
-				src_addrs, is_src_cached);
-		for (j = 0; j < TEST_BATCH_SIZE; j++)
+		fill_addr_arrays(dst_addrs, is_dst_cached, 0,
+				src_addrs, is_src_cached, 0);
+		for (j = 0; j < TEST_BATCH_SIZE; j++) {
 			rte_memcpy(dst+dst_addrs[j], src+src_addrs[j], size);
+		}
 	}
 }

@@ -186,52 +188,129 @@ do_uncached_writ
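The unaligned cases above boil down to one idea: draw a random base offset, round it down to an ALIGNMENT_UNIT boundary, then add a small uoffset to create a controlled misalignment (0 keeps the old aligned behaviour). A standalone sketch, with rand() and the size constants as simplified stand-ins for the test's own:

#include <stddef.h>
#include <stdlib.h>

#define ALIGNMENT_UNIT          32
#define LARGE_BUFFER_SIZE       (64 * 1024 * 1024) /* stand-in value */
#define SMALL_BUFFER_SIZE       8192

static size_t
get_rand_offset(size_t uoffset)
{
	/* Random base offset that leaves room for the largest copy. */
	size_t base = (size_t)rand() % (LARGE_BUFFER_SIZE - SMALL_BUFFER_SIZE);

	/* Round down to an aligned boundary, then push the result off
	 * alignment by exactly uoffset bytes (0 = aligned case). */
	return (base & ~(size_t)(ALIGNMENT_UNIT - 1)) + uoffset;
}

This is also why init_buffers() now over-allocates every buffer by ALIGNMENT_UNIT: the extra bytes guarantee an offset address never runs past the end of the buffer.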
[dpdk-dev] [PATCH 4/4] lib/librte_eal: Optimized memcpy in arch/x86/rte_memcpy.h for both SSE and AVX platforms
Main code changes:

1. Differentiate architectural features based on CPU flags
   a. Implement separated move functions for SSE/AVX/AVX2 to make full
      utilization of cache bandwidth
   b. Implement separated copy flow specifically optimized for target
      architecture
2. Rewrite the memcpy function "rte_memcpy"
   a. Add store aligning
   b. Add load aligning based on architectural features
   c. Put block copy loop into inline move functions for better control
      of instruction order
   d. Eliminate unnecessary MOVs
3. Rewrite the inline move functions
   a. Add move functions for unaligned load cases
   b. Change instruction order in copy loops for better pipeline
      utilization
   c. Use intrinsics instead of assembly code
4. Remove slow glibc call for constant copies

Signed-off-by: Zhihong Wang
---
 .../common/include/arch/x86/rte_memcpy.h | 664 +++--
 1 file changed, 493 insertions(+), 171 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index fb9eba8..69a5c6f 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -34,166 +34,189 @@
 #ifndef _RTE_MEMCPY_X86_64_H_
 #define _RTE_MEMCPY_X86_64_H_

+/**
+ * @file
+ *
+ * Functions for SSE/AVX/AVX2 implementation of memcpy().
+ */
+
+#include
 #include
 #include
-#include

 #ifdef __cplusplus
 extern "C" {
 #endif

-#include "generic/rte_memcpy.h"
+/**
+ * Copy bytes from one location to another. The locations must not overlap.
+ *
+ * @note This is implemented as a macro, so it's address should not be taken
+ * and care is needed as parameter expressions may be evaluated multiple times.
+ *
+ * @param dst
+ *   Pointer to the destination of the data.
+ * @param src
+ *   Pointer to the source data.
+ * @param n
+ *   Number of bytes to copy.
+ * @return
+ *   Pointer to the destination data.
+ */
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n) __attribute__((always_inline));

-#ifdef __INTEL_COMPILER
-#pragma warning(disable:593) /* Stop unused variable warning (reg_a etc). */
-#endif
+#ifdef RTE_MACHINE_CPUFLAG_AVX2

+/**
+ * AVX2 implementation below
+ */
+
+/**
+ * Copy 16 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		: [reg_a] "=x" (reg_a)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	__m128i xmm0;
+
+	xmm0 = _mm_loadu_si128((const __m128i *)src);
+	_mm_storeu_si128((__m128i *)dst, xmm0);
 }

+/**
+ * Copy 32 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
-}
+	__m256i ymm0;

-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-	__m128i reg_a, reg_b, reg_c;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c)
-		: [src] "r" (src),
-		  [dst] "r"(dst)
-		: "memory"
-	);
+	ymm0 = _mm256_loadu_si256((const __m256i *)src);
+	_mm256_storeu_si256((__m256i *)dst, ymm0);
 }

+/**
+ * Copy 64 bytes from one location to another,
+ * locations should not overlap.
+ */
 static inline void
 rte_mov64(uint8_t *dst, const uint8_t *src)
 {
-	__m128i reg_a, reg_b, reg_c, reg_d;
-	asm volatile (
-		"movdqu (%[src]), %[reg_a]\n\t"
-		"movdqu 16(%[src]), %[reg_b]\n\t"
-		"movdqu 32(%[src]), %[reg_c]\n\t"
-		"movdqu 48(%[src]), %[reg_d]\n\t"
-		"movdqu %[reg_a], (%[dst])\n\t"
-		"movdqu %[reg_b], 16(%[dst])\n\t"
-		"movdqu %[reg_c], 32(%[dst])\n\t"
-		"movdqu %[reg_d], 48(%[dst])\n\t"
-		: [reg_a] "=x" (reg_a),
-		  [reg_b] "=x" (reg_b),
-		  [reg_c] "=x" (reg_c),
-
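Change 3b above ("Change instruction order in copy loops") is easiest to see in isolation: within each block, all loads are issued before any store, so the pipeline gets independent work instead of serialized load/store pairs. A minimal AVX2 sketch (compile with -mavx2; the patch gates its AVX2 path on RTE_MACHINE_CPUFLAG_AVX2); demo_mov64loop is a hypothetical name, and the alignment handling and tail copies of the real patch are omitted:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h> /* AVX2 intrinsics */

static void
demo_mov64loop(uint8_t *dst, const uint8_t *src, size_t n)
{
	__m256i ymm0, ymm1;

	/* Move 64 bytes per iteration: two 32-byte loads grouped ahead
	 * of the two 32-byte stores that consume them. */
	while (n >= 64) {
		ymm0 = _mm256_loadu_si256((const __m256i *)(src + 0));
		ymm1 = _mm256_loadu_si256((const __m256i *)(src + 32));
		_mm256_storeu_si256((__m256i *)(dst + 0), ymm0);
		_mm256_storeu_si256((__m256i *)(dst + 32), ymm1);
		src += 64;
		dst += 64;
		n -= 64;
	}
}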
[dpdk-dev] [PATCH] A fix to work around strict-aliasing rules breaking
Fixed errors caused by breaking strict-aliasing rules with some GCC
versions.

Signed-off-by: Zhihong Wang
---
 .../common/include/arch/x86/rte_memcpy.h | 44 --
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
index 69a5c6f..f412099 100644
--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -195,6 +195,8 @@ rte_mov256blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	int dstofss;
 	int bits;
@@ -204,22 +206,22 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	 */
 	if (n < 16) {
 		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08) {
-			*(uint64_t *)dst = *(const uint64_t *)src;
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		}
 		return ret;
 	}
@@ -458,6 +460,8 @@ static inline void *
 rte_memcpy(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
+	uintptr_t dstu = (uintptr_t)dst;
+	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	int dstofss;
 	int srcofs;
@@ -467,22 +471,22 @@ rte_memcpy(void *dst, const void *src, size_t n)
 	 */
 	if (n < 16) {
 		if (n & 0x01) {
-			*(uint8_t *)dst = *(const uint8_t *)src;
-			src = (const uint8_t *)src + 1;
-			dst = (uint8_t *)dst + 1;
+			*(uint8_t *)dstu = *(const uint8_t *)srcu;
+			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
+			dstu = (uintptr_t)((uint8_t *)dstu + 1);
 		}
 		if (n & 0x02) {
-			*(uint16_t *)dst = *(const uint16_t *)src;
-			src = (const uint16_t *)src + 1;
-			dst = (uint16_t *)dst + 1;
+			*(uint16_t *)dstu = *(const uint16_t *)srcu;
+			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
+			dstu = (uintptr_t)((uint16_t *)dstu + 1);
 		}
 		if (n & 0x04) {
-			*(uint32_t *)dst = *(const uint32_t *)src;
-			src = (const uint32_t *)src + 1;
-			dst = (uint32_t *)dst + 1;
+			*(uint32_t *)dstu = *(const uint32_t *)srcu;
+			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
+			dstu = (uintptr_t)((uint32_t *)dstu + 1);
 		}
 		if (n & 0x08) {
-			*(uint64_t *)dst = *(const uint64_t *)src;
+			*(uint64_t *)dstu = *(const uint64_t *)srcu;
 		}
 		return ret;
 	}
--
1.9.3
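The workaround is a standard pattern: carry the moving cursor in a uintptr_t and cast back to the access type only at each dereference. Routing the address through an integer sidesteps GCC's strict-aliasing diagnostics, which track pointer-to-pointer casts but not integer round-trips. A minimal sketch of just the sub-16-byte head copy; demo_copy_head is a hypothetical name, and the increments are written as plain integer arithmetic, equivalent to the patch's cast round-trips:

#include <stddef.h>
#include <stdint.h>

static void *
demo_copy_head(void *dst, const void *src, size_t n)
{
	uintptr_t dstu = (uintptr_t)dst; /* integer copies of the cursors */
	uintptr_t srcu = (uintptr_t)src;
	void *ret = dst;

	/* n < 16 assumed: peel off a 1/2/4/8-byte chunk per set bit of n. */
	if (n & 0x01) {
		*(uint8_t *)dstu = *(const uint8_t *)srcu;
		srcu += sizeof(uint8_t);
		dstu += sizeof(uint8_t);
	}
	if (n & 0x02) {
		*(uint16_t *)dstu = *(const uint16_t *)srcu;
		srcu += sizeof(uint16_t);
		dstu += sizeof(uint16_t);
	}
	if (n & 0x04) {
		*(uint32_t *)dstu = *(const uint32_t *)srcu;
		srcu += sizeof(uint32_t);
		dstu += sizeof(uint32_t);
	}
	if (n & 0x08)
		*(uint64_t *)dstu = *(const uint64_t *)srcu;

	return ret;
}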