Provide a build option to have the functions in <rte_memcpy.h> delegate
to the standard compiler/libc memcpy(), instead of using the various
custom, handcrafted, per-architecture DPDK rte_memcpy()
implementations.

A new meson build option 'use_cc_memcpy' is added. By default,
the compiler/libc memcpy() is used.

The performance benefits of the custom DPDK rte_memcpy()
implementations have been diminishing with every compiler release, and
with current toolchains the use of a custom memcpy() implementation
may even be a liability.

This patch leaves an option to stay on the custom DPDK implementations,
should that prove beneficial for certain applications or architectures.

An additional benefit of this change is that compilers and static
analysis tools have an easier time detecting incorrect usage of
rte_memcpy() (e.g., buffer overruns, or overlapping source and
destination buffers).

Signed-off-by: Mattias Rönnblom <mattias.ronnb...@ericsson.com>
Acked-by: Morten Brørup <m...@smartsharesystems.com>

---

PATCH:
 o Add entry in release notes.
 o Update meson help text.

RFC v3:
 o Fix missing #endif on loongarch.
 o PPC and RISCV now implemented, meaning all architectures are supported.
 o Unnecessary <rte_vect.h> include is removed from <rte_memcpy.h>.

RFC v2:
 * Fix bug where rte_memcpy.h was not installed on x86.
 * Made attempt to make Loongarch compile.
---
 config/meson.build                     |  1 +
 doc/guides/rel_notes/release_24_07.rst | 21 +++++++++
 lib/eal/arm/include/rte_memcpy.h       | 10 +++++
 lib/eal/include/generic/rte_memcpy.h   | 61 +++++++++++++++++++++++---
 lib/eal/loongarch/include/rte_memcpy.h | 53 ++--------------------
 lib/eal/ppc/include/rte_memcpy.h       | 10 +++++
 lib/eal/riscv/include/rte_memcpy.h     | 53 ++--------------------
 lib/eal/x86/include/meson.build        |  1 +
 lib/eal/x86/include/rte_memcpy.h       | 11 ++++-
 meson_options.txt                      |  2 +
 10 files changed, 117 insertions(+), 106 deletions(-)

diff --git a/config/meson.build b/config/meson.build
index 8c8b019c25..456056628e 100644
--- a/config/meson.build
+++ b/config/meson.build
@@ -353,6 +353,7 @@ endforeach
 # set other values pulled from the build options
 dpdk_conf.set('RTE_MAX_ETHPORTS', get_option('max_ethports'))
 dpdk_conf.set('RTE_LIBEAL_USE_HPET', get_option('use_hpet'))
+dpdk_conf.set('RTE_USE_CC_MEMCPY', get_option('use_cc_memcpy'))
 dpdk_conf.set('RTE_ENABLE_STDATOMIC', get_option('enable_stdatomic'))
 dpdk_conf.set('RTE_ENABLE_TRACE_FP', get_option('enable_trace_fp'))
 dpdk_conf.set('RTE_PKTMBUF_HEADROOM', get_option('pkt_mbuf_headroom'))
diff --git a/doc/guides/rel_notes/release_24_07.rst b/doc/guides/rel_notes/release_24_07.rst
index a69f24cf99..4b6eafa86e 100644
--- a/doc/guides/rel_notes/release_24_07.rst
+++ b/doc/guides/rel_notes/release_24_07.rst
@@ -24,6 +24,27 @@ DPDK Release 24.07
 New Features
 ------------
 
+* **Compiler memcpy replaces custom DPDK implementation.**
+
+  The memory copy functions of ``<rte_memcpy.h>`` now delegate to the
+  standard memcpy() function, implemented by the compiler and the C
+  runtime (e.g., libc).
+
+  In this release of DPDK, the handcrafted, per-architecture memory
+  copy implementations are still available, and may be reactivated by
+  setting the new ``use_cc_memcpy`` build option to false.
+
+  The performance benefits of the custom DPDK rte_memcpy()
+  implementations have been diminishing with every new compiler
+  release, and with current toolchains the use of a custom memcpy()
+  implementation may even result in worse performance than the
+  standard memcpy().
+
+  An additional benefit of this change is that compilers and static
+  analysis tools have an easier time detecting incorrect usage of
+  rte_memcpy() (e.g., buffer overruns, or overlapping source and
+  destination buffers).
+
 .. This section should contain new features added in this release.
    Sample format:
 
diff --git a/lib/eal/arm/include/rte_memcpy.h b/lib/eal/arm/include/rte_memcpy.h
index 47dea9a8cc..e8aff722df 100644
--- a/lib/eal/arm/include/rte_memcpy.h
+++ b/lib/eal/arm/include/rte_memcpy.h
@@ -5,10 +5,20 @@
 #ifndef _RTE_MEMCPY_ARM_H_
 #define _RTE_MEMCPY_ARM_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #ifdef RTE_ARCH_64
 #include <rte_memcpy_64.h>
 #else
 #include <rte_memcpy_32.h>
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_ARM_H_ */
diff --git a/lib/eal/include/generic/rte_memcpy.h b/lib/eal/include/generic/rte_memcpy.h
index e7f0f8eaa9..cae06117fb 100644
--- a/lib/eal/include/generic/rte_memcpy.h
+++ b/lib/eal/include/generic/rte_memcpy.h
@@ -5,12 +5,19 @@
 #ifndef _RTE_MEMCPY_H_
 #define _RTE_MEMCPY_H_
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * @file
  *
  * Functions for vectorised implementation of memcpy().
  */
 
+#include <stdint.h>
+#include <string.h>
+
 /**
  * Copy 16 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -35,8 +42,6 @@ rte_mov16(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov32(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy 48 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -49,8 +54,6 @@ rte_mov32(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov48(uint8_t *dst, const uint8_t *src);
 
-#endif /* __DOXYGEN__ */
-
 /**
  * Copy 64 bytes from one location to another using optimised
  * instructions. The locations should not overlap.
@@ -87,8 +90,6 @@ rte_mov128(uint8_t *dst, const uint8_t *src);
 static inline void
 rte_mov256(uint8_t *dst, const uint8_t *src);
 
-#ifdef __DOXYGEN__
-
 /**
  * Copy bytes from one location to another. The locations must not overlap.
  *
@@ -111,6 +112,52 @@ rte_mov256(uint8_t *dst, const uint8_t *src);
 static void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
-#endif /* __DOXYGEN__ */
+#ifdef RTE_USE_CC_MEMCPY
+static inline void
+rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 16);
+}
+
+static inline void
+rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 32);
+}
+
+static inline void
+rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 48);
+}
+
+static inline void
+rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 64);
+}
+
+static inline void
+rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 128);
+}
+
+static inline void
+rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+       memcpy(dst, src, 256);
+}
+
+static inline void *
+rte_memcpy(void *dst, const void *src, size_t n)
+{
+       return memcpy(dst, src, n);
+}
+#endif /* RTE_USE_CC_MEMCPY */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* _RTE_MEMCPY_H_ */
diff --git a/lib/eal/loongarch/include/rte_memcpy.h b/lib/eal/loongarch/include/rte_memcpy.h
index 22578d40f4..344b4416b5 100644
--- a/lib/eal/loongarch/include/rte_memcpy.h
+++ b/lib/eal/loongarch/include/rte_memcpy.h
@@ -5,57 +5,12 @@
 #ifndef RTE_MEMCPY_LOONGARCH_H
 #define RTE_MEMCPY_LOONGARCH_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_LOONGARCH_H */
diff --git a/lib/eal/ppc/include/rte_memcpy.h b/lib/eal/ppc/include/rte_memcpy.h
index 6f388c0234..645fd83986 100644
--- a/lib/eal/ppc/include/rte_memcpy.h
+++ b/lib/eal/ppc/include/rte_memcpy.h
@@ -6,6 +6,14 @@
 #ifndef _RTE_MEMCPY_PPC_64_H_
 #define _RTE_MEMCPY_PPC_64_H_
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdint.h>
 #include <string.h>
 
@@ -215,4 +223,6 @@ rte_memcpy_func(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_PPC_64_H_ */
diff --git a/lib/eal/riscv/include/rte_memcpy.h b/lib/eal/riscv/include/rte_memcpy.h
index e34f19396e..4acdc4af5f 100644
--- a/lib/eal/riscv/include/rte_memcpy.h
+++ b/lib/eal/riscv/include/rte_memcpy.h
@@ -7,57 +7,12 @@
 #ifndef RTE_MEMCPY_RISCV_H
 #define RTE_MEMCPY_RISCV_H
 
-#include <stdint.h>
-#include <string.h>
+#include <rte_config.h>
 
-#include "rte_common.h"
-
-#ifdef __cplusplus
-extern "C" {
+#ifndef RTE_USE_CC_MEMCPY
+#define RTE_USE_CC_MEMCPY
 #endif
 
-#include "generic/rte_memcpy.h"
-
-static inline void
-rte_mov16(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 16);
-}
-
-static inline void
-rte_mov32(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 32);
-}
-
-static inline void
-rte_mov48(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 48);
-}
-
-static inline void
-rte_mov64(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 64);
-}
-
-static inline void
-rte_mov128(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 128);
-}
-
-static inline void
-rte_mov256(uint8_t *dst, const uint8_t *src)
-{
-       memcpy(dst, src, 256);
-}
-
-#define rte_memcpy(d, s, n)    memcpy((d), (s), (n))
-
-#ifdef __cplusplus
-}
-#endif
+#include <generic/rte_memcpy.h>
 
 #endif /* RTE_MEMCPY_RISCV_H */
diff --git a/lib/eal/x86/include/meson.build b/lib/eal/x86/include/meson.build
index 52d2f8e969..09c2fe2485 100644
--- a/lib/eal/x86/include/meson.build
+++ b/lib/eal/x86/include/meson.build
@@ -16,6 +16,7 @@ arch_headers = files(
         'rte_spinlock.h',
         'rte_vect.h',
 )
+
 arch_indirect_headers = files(
         'rte_atomic_32.h',
         'rte_atomic_64.h',
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 72a92290e0..c5ba74d2ed 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -11,12 +11,19 @@
  * Functions for SSE/AVX/AVX2/AVX512 implementation of memcpy().
  */
 
+#include <rte_config.h>
+
+#ifdef RTE_USE_CC_MEMCPY
+
+#include <generic/rte_memcpy.h>
+
+#else
+
 #include <stdio.h>
 #include <stdint.h>
 #include <string.h>
 #include <rte_vect.h>
 #include <rte_common.h>
-#include <rte_config.h>
 
 #ifdef __cplusplus
 extern "C" {
@@ -878,4 +885,6 @@ rte_memcpy(void *dst, const void *src, size_t n)
 }
 #endif
 
+#endif /* RTE_USE_CC_MEMCPY */
+
 #endif /* _RTE_MEMCPY_X86_64_H_ */
diff --git a/meson_options.txt b/meson_options.txt
index e49b2fc089..06f544b631 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -60,3 +60,5 @@ option('tests', type: 'boolean', value: true, description:
        'build unit tests')
 option('use_hpet', type: 'boolean', value: false, description:
        'use HPET timer in EAL')
+option('use_cc_memcpy', type: 'boolean', value: true, description:
+       'Have the functions of <rte_memcpy.h> delegate to compiler/libc memcpy() instead of using custom implementation.')
-- 
2.34.1

Reply via email to