Hi Anatoly, > -----Original Message----- > From: Burakov, Anatoly > Sent: Monday, October 1, 2018 1:56 PM > To: dev@dpdk.org > Cc: Lu, Wenzhuo <wenzhuo...@intel.com>; Wu, Jingjing > <jingjing...@intel.com>; Iremonger, Bernard <bernard.iremon...@intel.com>; > Mcnamara, John <john.mcnam...@intel.com>; Kovacevic, Marko > <marko.kovace...@intel.com>; laszlo.madara...@ericsson.com; > laszlo.vadke...@ericsson.com; andras.kov...@ericsson.com; > winnie.t...@ericsson.com; daniel.andr...@ericsson.com; > janos.ko...@ericsson.com; geza.ko...@ericsson.com; > srinath.man...@broadcom.com; scott.bran...@broadcom.com; > ajit.khapa...@broadcom.com; Wiles, Keith <keith.wi...@intel.com>; > Richardson, Bruce <bruce.richard...@intel.com>; tho...@monjalon.net; > shreyansh.j...@nxp.com; shah...@mellanox.com; > arybche...@solarflare.com; alejandro.luc...@netronome.com > Subject: [PATCH v8 19/21] app/testpmd: add support for external memory > > Currently, mempools can only be allocated either using native DPDK memory, or > anonymous memory. This patch will add two new methods to allocate mempool > using external memory (regular or hugepage memory), and add documentation > about it to testpmd user guide. > > It adds a new flag "--mp-alloc", with four possible values: > native (use regular DPDK allocator), anon (use anonymous mempool), xmem > (use externally allocated memory area), and xmemhuge (use externally allocated > hugepage memory area). Old flag "--mp-anon" is kept for compatibility. > > All external memory is allocated using the same external heap, but each will > allocate and add a new memory area. 
> > Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com> > Suggested-by: Konstantin Ananyev <konstantin.anan...@intel.com> > --- > app/test-pmd/config.c | 21 +- > app/test-pmd/parameters.c | 23 +- > app/test-pmd/testpmd.c | 320 ++++++++++++++++++++++++-- > app/test-pmd/testpmd.h | 13 +- > doc/guides/testpmd_app_ug/run_app.rst | 12 + > 5 files changed, 364 insertions(+), 25 deletions(-) > > diff --git a/app/test-pmd/config.c b/app/test-pmd/config.c index > 794aa5268..3b921cfc6 100644 > --- a/app/test-pmd/config.c > +++ b/app/test-pmd/config.c > @@ -2423,6 +2423,23 @@ fwd_config_setup(void) > simple_fwd_config_setup(); > } > > +static const char * > +mp_alloc_to_str(uint8_t mode) > +{ > + switch (mode) { > + case MP_ALLOC_NATIVE: > + return "native"; > + case MP_ALLOC_ANON: > + return "anon"; > + case MP_ALLOC_XMEM: > + return "xmem"; > + case MP_ALLOC_XMEM_HUGE: > + return "xmemhuge"; > + default: > + return "invalid"; > + } > +} > + > void > pkt_fwd_config_display(struct fwd_config *cfg) { @@ -2431,12 +2448,12 @@ > pkt_fwd_config_display(struct fwd_config *cfg) > streamid_t sm_id; > > printf("%s packet forwarding%s - ports=%d - cores=%d - streams=%d - " > - "NUMA support %s, MP over anonymous pages %s\n", > + "NUMA support %s, MP allocation mode: %s\n", > cfg->fwd_eng->fwd_mode_name, > retry_enabled == 0 ? "" : " with retry", > cfg->nb_fwd_ports, cfg->nb_fwd_lcores, cfg- > >nb_fwd_streams, > numa_support == 1 ? "enabled" : "disabled", > - mp_anon != 0 ? 
"enabled" : "disabled"); > + mp_alloc_to_str(mp_alloc_type)); > > if (retry_enabled) > printf("TX retry num: %u, delay between TX retries: %uus\n", > diff --git a/app/test-pmd/parameters.c b/app/test-pmd/parameters.c index > 9220e1c1b..b4016668c 100644 > --- a/app/test-pmd/parameters.c > +++ b/app/test-pmd/parameters.c > @@ -190,6 +190,11 @@ usage(char* progname) > printf(" --vxlan-gpe-port=N: UPD port of tunnel VXLAN-GPE\n"); > printf(" --mlockall: lock all memory\n"); > printf(" --no-mlockall: do not lock all memory\n"); > + printf(" --mp-alloc <native|anon|xmem|xmemhuge>: mempool > allocation method.\n" > + " native: use regular DPDK memory to create and populate > mempool\n" > + " anon: use regular DPDK memory to create and anonymous > memory to populate mempool\n" > + " xmem: use anonymous memory to create and populate > mempool\n" > + " xmemhuge: use anonymous hugepage memory to create and > populate mempool\n"); > } > > #ifdef RTE_LIBRTE_CMDLINE > @@ -625,6 +630,7 @@ launch_args_parse(int argc, char** argv) > { "vxlan-gpe-port", 1, 0, 0 }, > { "mlockall", 0, 0, 0 }, > { "no-mlockall", 0, 0, 0 }, > + { "mp-alloc", 1, 0, 0 }, > { 0, 0, 0, 0 }, > }; > > @@ -743,7 +749,22 @@ launch_args_parse(int argc, char** argv) > if (!strcmp(lgopts[opt_idx].name, "numa")) > numa_support = 1; > if (!strcmp(lgopts[opt_idx].name, "mp-anon")) { > - mp_anon = 1; > + mp_alloc_type = MP_ALLOC_ANON; > + } > + if (!strcmp(lgopts[opt_idx].name, "mp-alloc")) { > + if (!strcmp(optarg, "native")) > + mp_alloc_type = MP_ALLOC_NATIVE; > + else if (!strcmp(optarg, "anon")) > + mp_alloc_type = MP_ALLOC_ANON; > + else if (!strcmp(optarg, "xmem")) > + mp_alloc_type = MP_ALLOC_XMEM; > + else if (!strcmp(optarg, "xmemhuge")) > + mp_alloc_type = > MP_ALLOC_XMEM_HUGE; > + else > + rte_exit(EXIT_FAILURE, > + "mp-alloc %s invalid - must be: > " > + "native, anon or xmem\n",
Should xmemhuge be added to above line? > + optarg); > } > if (!strcmp(lgopts[opt_idx].name, "port-numa-config")) > { > if (parse_portnuma_config(optarg)) > diff --git a/app/test-pmd/testpmd.c b/app/test-pmd/testpmd.c index > 001f0e552..255a9c664 100644 > --- a/app/test-pmd/testpmd.c > +++ b/app/test-pmd/testpmd.c > @@ -27,6 +27,7 @@ > #include <rte_log.h> > #include <rte_debug.h> > #include <rte_cycles.h> > +#include <rte_malloc_heap.h> > #include <rte_memory.h> > #include <rte_memcpy.h> > #include <rte_launch.h> > @@ -63,6 +64,22 @@ > > #include "testpmd.h" > > +#ifndef MAP_HUGETLB > +/* FreeBSD may not have MAP_HUGETLB (in fact, it probably doesn't) */ > +#define HUGE_FLAG (0x40000) #else #define HUGE_FLAG MAP_HUGETLB > #endif > + > +#ifndef MAP_HUGE_SHIFT > +/* older kernels (or FreeBSD) will not have this define */ #define > +HUGE_SHIFT (26) #else #define HUGE_SHIFT MAP_HUGE_SHIFT #endif > + > +#define EXTMEM_HEAP_NAME "extmem" > + > uint16_t verbose_level = 0; /**< Silent by default. */ int testpmd_logtype; > /**< > Log type for testpmd logs */ > > @@ -88,9 +105,13 @@ uint8_t numa_support = 1; /**< numa enabled by > default */ uint8_t socket_num = UMA_NO_CONFIG; > > /* > - * Use ANONYMOUS mapped memory (might be not physically continuous) for > mbufs. 
> + * Select mempool allocation type: > + * - native: use regular DPDK memory > + * - anon: use regular DPDK memory to create mempool, but populate using > + * anonymous memory (may not be IOVA-contiguous) > + * - xmem: use externally allocated hugepage memory > */ > -uint8_t mp_anon = 0; > +uint8_t mp_alloc_type = MP_ALLOC_NATIVE; > > /* > * Store specified sockets on which memory pool to be used by ports @@ - > 527,6 +548,231 @@ set_def_fwd_config(void) > set_default_fwd_ports_config(); > } > > +/* extremely pessimistic estimation of memory required to create a > +mempool */ static int calc_mem_size(uint32_t nb_mbufs, uint32_t > +mbuf_sz, size_t pgsz, size_t *out) { > + unsigned int n_pages, mbuf_per_pg, leftover; > + uint64_t total_mem, mbuf_mem, obj_sz; > + > + /* there is no good way to predict how much space the mempool will > + * occupy because it will allocate chunks on the fly, and some of those > + * will come from default DPDK memory while some will come from our > + * external memory, so just assume 128MB will be enough for everyone. 
> + */ > + uint64_t hdr_mem = 128 << 20; > + > + /* account for possible non-contiguousness */ > + obj_sz = rte_mempool_calc_obj_size(mbuf_sz, 0, NULL); > + if (obj_sz > pgsz) { > + TESTPMD_LOG(ERR, "Object size is bigger than page size\n"); > + return -1; > + } > + > + mbuf_per_pg = pgsz / obj_sz; > + leftover = (nb_mbufs % mbuf_per_pg) > 0; > + n_pages = (nb_mbufs / mbuf_per_pg) + leftover; > + > + mbuf_mem = n_pages * pgsz; > + > + total_mem = RTE_ALIGN(hdr_mem + mbuf_mem, pgsz); > + > + if (total_mem > SIZE_MAX) { > + TESTPMD_LOG(ERR, "Memory size too big\n"); > + return -1; > + } > + *out = (size_t)total_mem; > + > + return 0; > +} > + > +static inline uint32_t > +bsf64(uint64_t v) > +{ > + return (uint32_t)__builtin_ctzll(v); > +} > + > +static inline uint32_t > +log2_u64(uint64_t v) > +{ > + if (v == 0) > + return 0; > + v = rte_align64pow2(v); > + return bsf64(v); > +} > + > +static int > +pagesz_flags(uint64_t page_sz) > +{ > + /* as per mmap() manpage, all page sizes are log2 of page size > + * shifted by MAP_HUGE_SHIFT > + */ > + int log2 = log2_u64(page_sz); Missing blank line after declarations. 
> + return (log2 << HUGE_SHIFT); > +} > + > +static void * > +alloc_mem(size_t memsz, size_t pgsz, bool huge) { > + void *addr; > + int flags; > + > + /* allocate anonymous hugepages */ > + flags = MAP_ANONYMOUS | MAP_PRIVATE; > + if (huge) > + flags |= HUGE_FLAG | pagesz_flags(pgsz); > + > + addr = mmap(NULL, memsz, PROT_READ | PROT_WRITE, flags, -1, 0); > + if (addr == MAP_FAILED) > + return NULL; > + > + return addr; > +} > + > +struct extmem_param { > + void *addr; > + size_t len; > + size_t pgsz; > + rte_iova_t *iova_table; > + unsigned int iova_table_len; > +}; > + > +static int > +create_extmem(uint32_t nb_mbufs, uint32_t mbuf_sz, struct extmem_param > *param, > + bool huge) > +{ > + uint64_t pgsizes[] = {RTE_PGSIZE_2M, RTE_PGSIZE_1G, /* x86_64, ARM > */ > + RTE_PGSIZE_16M, RTE_PGSIZE_16G}; /* POWER */ > + unsigned int cur_page, n_pages, pgsz_idx; > + size_t mem_sz, cur_pgsz; > + rte_iova_t *iovas = NULL; > + void *addr; > + int ret; > + > + for (pgsz_idx = 0; pgsz_idx < RTE_DIM(pgsizes); pgsz_idx++) { > + /* skip anything that is too big */ > + if (pgsizes[pgsz_idx] > SIZE_MAX) > + continue; > + > + cur_pgsz = pgsizes[pgsz_idx]; > + > + /* if we were told not to allocate hugepages, override */ > + if (!huge) > + cur_pgsz = sysconf(_SC_PAGESIZE); > + > + ret = calc_mem_size(nb_mbufs, mbuf_sz, cur_pgsz, &mem_sz); > + if (ret < 0) { > + TESTPMD_LOG(ERR, "Cannot calculate memory > size\n"); > + return -1; > + } > + > + /* allocate our memory */ > + addr = alloc_mem(mem_sz, cur_pgsz, huge); > + > + /* if we couldn't allocate memory with a specified page size, > + * that doesn't mean we can't do it with other page sizes, so > + * try another one. 
> + */ > + if (addr == NULL) > + continue; > + > + /* store IOVA addresses for every page in this memory area */ > + n_pages = mem_sz / cur_pgsz; > + > + iovas = malloc(sizeof(*iovas) * n_pages); > + > + if (iovas == NULL) { > + TESTPMD_LOG(ERR, "Cannot allocate memory for iova > addresses\n"); > + goto fail; > + } > + /* lock memory if it's not huge pages */ > + if (!huge) > + mlock(addr, mem_sz); > + > + /* populate IOVA addresses */ > + for (cur_page = 0; cur_page < n_pages; cur_page++) { > + rte_iova_t iova; > + size_t offset; > + void *cur; > + > + offset = cur_pgsz * cur_page; > + cur = RTE_PTR_ADD(addr, offset); > + iova = rte_mem_virt2iova(cur); > + > + iovas[cur_page] = iova; > + } > + > + break; > + } > + /* if we couldn't allocate anything */ > + if (iovas == NULL) > + return -1; > + > + param->addr = addr; > + param->len = mem_sz; > + param->pgsz = cur_pgsz; > + param->iova_table = iovas; > + param->iova_table_len = n_pages; > + > + return 0; > +fail: > + if (iovas) > + free(iovas); > + if (addr) > + munmap(addr, mem_sz); > + > + return -1; > +} > + > +static int > +setup_extmem(uint32_t nb_mbufs, uint32_t mbuf_sz, bool huge) { > + struct extmem_param param; > + int socket_id, ret; > + > + memset(¶m, 0, sizeof(param)); > + > + /* check if our heap exists */ > + socket_id = rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME); > + if (socket_id < 0) { > + /* create our heap */ > + ret = rte_malloc_heap_create(EXTMEM_HEAP_NAME); > + if (ret < 0) { > + TESTPMD_LOG(ERR, "Cannot create heap\n"); > + return -1; > + } > + } > + > + ret = create_extmem(nb_mbufs, mbuf_sz, ¶m, huge); > + if (ret < 0) { > + TESTPMD_LOG(ERR, "Cannot create memory area\n"); > + return -1; > + } > + > + /* we now have a valid memory area, so add it to heap */ > + ret = rte_malloc_heap_memory_add(EXTMEM_HEAP_NAME, > + param.addr, param.len, param.iova_table, > + param.iova_table_len, param.pgsz); > + > + /* when using VFIO, memory is automatically mapped for DMA by EAL > */ > + > + /* not 
needed any more */ > + free(param.iova_table); > + > + if (ret < 0) { > + TESTPMD_LOG(ERR, "Cannot add memory to heap\n"); > + munmap(param.addr, param.len); > + return -1; > + } > + > + /* success */ > + > + TESTPMD_LOG(DEBUG, "Allocated %zuMB of external memory\n", > + param.len >> 20); > + > + return 0; > +} > + > /* > * Configuration initialisation done once at init time. > */ > @@ -545,27 +791,59 @@ mbuf_pool_create(uint16_t mbuf_seg_size, unsigned > nb_mbuf, > "create a new mbuf pool <%s>: n=%u, size=%u, socket=%u\n", > pool_name, nb_mbuf, mbuf_seg_size, socket_id); > > - if (mp_anon != 0) { > - rte_mp = rte_mempool_create_empty(pool_name, nb_mbuf, > - mb_size, (unsigned) mb_mempool_cache, > - sizeof(struct rte_pktmbuf_pool_private), > - socket_id, 0); > - if (rte_mp == NULL) > - goto err; > + switch (mp_alloc_type) { > + case MP_ALLOC_NATIVE: > + { > + /* wrapper to rte_mempool_create() */ > + TESTPMD_LOG(INFO, "preferred mempool ops > selected: %s\n", > + rte_mbuf_best_mempool_ops()); > + rte_mp = rte_pktmbuf_pool_create(pool_name, > nb_mbuf, > + mb_mempool_cache, 0, mbuf_seg_size, > socket_id); > + break; > + } > + case MP_ALLOC_ANON: > + { > + rte_mp = rte_mempool_create_empty(pool_name, > nb_mbuf, > + mb_size, (unsigned int) mb_mempool_cache, > + sizeof(struct rte_pktmbuf_pool_private), > + socket_id, 0); > + if (rte_mp == NULL) > + goto err; > + > + if (rte_mempool_populate_anon(rte_mp) == 0) { > + rte_mempool_free(rte_mp); > + rte_mp = NULL; > + goto err; > + } > + rte_pktmbuf_pool_init(rte_mp, NULL); > + rte_mempool_obj_iter(rte_mp, rte_pktmbuf_init, > NULL); > + break; > + } > + case MP_ALLOC_XMEM: > + case MP_ALLOC_XMEM_HUGE: > + { > + int heap_socket; > + bool huge = mp_alloc_type == > MP_ALLOC_XMEM_HUGE; > > - if (rte_mempool_populate_anon(rte_mp) == 0) { > - rte_mempool_free(rte_mp); > - rte_mp = NULL; > - goto err; > + if (setup_extmem(nb_mbuf, mbuf_seg_size, huge) < 0) > + rte_exit(EXIT_FAILURE, "Could not create > external memory\n"); > + > + 
heap_socket = > + > rte_malloc_heap_get_socket(EXTMEM_HEAP_NAME); > + if (heap_socket < 0) > + rte_exit(EXIT_FAILURE, "Could not get external > memory socket > +ID\n"); > + > + TESTPMD_LOG(INFO, "preferred mempool ops > selected: %s\n", > + rte_mbuf_best_mempool_ops()); > + rte_mp = rte_pktmbuf_pool_create(pool_name, > nb_mbuf, > + mb_mempool_cache, 0, > mbuf_seg_size, > + heap_socket); > + break; > + } > + default: > + { > + rte_exit(EXIT_FAILURE, "Invalid mempool creation > mode\n"); > } > - rte_pktmbuf_pool_init(rte_mp, NULL); > - rte_mempool_obj_iter(rte_mp, rte_pktmbuf_init, NULL); > - } else { > - /* wrapper to rte_mempool_create() */ > - TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n", > - rte_mbuf_best_mempool_ops()); > - rte_mp = rte_pktmbuf_pool_create(pool_name, nb_mbuf, > - mb_mempool_cache, 0, mbuf_seg_size, socket_id); > } > > err: > diff --git a/app/test-pmd/testpmd.h b/app/test-pmd/testpmd.h index > a1f661472..65e0cec90 100644 > --- a/app/test-pmd/testpmd.h > +++ b/app/test-pmd/testpmd.h > @@ -69,6 +69,16 @@ enum { > PORT_TOPOLOGY_LOOP, > }; > > +enum { > + MP_ALLOC_NATIVE, /**< allocate and populate mempool natively */ > + MP_ALLOC_ANON, > + /**< allocate mempool natively, but populate using anonymous > memory */ > + MP_ALLOC_XMEM, > + /**< allocate and populate mempool using anonymous memory */ > + MP_ALLOC_XMEM_HUGE > + /**< allocate and populate mempool using anonymous hugepage > memory */ > +}; > + > #ifdef RTE_TEST_PMD_RECORD_BURST_STATS > /** > * The data structure associated with RX and TX packet burst statistics @@ - > 304,7 +314,8 @@ extern uint8_t numa_support; /**< set by "--numa" > parameter */ extern uint16_t port_topology; /**< set by "--port-topology" > parameter */ extern uint8_t no_flush_rx; /**<set by "--no-flush-rx" parameter > */ extern uint8_t flow_isolate_all; /**< set by "--flow-isolate-all */ > -extern > uint8_t mp_anon; /**< set by "--mp-anon" parameter */ > +extern uint8_t mp_alloc_type; > +/**< set by "--mp-anon" 
or "--mp-alloc" parameter */ > extern uint8_t no_link_check; /**<set by "--disable-link-check" parameter */ > extern volatile int test_done; /* stop packet forwarding when set to 1. */ > extern > uint8_t lsc_interrupt; /**< disabled by "--no-lsc-interrupt" parameter */ > diff --git > a/doc/guides/testpmd_app_ug/run_app.rst > b/doc/guides/testpmd_app_ug/run_app.rst > index f301c2b6f..67a8532a4 100644 > --- a/doc/guides/testpmd_app_ug/run_app.rst > +++ b/doc/guides/testpmd_app_ug/run_app.rst > @@ -498,3 +498,15 @@ The commandline options are: > * ``--no-mlockall`` > > Disable locking all memory. > + > +* ``--mp-alloc <native|anon|xmem|xmemhuge>`` > + > + Select mempool allocation mode: > + > + * native: create and populate mempool using native DPDK memory > + * anon: create mempool using native DPDK memory, but populate using > + anonymous memory > + * xmem: create and populate mempool using externally and anonymously > + allocated area > + * xmemhuge: create and populate mempool using externally and > anonymously > + allocated hugepage area > -- > 2.17.1 The following checkpatch warnings in testpmd.c should probably be fixed. WARNING: line over 80 characters #332: FILE: app/test-pmd/testpmd.c:685: + TESTPMD_LOG(ERR, "Cannot allocate memory for iova addresses\n"); WARNING: line over 80 characters #441: FILE: app/test-pmd/testpmd.c:798: + TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n", WARNING: line over 80 characters #476: FILE: app/test-pmd/testpmd.c:829: + rte_exit(EXIT_FAILURE, "Could not create external memory\n"); WARNING: line over 80 characters #481: FILE: app/test-pmd/testpmd.c:834: + rte_exit(EXIT_FAILURE, "Could not get external memory socket ID\n"); WARNING: line over 80 characters #483: FILE: app/test-pmd/testpmd.c:836: + TESTPMD_LOG(INFO, "preferred mempool ops selected: %s\n", WARNING: line over 80 characters #492: FILE: app/test-pmd/testpmd.c:845: + rte_exit(EXIT_FAILURE, "Invalid mempool creation mode\n"); Regards, Bernard.