This code is not used by anything yet. The bulk of it is copied from the
old memory allocation code (eal_memory.c). We provide an API to allocate
either a single page or multiple pages, guaranteeing that the VA for all
of the requested pages will be contiguous.

Signed-off-by: Anatoly Burakov <anatoly.bura...@intel.com>
---
 lib/librte_eal/common/eal_memalloc.h       |  47 ++++
 lib/librte_eal/linuxapp/eal/Makefile       |   2 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 416 +++++++++++++++++++++++++++++
 3 files changed, 465 insertions(+)
 create mode 100644 lib/librte_eal/common/eal_memalloc.h
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_memalloc.c
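
Below is a minimal usage sketch of the new API (a hypothetical caller
inside the EAL, not part of this patch; the page sizes, socket id,
function name and error handling are illustrative only):

        #include <rte_memory.h>
        #include "eal_memalloc.h"

        static int
        grow_example(void)
        {
                struct rte_memseg *ms[4];

                /* request exactly four 2M pages on socket 0, VA-contiguous */
                if (eal_memalloc_alloc_page_bulk(ms, 4, RTE_PGSIZE_2M, 0,
                                true) < 0)
                        return -1;

                /* or take a single 1G page on socket 0 */
                if (eal_memalloc_alloc_page(RTE_PGSIZE_1G, 0) == NULL)
                        return -1;

                return 0;
        }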

diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100644
index 0000000..59fd330
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,47 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include <stdbool.h>
+
+#include <rte_memory.h>
+
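+/* Allocate one page of the specified size on the specified socket and
+ * return the resulting memseg (NULL on failure).
+ */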
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket);
+
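+/* Allocate 'n' pages of the specified size on the specified socket, with
+ * VA-contiguous addresses, storing the resulting memsegs in 'ms' (if not
+ * NULL). If 'exact' is true, fail unless all 'n' pages can be allocated.
+ * Returns the number of pages allocated, or -1 on failure.
+ */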
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
+               int socket, bool exact);
+
+#endif /* EAL_MEMALLOC_H */
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 782e1ad..88f10e9 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -62,6 +62,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
@@ -105,6 +106,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
 CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
 CFLAGS_eal_timer.o := -D_GNU_SOURCE
 CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
 CFLAGS_eal_thread.o := -D_GNU_SOURCE
 CFLAGS_eal_log.o := -D_GNU_SOURCE
 CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100644
index 0000000..527c2f6
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,416 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+       siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put sigsetjmp into a wrapper function to avoid compiler warnings: any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+       return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+       sigset_t mask;
+       struct sigaction action;
+
+       sigemptyset(&mask);
+       sigaddset(&mask, SIGBUS);
+       action.sa_flags = 0;
+       action.sa_mask = mask;
+       action.sa_handler = huge_sigbus_handler;
+
+       huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+       if (huge_need_recover) {
+               sigaction(SIGBUS, &huge_action_old, NULL);
+               huge_need_recover = 0;
+       }
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id)
+{
+       bool have_numa = true;
+
+       /* Check if kernel supports NUMA. */
+       if (numa_available() != 0) {
+               RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+               have_numa = false;
+       }
+
+       if (have_numa) {
+               RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+               if (get_mempolicy(oldpolicy, oldmask->maskp,
+                                 oldmask->size + 1, 0, 0) < 0) {
+                       RTE_LOG(ERR, EAL,
+                               "Failed to get current mempolicy: %s. "
+                               "Assuming MPOL_DEFAULT.\n", strerror(errno));
+                       *oldpolicy = MPOL_DEFAULT;
+               }
+               RTE_LOG(DEBUG, EAL,
+                       "Setting policy MPOL_PREFERRED for socket %d\n",
+                       socket_id);
+               numa_set_preferred(socket_id);
+       }
+       return have_numa;
+}
+
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask)
+{
+       RTE_LOG(DEBUG, EAL,
+               "Restoring previous memory policy: %d\n", *oldpolicy);
+       if (*oldpolicy == MPOL_DEFAULT) {
+               numa_set_localalloc();
+       } else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+                                oldmask->size + 1) < 0) {
+               RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+                       strerror(errno));
+               numa_set_localalloc();
+       }
+       numa_free_cpumask(oldmask);
+}
+#endif
+
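+/* Allocate a single hugepage of 'size' bytes at virtual address 'addr',
+ * backed by a file in the hugetlbfs dir described by 'hi', and fill in
+ * the memseg. Returns 0 on success, -1 on failure.
+ */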
+static int
+alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
+               struct hugepage_info *hi, unsigned list_idx, unsigned seg_idx)
+{
+       int cur_socket_id = 0;
+       uint64_t fa_offset;
+       char path[PATH_MAX];
+       int ret = 0;
+
+       if (internal_config.single_file_segments) {
+               eal_get_hugefile_path(path, sizeof(path), hi->hugedir,
+                               list_idx);
+       } else {
+               eal_get_hugefile_path(path, sizeof(path), hi->hugedir,
+                               list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+       }
+
+       /* try to create hugepage file */
+       int fd = open(path, O_CREAT | O_RDWR, 0600);
+       if (fd < 0) {
+               RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+                               strerror(errno));
+               goto fname;
+       }
+       if (internal_config.single_file_segments) {
+               fa_offset = seg_idx * size;
+               if (fallocate(fd, 0, fa_offset, size)) {
+                       RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+                               __func__, strerror(errno));
+                       goto opened;
+               }
+       } else {
+               if (ftruncate(fd, size) < 0) {
+                       RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+                               __func__, strerror(errno));
+                       goto opened;
+               }
+               fa_offset = 0;
+       }
+
+       /* map the segment and populate page tables; the kernel
+        * fills this segment with zeros.
+        */
+       void *va = mmap(addr, size, PROT_READ | PROT_WRITE,
+                       MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, fa_offset);
+       if (va == MAP_FAILED) {
+               RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+                       strerror(errno));
+               goto resized;
+       }
+       if (va != addr) {
+               RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+               goto mapped;
+       }
+
+       rte_iova_t iova = rte_mem_virt2iova(addr);
+       if (iova == RTE_BAD_PHYS_ADDR) {
+               RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+                       __func__);
+               goto mapped;
+       }
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+       if (cur_socket_id != socket_id) {
+               RTE_LOG(DEBUG, EAL,
+                               "%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+                       __func__, socket_id, cur_socket_id);
+               goto mapped;
+       }
+#endif
+
+       /* In Linux, hugetlb limitations such as cgroups are
+        * enforced at fault time rather than at mmap(), even
+        * with MAP_POPULATE; the kernel sends a SIGBUS signal
+        * in that case. To avoid being killed, save the stack
+        * environment here, so that if SIGBUS happens we can
+        * jump back to it.
+        */
+       if (huge_wrap_sigsetjmp()) {
+               RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+                       (unsigned)(size / 0x100000));
+               goto mapped;
+       }
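+       /* touch the page to force allocation now; this may raise SIGBUS */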
+       *(int *)addr = *(int *)addr;
+
+       close(fd);
+
+       ms->addr = addr;
+       ms->hugepage_sz = size;
+       ms->len = size;
+       ms->nchannel = rte_memory_get_nchannel();
+       ms->nrank = rte_memory_get_nrank();
+       ms->iova = iova;
+       ms->socket_id = socket_id;
+
+       goto out;
+
+mapped:
+       munmap(addr, size);
+resized:
+       if (internal_config.single_file_segments)
+               fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                               fa_offset, size);
+       else {
+               unlink(path);
+       }
+opened:
+       close(fd);
+fname:
+       /* anything but goto out is an error */
+       ret = -1;
+out:
+       return ret;
+}
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
+               uint64_t size, int socket, bool exact)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *msl = NULL;
+       void *addr;
+       unsigned msl_idx;
+       int cur_idx, next_idx, end_idx, i, ret = 0;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       bool have_numa;
+       int oldpolicy;
+       struct bitmask *oldmask = numa_allocate_nodemask();
+#endif
+       struct hugepage_info *hi = NULL;
+
+       /* dynamic allocation not supported in legacy mode */
+       if (internal_config.legacy_mem)
+               return -1;
+
+       for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+               if (size ==
+                               internal_config.hugepage_info[i].hugepage_sz) {
+                       hi = &internal_config.hugepage_info[i];
+                       break;
+               }
+       }
+       if (!hi) {
+               RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+                       __func__);
+               return -1;
+       }
+
+       /* find our memseg list */
+       for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+               struct rte_memseg_list *cur_msl = &mcfg->memsegs[msl_idx];
+
+               if (cur_msl->hugepage_sz != size)
+                       continue;
+               if (cur_msl->socket_id != socket)
+                       continue;
+               msl = cur_msl;
+               break;
+       }
+       if (!msl) {
+               RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+                       __func__);
+               return -1;
+       }
+
+       /* first, try finding space in already existing list */
+       cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);
+
+       if (cur_idx < 0) {
+               int old_len = msl->memseg_arr.len;
+               int space = 0;
+               int new_len = old_len;
+
+               /* grow new len until we can either fit n or can't grow */
+               while (new_len < msl->memseg_arr.capacity &&
+                               (space < n)) {
+                       new_len = RTE_MIN(new_len * 2, msl->memseg_arr.capacity);
+                       space = new_len - old_len;
+               }
+
+               /* check if we can expand the list */
+               if (old_len == new_len) {
+                       /* can't expand, the list is full */
+                       RTE_LOG(ERR, EAL, "%s(): no space in memseg list\n",
+                               __func__);
+                       return -1;
+               }
+
+               if (rte_fbarray_resize(&msl->memseg_arr, new_len)) {
+                       RTE_LOG(ERR, EAL, "%s(): can't resize memseg list\n",
+                               __func__);
+                       return -1;
+               }
+
+               /*
+                * we could conceivably end up with free space at the end of the
+                * list that wasn't enough to cover everything but can cover
+                * some of it, so start at (old_len - n) if possible.
+                */
+               next_idx = RTE_MAX(0, old_len - n);
+
+               cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr,
+                               next_idx, n);
+
+               if (cur_idx < 0) {
+                       /* still no space, bail out */
+                       RTE_LOG(ERR, EAL, "%s(): no space in memseg list\n",
+                               __func__);
+                       return -1;
+               }
+       }
+
+       end_idx = cur_idx + n;
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       have_numa = prepare_numa(&oldpolicy, oldmask, socket);
+#endif
+
+       for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
+               struct rte_memseg *cur;
+
+               cur = rte_fbarray_get(&msl->memseg_arr, cur_idx);
+               addr = RTE_PTR_ADD(msl->base_va,
+                               cur_idx * msl->hugepage_sz);
+
+               if (alloc_page(cur, addr, size, socket, hi, msl_idx, cur_idx)) {
+                       RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
+                               n, i);
+
+                       /* if exact number of pages wasn't requested, stop */
+                       if (!exact) {
+                               ret = i;
+                               goto restore_numa;
+                       }
+                       if (ms)
+                               memset(ms, 0, sizeof(struct rte_memseg *) * n);
+                       ret = -1;
+                       goto restore_numa;
+               }
+               if (ms)
+                       ms[i] = cur;
+
+               rte_fbarray_set_used(&msl->memseg_arr, cur_idx, true);
+       }
+       ret = n;
+
+restore_numa:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+       if (have_numa)
+               restore_numa(&oldpolicy, oldmask);
+#endif
+       return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket)
+{
+       struct rte_memseg *ms;
+
+       if (eal_memalloc_alloc_page_bulk(&ms, 1, size, socket, true) < 0)
+               return NULL;
+       return ms;
+}
-- 
2.7.4
