This patch adds an option, --huge-trybest, which provides a recovery
mechanism for the case where fewer hugepages are actually available than
declared in sysfs. It relies on a memory access to fault in each hugepage;
if that access fails with SIGBUS, it recovers to the previously saved stack
environment with siglongjmp().

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4 --huge-trybest

Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
---
v2:
 - Address the compiling error by moving setjmp into a wrapper function.

 lib/librte_eal/common/eal_common_options.c |   4 ++
 lib/librte_eal/common/eal_internal_cfg.h   |   1 +
 lib/librte_eal/common/eal_options.h        |   2 +
 lib/librte_eal/linuxapp/eal/eal.c          |   1 +
 lib/librte_eal/linuxapp/eal/eal_memory.c   | 104 ++++++++++++++++++++++++++---
 5 files changed, 104 insertions(+), 8 deletions(-)

diff --git a/lib/librte_eal/common/eal_common_options.c 
b/lib/librte_eal/common/eal_common_options.c
index 29942ea..8ff6a2e 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@ eal_long_options[] = {
        {OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
        {OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
        {OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+       {OPT_HUGE_TRYBEST,      0, NULL, OPT_HUGE_TRYBEST_NUM     },
        {0,                     0, NULL, 0                        }
 };

@@ -896,6 +897,9 @@ eal_parse_common_option(int opt, const char *optarg,
                        return -1;
                }
                break;
+       case OPT_HUGE_TRYBEST_NUM:
+               internal_config.huge_trybest = 1;
+               break;

        /* don't know what to do, leave this to caller */
        default:
diff --git a/lib/librte_eal/common/eal_internal_cfg.h 
b/lib/librte_eal/common/eal_internal_cfg.h
index 5f1367e..90a3533 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -64,6 +64,7 @@ struct internal_config {
        volatile unsigned force_nchannel; /**< force number of channels */
        volatile unsigned force_nrank;    /**< force number of ranks */
        volatile unsigned no_hugetlbfs;   /**< true to disable hugetlbfs */
+       volatile unsigned huge_trybest;   /**< try best to allocate hugepages */
        unsigned hugepage_unlink;         /**< true to unlink backing files */
        volatile unsigned xen_dom0_support; /**< support app running on Xen 
Dom0*/
        volatile unsigned no_pci;         /**< true to disable PCI */
diff --git a/lib/librte_eal/common/eal_options.h 
b/lib/librte_eal/common/eal_options.h
index a881c62..02397c5 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@ enum {
        OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
        OPT_XEN_DOM0_NUM,
+#define OPT_HUGE_TRYBEST      "huge-trybest"
+       OPT_HUGE_TRYBEST_NUM,
        OPT_LONG_MAX_NUM
 };

diff --git a/lib/librte_eal/linuxapp/eal/eal.c 
b/lib/librte_eal/linuxapp/eal/eal.c
index ceac435..3e23877 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -343,6 +343,7 @@ eal_usage(const char *prgname)
               "  --"OPT_CREATE_UIO_DEV"    Create /dev/uioX (usually done by 
hotplug)\n"
               "  --"OPT_VFIO_INTR"         Interrupt mode for VFIO 
(legacy|msi|msix)\n"
               "  --"OPT_XEN_DOM0"          Support running on Xen dom0 without 
hugetlbfs\n"
+              "  --"OPT_HUGE_TRYBEST"      Try best to accommodate hugepages\n"
               "\n");
        /* Allow the application to print its usage message too if hook is set 
*/
        if ( rte_application_usage_hook ) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c 
b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..e4e1f3b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>

 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
        return addr;
 }

+static sigjmp_buf jmpenv;
+
+static void sigbus_handler(int signo __rte_unused)
+{
+       siglongjmp(jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling setjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int wrap_setjmp(void)
+{
+       return setjmp(jmpenv);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -396,7 +413,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
                if (fd < 0) {
                        RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
                                        strerror(errno));
-                       return -1;
+                       return i;
                }

                /* map the segment, and populate page tables,
@@ -407,7 +424,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
                        RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
                                        strerror(errno));
                        close(fd);
-                       return -1;
+                       return i;
                }

                if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
                        hugepg_tbl[i].final_va = virtaddr;
                }

+               if (orig && internal_config.huge_trybest) {
+                       /* In linux, hugetlb limitations, like cgroup, are
+                        * enforced at fault time instead of mmap(), even
+                        * with the option of MAP_POPULATE. Kernel will send
+                        * a SIGBUS signal. To avoid to be killed, save stack
+                        * environment here, if SIGBUS happens, we can jump
+                        * back here.
+                        */
+                       if (wrap_setjmp()) {
+                               RTE_LOG(ERR, EAL, "SIGBUS: Cannot mmap more "
+                                       "hugepages of size %u MB\n",
+                                       (unsigned)(hugepage_sz / 0x100000));
+                               munmap(virtaddr, hugepage_sz);
+                               close(fd);
+                               unlink(hugepg_tbl[i].filepath);
+                               return i;
+                       }
+                       *(int *)virtaddr = 0;
+               }
+
+
                /* set shared flock on the file. */
                if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
                        RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
                                __func__, strerror(errno));
                        close(fd);
-                       return -1;
+                       return i;
                }

                close(fd);
@@ -430,7 +468,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
                vma_addr = (char *)vma_addr + hugepage_sz;
                vma_len -= hugepage_sz;
        }
-       return 0;
+       return i;
 }

 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1074,33 @@ calc_num_pages_per_socket(uint64_t * memory,
        return total_num_pages;
 }

+static struct sigaction action_old;
+static int need_recover;
+
+static void
+register_sigbus(void)
+{
+       sigset_t mask;
+       struct sigaction action;
+
+       sigemptyset(&mask);
+       sigaddset(&mask, SIGBUS);
+       action.sa_flags = 0;
+       action.sa_mask = mask;
+       action.sa_handler = sigbus_handler;
+
+       need_recover = !sigaction(SIGBUS, &action, &action_old);
+}
+
+static void
+recover_sigbus(void)
+{
+       if (need_recover) {
+               sigaction(SIGBUS, &action_old, NULL);
+               need_recover = 0;
+       }
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1187,12 @@ rte_eal_hugepage_init(void)

        hp_offset = 0; /* where we start the current page size entries */

+       if (internal_config.huge_trybest)
+               register_sigbus();
+
        /* map all hugepages and sort them */
        for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+               int pages_old, pages_new;
                struct hugepage_info *hpi;

                /*
@@ -1137,10 +1206,24 @@ rte_eal_hugepage_init(void)
                        continue;

                /* map all hugepages available */
-               if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-                       RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-                                       (unsigned)(hpi->hugepage_sz / 
0x100000));
-                       goto fail;
+               pages_old = hpi->num_pages[0];
+               pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+               if (pages_new < pages_old) {
+                       RTE_LOG(DEBUG, EAL,
+                               "%d not %d hugepages of size %u MB allocated\n",
+                               pages_new, pages_old,
+                               (unsigned)(hpi->hugepage_sz / 0x100000));
+                       if (internal_config.huge_trybest) {
+                               int pages = pages_old - pages_new;
+
+                               internal_config.memory -=
+                                       hpi->hugepage_sz * pages;
+                               nr_hugepages -= pages;
+                               hpi->num_pages[0] = pages_new;
+                               if (pages_new == 0)
+                                       continue;
+                       } else
+                               goto fail;
                }

                /* find physical addresses and sockets for each hugepage */
@@ -1187,6 +1270,9 @@ rte_eal_hugepage_init(void)
 #endif
        }

+       if (internal_config.huge_trybest)
+               recover_sigbus();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
        nr_hugefiles = 0;
        for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1459,8 @@ rte_eal_hugepage_init(void)
        return 0;

 fail:
+       if (internal_config.huge_trybest)
+               recover_sigbus();
        free(tmp_hp);
        return -1;
 }
-- 
2.1.4

Reply via email to