Author: jeff
Date: Thu Mar 29 02:54:50 2018
New Revision: 331723
URL: https://svnweb.freebsd.org/changeset/base/331723

Log:
  Implement several enhancements to NUMA policies.
  
  Add a new "interleave" allocation policy which stripes pages across
  domains with a stride or width keeping contiguity within a multi-page
  region.
  
  Move the kernel to the dedicated numbered cpuset #2 making it possible
  to assign kernel threads and memory policy separately from user.  This
  also eliminates the need for the complicated interrupt binding code.
  
  Add a sysctl API for viewing and manipulating domainsets.  Refactor some
  of the cpuset_t manipulation code using the generic bitset type so that
  it can be used for both.  This probably belongs in a dedicated subr file.
  
  Attempt to improve the include situation.
  
  Reviewed by:  kib
  Discussed with:       jhb (cpuset parts)
  Tested by:    pho (before review feedback)
  Sponsored by: Netflix, Dell/EMC Isilon
  Differential Revision:        https://reviews.freebsd.org/D14839

Added:
  head/share/man/man9/domainset.9   (contents, props changed)
Modified:
  head/share/man/man9/Makefile
  head/sys/kern/kern_cpuset.c
  head/sys/kern/kern_kthread.c
  head/sys/sys/_bitset.h
  head/sys/sys/cpuset.h
  head/sys/sys/domainset.h
  head/sys/sys/proc.h
  head/sys/vm/vm_domainset.c
  head/sys/vm/vm_domainset.h
  head/sys/vm/vm_page.c
  head/sys/vm/vnode_pager.c
  head/usr.bin/cpuset/cpuset.1
  head/usr.bin/cpuset/cpuset.c

Modified: head/share/man/man9/Makefile
==============================================================================
--- head/share/man/man9/Makefile        Thu Mar 29 02:50:57 2018        
(r331722)
+++ head/share/man/man9/Makefile        Thu Mar 29 02:54:50 2018        
(r331723)
@@ -118,6 +118,7 @@ MAN=        accept_filter.9 \
        disk.9 \
        dnv.9 \
        domain.9 \
+       domainset.9 \
        dpcpu.9 \
        drbr.9 \
        driver.9 \

Added: head/share/man/man9/domainset.9
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/share/man/man9/domainset.9     Thu Mar 29 02:54:50 2018        
(r331723)
@@ -0,0 +1,128 @@
+.\" Copyright (c) 2018 Jeffrey Roberson <j...@freebsd.org>
+.\" All rights reserved.
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in the
+.\"    documentation and/or other materials provided with the distribution.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS''
+.\" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+.\" TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+.\" PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE
+.\" LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+.\" CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+.\" SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+.\" INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+.\" CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+.\" ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+.\" POSSIBILITY OF SUCH DAMAGE.
+.\"
+.\" $FreeBSD$
+.\"
+.Dd March 24, 2018
+.Dt DOMAINSET 9
+.Os
+.Sh NAME
+.Nm domainset(9)
+\(em
+.Nm domainset_create ,
+.Nm sysctl_handle_domainset .
+.Nd domainset functions and operation
+.Sh SYNOPSIS
+.In sys/_domainset.h
+.In sys/domainset.h
+.\"
+.Bd -literal -offset indent
+struct domainset {
+        domainset_t     ds_mask;
+        uint16_t        ds_policy;
+        domainid_t      ds_prefer;
+       ...
+};
+.Ed
+.Pp
+.Ft struct domainset *
+.Fn domainset_create "const struct domainset *key"
+.Ft int
+.Fn sysctl_handle_domainset "SYSCTL_HANDLER_ARGS"
+.Sh DESCRIPTION
+The
+.Nm
+API provides memory domain allocation policy for NUMA machines.
+Each
+.Vt domainset
+contains a bitmask of allowed domains, an integer policy, and an optional
+preferred domain.
+Together, these specify a search order for memory allocations as well as
+the ability to restrict threads and objects to a subset of available
+memory domains for system partitioning and resource management.
+.Pp
+Every thread in the system and optionally every
+.Vt vm_object_t ,
+which is used to represent files and other memory sources, has
+a reference to a
+.Vt struct domainset .
+The domainset associated with the object is consulted first and the system
+falls back to the thread policy if none exists.
+.Pp
+The allocation policy has the following possible values:
+.Bl -tag -width "foo"
+.It Dv DOMAINSET_POLICY_ROUNDROBIN
+Memory is allocated from each domain in the mask in a round-robin fashion.
+This distributes bandwidth evenly among available domains.
+This policy can specify a single domain for a fixed allocation.
+.It Dv DOMAINSET_POLICY_FIRSTTOUCH
+Memory is allocated from the node that it is first accessed on.
+Allocation falls back to round-robin if the current domain is not in the
+allowed set or is out of memory.
+This policy optimizes for locality but may give pessimal results if the
+memory is accessed from many CPUs that are not in the local domain.
+.It Dv DOMAINSET_POLICY_PREFER
+Memory is allocated from the node in the
+.Vt prefer
+member.  The preferred node must be set in the allowed mask.
+If the preferred node is out of memory the allocation falls back to
+round-robin among the allowed domains.
+.It Dv DOMAINSET_POLICY_INTERLEAVE
+Memory is allocated in a striped fashion with multiple pages
+allocated to each domain in the set according to the offset within
+the object.
+The stripe width is object dependent and may be as large as a
+super-page (2MB on amd64).
+This gives good distribution among memory domains while keeping system
+efficiency higher and is preferable to round-robin for general use.
+.El
+.Pp
+The
+.Fn domainset_create
+function takes a partially filled in domainset as a key and returns a
+valid domainset or NULL.
+It is critical that consumers not use domainsets that have not been
+returned by this function.
+A
+.Vt domainset
+is an immutable type that is shared among all matching keys and must
+not be modified after return.
+.Pp
+The
+.Fn sysctl_handle_domainset
+function is provided as a convenience for modifying or viewing domainsets
+that are not accessible via
+.Xr cpuset 2 .
+It is intended for use with 
+.Xr sysctl 9 .
+.Pp
+.Sh SEE ALSO
+.Xr cpuset 1 ,
+.Xr cpuset 2 ,
+.Xr cpuset_setdomain 2 ,
+.Xr bitset 9
+.Sh HISTORY
+.In sys/domainset.h
+first appeared in
+.Fx 12.0 .

Modified: head/sys/kern/kern_cpuset.c
==============================================================================
--- head/sys/kern/kern_cpuset.c Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/kern/kern_cpuset.c Thu Mar 29 02:54:50 2018        (r331723)
@@ -37,6 +37,8 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/ctype.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
@@ -63,9 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <vm/uma.h>
 #include <vm/vm.h>
 #include <vm/vm_object.h>
-#include <vm/vm_page.h>
-#include <vm/vm_param.h>
-#include <vm/vm_phys.h>
+#include <vm/vm_extern.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -112,13 +112,17 @@ __FBSDID("$FreeBSD$");
  * meaning 'curthread'.  It may query available cpus for that tid with a
  * getaffinity call using (CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, ...).
  */
+
+LIST_HEAD(domainlist, domainset);
+
 static uma_zone_t cpuset_zone;
 static uma_zone_t domainset_zone;
 static struct mtx cpuset_lock;
 static struct setlist cpuset_ids;
 static struct domainlist cpuset_domains;
 static struct unrhdr *cpuset_unr;
-static struct cpuset *cpuset_zero, *cpuset_default;
+static struct cpuset *cpuset_zero, *cpuset_default, *cpuset_kernel;
+static struct domainset domainset0, domainset2;
 
 /* Return the size of cpuset_t at the kernel level */
 SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD | CTLFLAG_CAPRD,
@@ -445,6 +449,7 @@ static struct domainset *
 _domainset_create(struct domainset *domain, struct domainlist *freelist)
 {
        struct domainset *ndomain;
+       int i, j, max;
 
        mtx_lock_spin(&cpuset_lock);
        LIST_FOREACH(ndomain, &cpuset_domains, ds_link)
@@ -457,7 +462,10 @@ _domainset_create(struct domainset *domain, struct dom
        if (ndomain == NULL) {
                LIST_INSERT_HEAD(&cpuset_domains, domain, ds_link);
                domain->ds_cnt = DOMAINSET_COUNT(&domain->ds_mask);
-               domain->ds_max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+               max = DOMAINSET_FLS(&domain->ds_mask) + 1;
+               for (i = 0, j = 0; i < max; i++)
+                       if (DOMAINSET_ISSET(i, &domain->ds_mask))
+                               domain->ds_order[j++] = i;
        }
        mtx_unlock_spin(&cpuset_lock);
        if (ndomain == NULL)
@@ -473,11 +481,24 @@ _domainset_create(struct domainset *domain, struct dom
 /*
  * Create or lookup a domainset based on the key held in 'domain'.
  */
-static struct domainset *
+struct domainset *
 domainset_create(const struct domainset *domain)
 {
        struct domainset *ndomain;
 
+       /*
+        * Validate the policy.  It must specify a useable policy number with
+        * only valid domains.  Preferred must include the preferred domain
+        * in the mask.
+        */
+       if (domain->ds_policy <= DOMAINSET_POLICY_INVALID ||
+           domain->ds_policy > DOMAINSET_POLICY_MAX)
+               return (NULL);
+       if (domain->ds_policy == DOMAINSET_POLICY_PREFER &&
+           !DOMAINSET_ISSET(domain->ds_prefer, &domain->ds_mask))
+               return (NULL);
+       if (!DOMAINSET_SUBSET(&domainset0.ds_mask, &domain->ds_mask))
+               return (NULL);
        ndomain = uma_zalloc(domainset_zone, M_WAITOK | M_ZERO);
        domainset_copy(domain, ndomain);
        return _domainset_create(ndomain, NULL);
@@ -507,7 +528,7 @@ domainset_notify(void)
                PROC_UNLOCK(p);
        }
        sx_sunlock(&allproc_lock);
-       kernel_object->domain.dr_policy = cpuset_default->cs_domain;
+       kernel_object->domain.dr_policy = cpuset_kernel->cs_domain;
 }
 
 /*
@@ -1128,6 +1149,55 @@ out:
        return (error);
 }
 
+static int
+bitset_strprint(char *buf, size_t bufsiz, const struct bitset *set, int setlen)
+{
+       size_t bytes;
+       int i, once;
+       char *p;
+
+       once = 0;
+       p = buf;
+       for (i = 0; i < __bitset_words(setlen); i++) {
+               if (once != 0) {
+                       if (bufsiz < 1)
+                               return (0);
+                       *p = ',';
+                       p++;
+                       bufsiz--;
+               } else
+                       once = 1;
+               if (bufsiz < sizeof(__STRING(ULONG_MAX)))
+                       return (0);
+               bytes = snprintf(p, bufsiz, "%lx", set->__bits[i]);
+               p += bytes;
+               bufsiz -= bytes;
+       }
+       return (p - buf);
+}
+
+static int
+bitset_strscan(struct bitset *set, int setlen, const char *buf)
+{
+       int i, ret;
+       const char *p;
+
+       BIT_ZERO(setlen, set);
+       p = buf;
+       for (i = 0; i < __bitset_words(setlen); i++) {
+               if (*p == ',') {
+                       p++;
+                       continue;
+               }
+               ret = sscanf(p, "%lx", &set->__bits[i]);
+               if (ret == 0 || ret == -1)
+                       break;
+               while (isxdigit(*p))
+                       p++;
+       }
+       return (p - buf);
+}
+
 /*
  * Return a string representing a valid layout for a cpuset_t object.
  * It expects an incoming buffer at least sized as CPUSETBUFSIZ.
@@ -1135,19 +1205,9 @@ out:
 char *
 cpusetobj_strprint(char *buf, const cpuset_t *set)
 {
-       char *tbuf;
-       size_t i, bytesp, bufsiz;
 
-       tbuf = buf;
-       bytesp = 0;
-       bufsiz = CPUSETBUFSIZ;
-
-       for (i = 0; i < (_NCPUWORDS - 1); i++) {
-               bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
-               bufsiz -= bytesp;
-               tbuf += bytesp;
-       }
-       snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
+       bitset_strprint(buf, CPUSETBUFSIZ, (const struct bitset *)set,
+           CPU_SETSIZE);
        return (buf);
 }
 
@@ -1158,37 +1218,71 @@ cpusetobj_strprint(char *buf, const cpuset_t *set)
 int
 cpusetobj_strscan(cpuset_t *set, const char *buf)
 {
-       u_int nwords;
-       int i, ret;
+       char p;
 
        if (strlen(buf) > CPUSETBUFSIZ - 1)
                return (-1);
 
-       /* Allow to pass a shorter version of the mask when necessary. */
-       nwords = 1;
-       for (i = 0; buf[i] != '\0'; i++)
-               if (buf[i] == ',')
-                       nwords++;
-       if (nwords > _NCPUWORDS)
+       p = buf[bitset_strscan((struct bitset *)set, CPU_SETSIZE, buf)];
+       if (p != '\0')
                return (-1);
 
-       CPU_ZERO(set);
-       for (i = 0; i < (nwords - 1); i++) {
-               ret = sscanf(buf, "%lx,", &set->__bits[i]);
-               if (ret == 0 || ret == -1)
-                       return (-1);
-               buf = strstr(buf, ",");
-               if (buf == NULL)
-                       return (-1);
-               buf++;
-       }
-       ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
-       if (ret == 0 || ret == -1)
-               return (-1);
        return (0);
 }
 
 /*
+ * Handle a domainset specifier in the sysctl tree.  A pointer to a pointer to
+ * a domainset is in arg1.  If the user specifies a valid domainset the
+ * pointer is updated.
+ *
+ * Format is:
+ * hex mask word 0,hex mask word 1,...:decimal policy:decimal preferred
+ */
+int
+sysctl_handle_domainset(SYSCTL_HANDLER_ARGS)
+{
+       char buf[DOMAINSETBUFSIZ];
+       struct domainset *dset;
+       struct domainset key;
+       int policy, prefer, error;
+       char *p;
+
+       dset = *(struct domainset **)arg1;
+       error = 0;
+
+       if (dset != NULL) {
+               p = buf + bitset_strprint(buf, DOMAINSETBUFSIZ,
+                   (const struct bitset *)&dset->ds_mask, DOMAINSET_SETSIZE);
+               sprintf(p, ":%d:%d", dset->ds_policy, dset->ds_prefer);
+       } else
+               sprintf(buf, "<NULL>");
+       error = sysctl_handle_string(oidp, buf, sizeof(buf), req);
+       if (error != 0 || req->newptr == NULL)
+               return (error);
+
+       /*
+        * Read in and validate the string.
+        */
+       memset(&key, 0, sizeof(key));
+       p = &buf[bitset_strscan((struct bitset *)&key.ds_mask,
+           DOMAINSET_SETSIZE, buf)];
+       if (p == buf)
+               return (EINVAL);
+       if (sscanf(p, ":%d:%d", &policy, &prefer) != 2)
+               return (EINVAL);
+       key.ds_policy = policy;
+       key.ds_prefer = prefer;
+
+       /* domainset_create() validates the policy. */
+       dset = domainset_create(&key);
+       if (dset == NULL)
+               return (EINVAL);
+       *(struct domainset **)arg1 = dset;
+
+       return (error);
+}
+
+/*
  * Apply an anonymous mask or a domain to a single thread.
  */
 static int
@@ -1239,95 +1333,19 @@ cpuset_setthread(lwpid_t id, cpuset_t *mask)
 int
 cpuset_setithread(lwpid_t id, int cpu)
 {
-       struct setlist cpusets;
-       struct cpuset *nset, *rset;
-       struct cpuset *parent, *old_set;
-       struct thread *td;
-       struct proc *p;
-       cpusetid_t cs_id;
        cpuset_t mask;
-       int error;
 
-       cpuset_freelist_init(&cpusets, 1);
-       rset = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
-       cs_id = CPUSET_INVALID;
-
        CPU_ZERO(&mask);
        if (cpu == NOCPU)
                CPU_COPY(cpuset_root, &mask);
        else
                CPU_SET(cpu, &mask);
-
-       error = cpuset_which(CPU_WHICH_TID, id, &p, &td, &old_set);
-       if (error != 0 || ((cs_id = alloc_unr(cpuset_unr)) == CPUSET_INVALID))
-               goto out;
-
-       /* cpuset_which() returns with PROC_LOCK held. */
-       old_set = td->td_cpuset;
-
-       if (cpu == NOCPU) {
-               nset = LIST_FIRST(&cpusets);
-               LIST_REMOVE(nset, cs_link);
-
-               /*
-                * roll back to default set. We're not using cpuset_shadow()
-                * here because we can fail CPU_SUBSET() check. This can happen
-                * if default set does not contain all CPUs.
-                */
-               error = _cpuset_create(nset, cpuset_default, &mask, NULL,
-                   CPUSET_INVALID);
-
-               goto applyset;
-       }
-
-       if (old_set->cs_id == 1 || (old_set->cs_id == CPUSET_INVALID &&
-           old_set->cs_parent->cs_id == 1)) {
-
-               /*
-                * Current set is either default (1) or
-                * shadowed version of default set.
-                *
-                * Allocate new root set to be able to shadow it
-                * with any mask.
-                */
-               error = _cpuset_create(rset, cpuset_zero,
-                   &cpuset_zero->cs_mask, NULL, cs_id);
-               if (error != 0) {
-                       PROC_UNLOCK(p);
-                       goto out;
-               }
-               rset->cs_flags |= CPU_SET_ROOT;
-               parent = rset;
-               rset = NULL;
-               cs_id = CPUSET_INVALID;
-       } else {
-               /* Assume existing set was already allocated by previous call */
-               parent = old_set;
-               old_set = NULL;
-       }
-
-       error = cpuset_shadow(parent, &nset, &mask, NULL, &cpusets, NULL);
-applyset:
-       if (error == 0) {
-               thread_lock(td);
-               old_set = cpuset_update_thread(td, nset);
-               thread_unlock(td);
-       } else
-               old_set = NULL;
-       PROC_UNLOCK(p);
-       if (old_set != NULL)
-               cpuset_rel(old_set);
-out:
-       cpuset_freelist_free(&cpusets);
-       if (rset != NULL)
-               uma_zfree(cpuset_zone, rset);
-       if (cs_id != CPUSET_INVALID)
-               free_unr(cpuset_unr, cs_id);
-       return (error);
+       return _cpuset_setthread(id, &mask, NULL);
 }
 
-static struct domainset domainset0;
-
+/*
+ * Create the domainset for cpuset 0, 1 and cpuset 2.
+ */
 void
 domainset_zero(void)
 {
@@ -1340,14 +1358,17 @@ domainset_zero(void)
        DOMAINSET_ZERO(&dset->ds_mask);
        for (i = 0; i < vm_ndomains; i++)
                DOMAINSET_SET(i, &dset->ds_mask);
-       dset->ds_policy = DOMAINSET_POLICY_ROUNDROBIN;
+       dset->ds_policy = DOMAINSET_POLICY_FIRSTTOUCH;
        dset->ds_prefer = -1;
        curthread->td_domain.dr_policy = _domainset_create(dset, NULL);
-       kernel_object->domain.dr_policy = curthread->td_domain.dr_policy;
+
+       domainset_copy(dset, &domainset2);
+       domainset2.ds_policy = DOMAINSET_POLICY_INTERLEAVE;
+       kernel_object->domain.dr_policy = _domainset_create(&domainset2, NULL);
 }
 
 /*
- * Creates system-wide cpusets and the cpuset for thread0 including two
+ * Creates system-wide cpusets and the cpuset for thread0 including three
  * sets:
  * 
  * 0 - The root set which should represent all valid processors in the
@@ -1357,6 +1378,8 @@ domainset_zero(void)
  * 1 - The default set which all processes are a member of until changed.
  *     This allows an administrator to move all threads off of given cpus to
  *     dedicate them to high priority tasks or save power etc.
+ * 2 - The kernel set which allows restriction and policy to be applied only
+ *     to kernel threads and the kernel_object.
  */
 struct cpuset *
 cpuset_thread0(void)
@@ -1366,12 +1389,12 @@ cpuset_thread0(void)
        int i;
 
        cpuset_zone = uma_zcreate("cpuset", sizeof(struct cpuset), NULL, NULL,
-           NULL, NULL, UMA_ALIGN_PTR, 0);
+           NULL, NULL, UMA_ALIGN_CACHE, 0);
        domainset_zone = uma_zcreate("domainset", sizeof(struct domainset),
-           NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+           NULL, NULL, NULL, NULL, UMA_ALIGN_CACHE, 0);
 
        /*
-        * Create the root system set for the whole machine.  Doesn't use
+        * Create the root system set (0) for the whole machine.  Doesn't use
         * cpuset_create() due to NULL parent.
         */
        set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
@@ -1385,12 +1408,20 @@ cpuset_thread0(void)
        cpuset_root = &set->cs_mask;
 
        /*
-        * Now derive a default, modifiable set from that to give out.
+        * Now derive a default (1), modifiable set from that to give out.
         */
        set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
        error = _cpuset_create(set, cpuset_zero, NULL, NULL, 1);
        KASSERT(error == 0, ("Error creating default set: %d\n", error));
        cpuset_default = set;
+       /*
+        * Create the kernel set (2).
+        */
+       set = uma_zalloc(cpuset_zone, M_WAITOK | M_ZERO);
+       error = _cpuset_create(set, cpuset_zero, NULL, NULL, 2);
+       KASSERT(error == 0, ("Error creating kernel set: %d\n", error));
+       set->cs_domain = &domainset2;
+       cpuset_kernel = set;
 
        /*
         * Initialize the unit allocator. 0 and 1 are allocated above.
@@ -1407,9 +1438,21 @@ cpuset_thread0(void)
        CPU_COPY(&all_cpus, &cpuset_domain[0]);
 domains_set:
 
-       return (set);
+       return (cpuset_default);
 }
 
+void
+cpuset_kernthread(struct thread *td)
+{
+       struct cpuset *set;
+
+       thread_lock(td);
+       set = td->td_cpuset;
+       td->td_cpuset = cpuset_ref(cpuset_kernel);
+       thread_unlock(td);
+       cpuset_rel(set);
+}
+
 /*
  * Create a cpuset, which would be cpuset_create() but
  * mark the new 'set' as root.
@@ -2108,7 +2151,7 @@ out:
 }
 
 #ifdef DDB
-BITSET_DEFINE(bitset, 1);
+
 static void
 ddb_display_bitset(const struct bitset *set, int size)
 {
@@ -2164,9 +2207,8 @@ DB_SHOW_COMMAND(domainsets, db_show_domainsets)
        struct domainset *set;
 
        LIST_FOREACH(set, &cpuset_domains, ds_link) {
-               db_printf("set=%p policy %d prefer %d cnt %d max %d\n",
-                   set, set->ds_policy, set->ds_prefer, set->ds_cnt,
-                   set->ds_max);
+               db_printf("set=%p policy %d prefer %d cnt %d\n",
+                   set, set->ds_policy, set->ds_prefer, set->ds_cnt);
                db_printf("  mask =");
                ddb_display_domainset(&set->ds_mask);
                db_printf("\n");

Modified: head/sys/kern/kern_kthread.c
==============================================================================
--- head/sys/kern/kern_kthread.c        Thu Mar 29 02:50:57 2018        
(r331722)
+++ head/sys/kern/kern_kthread.c        Thu Mar 29 02:54:50 2018        
(r331723)
@@ -131,7 +131,7 @@ kproc_create(void (*func)(void *), void *arg,
        cpu_fork_kthread_handler(td, func, arg);
 
        /* Avoid inheriting affinity from a random parent. */
-       cpuset_setthread(td->td_tid, cpuset_root);
+       cpuset_kernthread(td);
        thread_lock(td);
        TD_SET_CAN_RUN(td);
        sched_prio(td, PVM);
@@ -309,7 +309,7 @@ kthread_add(void (*func)(void *), void *arg, struct pr
        tidhash_add(newtd);
 
        /* Avoid inheriting affinity from a random parent. */
-       cpuset_setthread(newtd->td_tid, cpuset_root);
+       cpuset_kernthread(newtd);
 
        /* Delay putting it on the run queue until now. */
        if (!(flags & RFSTOPPED)) {

Modified: head/sys/sys/_bitset.h
==============================================================================
--- head/sys/sys/_bitset.h      Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/sys/_bitset.h      Thu Mar 29 02:54:50 2018        (r331723)
@@ -57,4 +57,10 @@ struct t {                                                   
        \
  */
 #define BITSET_DEFINE_VAR(t)   BITSET_DEFINE(t, 1)
 
+/*
+ * Define a default type that can be used while manually specifying size
+ * to every call.
+ */
+BITSET_DEFINE(bitset, 1);
+
 #endif /* !_SYS__BITSET_H_ */

Modified: head/sys/sys/cpuset.h
==============================================================================
--- head/sys/sys/cpuset.h       Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/sys/cpuset.h       Thu Mar 29 02:54:50 2018        (r331723)
@@ -139,6 +139,7 @@ int cpuset_create_root(struct prison *, struct cpuset 
 int    cpuset_setproc_update_set(struct proc *, struct cpuset *);
 int    cpuset_which(cpuwhich_t, id_t, struct proc **,
            struct thread **, struct cpuset **);
+void   cpuset_kernthread(struct thread *);
 
 char   *cpusetobj_strprint(char *, const cpuset_t *);
 int    cpusetobj_strscan(cpuset_t *, const char *);

Modified: head/sys/sys/domainset.h
==============================================================================
--- head/sys/sys/domainset.h    Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/sys/domainset.h    Thu Mar 29 02:54:50 2018        (r331723)
@@ -28,8 +28,8 @@
  * $FreeBSD$
  */
 
-#ifndef _SYS_DOMAINSETSET_H_
-#define        _SYS_DOMAINSETSET_H_
+#ifndef _SYS_DOMAINSET_H_
+#define        _SYS_DOMAINSET_H_
 
 #include <sys/_domainset.h>
 
@@ -38,8 +38,12 @@
 #define        _NDOMAINSETBITS                 _BITSET_BITS
 #define        _NDOMAINSETWORDS                
__bitset_words(DOMAINSET_SETSIZE)
 
-#define        DOMAINSETSETBUFSIZ      ((2 + sizeof(long) * 2) * 
_NDOMAINSETWORDS)
+#define        DOMAINSETBUFSIZ                                                 
\
+           (((2 + sizeof(long) * 2) * _NDOMAINSETWORDS) +              \
+           sizeof("::") + sizeof(__XSTRING(DOMAINSET_POLICY_MAX)) +    \
+           sizeof(__XSTRING(MAXMEMDOM)))
 
+
 #define        DOMAINSET_CLR(n, p)             BIT_CLR(DOMAINSET_SETSIZE, n, p)
 #define        DOMAINSET_COPY(f, t)            BIT_COPY(DOMAINSET_SETSIZE, f, 
t)
 #define        DOMAINSET_ISSET(n, p)           BIT_ISSET(DOMAINSET_SETSIZE, n, 
p)
@@ -73,23 +77,37 @@
 #define        DOMAINSET_POLICY_ROUNDROBIN     1
 #define        DOMAINSET_POLICY_FIRSTTOUCH     2
 #define        DOMAINSET_POLICY_PREFER         3
-#define        DOMAINSET_POLICY_MAX            DOMAINSET_POLICY_PREFER
+#define        DOMAINSET_POLICY_INTERLEAVE     4
+#define        DOMAINSET_POLICY_MAX            DOMAINSET_POLICY_INTERLEAVE
 
 #ifdef _KERNEL
-#include <sys/queue.h>
-LIST_HEAD(domainlist, domainset);
+#if MAXMEMDOM < 256
+typedef        uint8_t         domainid_t;
+#else
+typedef uint16_t       domainid_t;
+#endif
 
 struct domainset {
        LIST_ENTRY(domainset)   ds_link;
        domainset_t     ds_mask;        /* allowed domains. */
        uint16_t        ds_policy;      /* Policy type. */
-       int16_t         ds_prefer;      /* Preferred domain or -1. */
-       uint16_t        ds_cnt;         /* popcnt from above. */
-       uint16_t        ds_max;         /* Maximum domain in set. */
+       domainid_t      ds_prefer;      /* Preferred domain or -1. */
+       domainid_t      ds_cnt;         /* popcnt from above. */
+       domainid_t      ds_order[MAXMEMDOM];  /* nth domain table. */
 };
 
 void domainset_zero(void);
 
+/*
+ * Add a domainset to the system based on a key initializing policy, prefer,
+ * and mask.  Do not create and directly use domainset structures.  The
+ * returned value will not match the key pointer.
+ */
+struct domainset *domainset_create(const struct domainset *);
+#ifdef _SYS_SYSCTL_H_
+int sysctl_handle_domainset(SYSCTL_HANDLER_ARGS);
+#endif
+
 #else
 __BEGIN_DECLS
 int    cpuset_getdomain(cpulevel_t, cpuwhich_t, id_t, size_t, domainset_t *,
@@ -99,4 +117,4 @@ int  cpuset_setdomain(cpulevel_t, cpuwhich_t, id_t, siz
 
 __END_DECLS
 #endif
-#endif /* !_SYS_DOMAINSETSET_H_ */
+#endif /* !_SYS_DOMAINSET_H_ */

Modified: head/sys/sys/proc.h
==============================================================================
--- head/sys/sys/proc.h Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/sys/proc.h Thu Mar 29 02:54:50 2018        (r331723)
@@ -67,7 +67,7 @@
 #include <sys/ucontext.h>
 #include <sys/ucred.h>
 #include <sys/types.h>
-#include <sys/domainset.h>
+#include <sys/_domainset.h>
 
 #include <machine/proc.h>              /* Machine-dependent proc substruct. */
 #ifdef _KERNEL

Modified: head/sys/vm/vm_domainset.c
==============================================================================
--- head/sys/vm/vm_domainset.c  Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/vm/vm_domainset.c  Thu Mar 29 02:54:50 2018        (r331723)
@@ -56,11 +56,14 @@ __FBSDID("$FreeBSD$");
  * assumed that most allocations are successful.
  */
 
+static int vm_domainset_default_stride = 64;
+
 /*
  * Determine which policy is to be used for this allocation.
  */
 static void
-vm_domainset_iter_domain(struct vm_domainset_iter *di, struct vm_object *obj)
+vm_domainset_iter_init(struct vm_domainset_iter *di, struct vm_object *obj,
+    vm_pindex_t pindex)
 {
        struct domainset *domain;
 
@@ -76,18 +79,33 @@ vm_domainset_iter_domain(struct vm_domainset_iter *di,
                di->di_domain = curthread->td_domain.dr_policy;
                di->di_iter = &curthread->td_domain.dr_iterator;
        }
+       di->di_policy = di->di_domain->ds_policy;
+       if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE) {
+               if (vm_object_reserv(obj)) {
+                       /*
+                        * Color the pindex so we end up on the correct
+                        * reservation boundary.
+                        */
+                       pindex += obj->pg_color;
+                       pindex >>= VM_LEVEL_0_ORDER;
+               } else
+                       pindex /= vm_domainset_default_stride;
+               /*
+                * Offset pindex so the first page of each object does
+                * not end up in domain 0.
+                */
+               if (obj != NULL)
+                       pindex += (((uintptr_t)obj) / sizeof(*obj));
+               di->di_offset = pindex;
+       }
 }
 
 static void
 vm_domainset_iter_rr(struct vm_domainset_iter *di, int *domain)
 {
-       int d;
 
-       d = *di->di_iter;
-       do {
-               d = (d + 1) % di->di_domain->ds_max;
-       } while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask));
-       *di->di_iter = *domain = d;
+       *domain = di->di_domain->ds_order[
+           ++(*di->di_iter) % di->di_domain->ds_cnt];
 }
 
 static void
@@ -95,27 +113,38 @@ vm_domainset_iter_prefer(struct vm_domainset_iter *di,
 {
        int d;
 
-       d = *di->di_iter;
        do {
-               d = (d + 1) % di->di_domain->ds_max;
-       } while (!DOMAINSET_ISSET(d, &di->di_domain->ds_mask) || 
-           d == di->di_domain->ds_prefer);
-       *di->di_iter = *domain = d;
+               d = di->di_domain->ds_order[
+                   ++(*di->di_iter) % di->di_domain->ds_cnt];
+       } while (d == di->di_domain->ds_prefer);
+       *domain = d;
 }
 
 static void
+vm_domainset_iter_interleave(struct vm_domainset_iter *di, int *domain)
+{
+       int d;
+
+       d = di->di_offset % di->di_domain->ds_cnt;
+       *di->di_iter = d;
+       *domain = di->di_domain->ds_order[d];
+}
+
+static void
 vm_domainset_iter_next(struct vm_domainset_iter *di, int *domain)
 {
 
        KASSERT(di->di_n > 0,
            ("vm_domainset_iter_first: Invalid n %d", di->di_n));
-       switch (di->di_domain->ds_policy) {
+       switch (di->di_policy) {
        case DOMAINSET_POLICY_FIRSTTOUCH:
                /*
                 * To prevent impossible allocations we convert an invalid
                 * first-touch to round-robin.
                 */
                /* FALLTHROUGH */
+       case DOMAINSET_POLICY_INTERLEAVE:
+               /* FALLTHROUGH */
        case DOMAINSET_POLICY_ROUNDROBIN:
                vm_domainset_iter_rr(di, domain);
                break;
@@ -124,7 +153,7 @@ vm_domainset_iter_next(struct vm_domainset_iter *di, i
                break;
        default:
                panic("vm_domainset_iter_first: Unknown policy %d",
-                   di->di_domain->ds_policy);
+                   di->di_policy);
        }
        KASSERT(*domain < vm_ndomains,
            ("vm_domainset_iter_next: Invalid domain %d", *domain));
@@ -134,11 +163,15 @@ static void
 vm_domainset_iter_first(struct vm_domainset_iter *di, int *domain)
 {
 
-       switch (di->di_domain->ds_policy) {
+       switch (di->di_policy) {
        case DOMAINSET_POLICY_FIRSTTOUCH:
                *domain = PCPU_GET(domain);
                if (DOMAINSET_ISSET(*domain, &di->di_domain->ds_mask)) {
-                       di->di_n = 1;
+                       /*
+                        * Add an extra iteration because we will visit the
+                        * current domain a second time in the rr iterator.
+                        */
+                       di->di_n = di->di_domain->ds_cnt + 1;
                        break;
                }
                /*
@@ -154,9 +187,13 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, 
                *domain = di->di_domain->ds_prefer;
                di->di_n = di->di_domain->ds_cnt;
                break;
+       case DOMAINSET_POLICY_INTERLEAVE:
+               vm_domainset_iter_interleave(di, domain);
+               di->di_n = di->di_domain->ds_cnt;
+               break;
        default:
                panic("vm_domainset_iter_first: Unknown policy %d",
-                   di->di_domain->ds_policy);
+                   di->di_policy);
        }
        KASSERT(di->di_n > 0,
            ("vm_domainset_iter_first: Invalid n %d", di->di_n));
@@ -166,10 +203,10 @@ vm_domainset_iter_first(struct vm_domainset_iter *di, 
 
 void
 vm_domainset_iter_page_init(struct vm_domainset_iter *di, struct vm_object *obj,
-    int *domain, int *req)
+    vm_pindex_t pindex, int *domain, int *req)
 {
 
-       vm_domainset_iter_domain(di, obj);
+       vm_domainset_iter_init(di, obj, pindex);
        di->di_flags = *req;
        *req = (di->di_flags & ~(VM_ALLOC_WAITOK | VM_ALLOC_WAITFAIL)) |
            VM_ALLOC_NOWAIT;
@@ -213,7 +250,9 @@ vm_domainset_iter_malloc_init(struct vm_domainset_iter
     struct vm_object *obj, int *domain, int *flags)
 {
 
-       vm_domainset_iter_domain(di, obj);
+       vm_domainset_iter_init(di, obj, 0);
+       if (di->di_policy == DOMAINSET_POLICY_INTERLEAVE)
+               di->di_policy = DOMAINSET_POLICY_ROUNDROBIN;
        di->di_flags = *flags;
        *flags = (di->di_flags & ~M_WAITOK) | M_NOWAIT;
        vm_domainset_iter_first(di, domain);
@@ -253,7 +292,7 @@ vm_domainset_iter_page(struct vm_domainset_iter *di, i
 
 void
 vm_domainset_iter_page_init(struct vm_domainset_iter *di,
-            struct vm_object *obj, int *domain, int *flags)
+            struct vm_object *obj, vm_pindex_t pindex, int *domain, int *flags)
 {
 
        *domain = 0;

Modified: head/sys/vm/vm_domainset.h
==============================================================================
--- head/sys/vm/vm_domainset.h  Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/vm/vm_domainset.h  Thu Mar 29 02:54:50 2018        (r331723)
@@ -33,13 +33,15 @@
 struct vm_domainset_iter {
        struct domainset        *di_domain;
        int                     *di_iter;
+       vm_pindex_t             di_offset;
+       int                     di_policy;
        int                     di_flags;
        int                     di_n;
 };
 
 int    vm_domainset_iter_page(struct vm_domainset_iter *, int *, int *);
 void   vm_domainset_iter_page_init(struct vm_domainset_iter *,
-           struct vm_object *, int *, int *);
+           struct vm_object *, vm_pindex_t, int *, int *);
 int    vm_domainset_iter_malloc(struct vm_domainset_iter *, int *, int *);
 void   vm_domainset_iter_malloc_init(struct vm_domainset_iter *,
            struct vm_object *, int *, int *);

Modified: head/sys/vm/vm_page.c
==============================================================================
--- head/sys/vm/vm_page.c       Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/vm/vm_page.c       Thu Mar 29 02:54:50 2018        (r331723)
@@ -1660,7 +1660,7 @@ vm_page_alloc_after(vm_object_t object, vm_pindex_t pi
        vm_page_t m;
        int domain;
 
-       vm_domainset_iter_page_init(&di, object, &domain, &req);
+       vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
        do {
                m = vm_page_alloc_domain_after(object, pindex, domain, req,
                    mpred);
@@ -1893,7 +1893,7 @@ vm_page_alloc_contig(vm_object_t object, vm_pindex_t p
        vm_page_t m;
        int domain;
 
-       vm_domainset_iter_page_init(&di, object, &domain, &req);
+       vm_domainset_iter_page_init(&di, object, pindex, &domain, &req);
        do {
                m = vm_page_alloc_contig_domain(object, pindex, domain, req,
                    npages, low, high, alignment, boundary, memattr);
@@ -2092,7 +2092,7 @@ vm_page_alloc_freelist(int freelist, int req)
        vm_page_t m;
        int domain;
 
-       vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+       vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
        do {
                m = vm_page_alloc_freelist_domain(domain, freelist, req);
                if (m != NULL)
@@ -2691,7 +2691,7 @@ vm_page_reclaim_contig(int req, u_long npages, vm_padd
        int domain;
        bool ret;
 
-       vm_domainset_iter_page_init(&di, kernel_object, &domain, &req);
+       vm_domainset_iter_page_init(&di, NULL, 0, &domain, &req);
        do {
                ret = vm_page_reclaim_contig_domain(domain, req, npages, low,
                    high, alignment, boundary);

Modified: head/sys/vm/vnode_pager.c
==============================================================================
--- head/sys/vm/vnode_pager.c   Thu Mar 29 02:50:57 2018        (r331722)
+++ head/sys/vm/vnode_pager.c   Thu Mar 29 02:54:50 2018        (r331723)
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/sysctl.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
 #include <sys/mount.h>
@@ -69,6 +70,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/conf.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to