Based on Paul's feedback, I have simplified and cleaned up the
code quite a bit.

o  I have taken care of most of the nits, except for the output
   format change for cpusets with isolated children (an example of
   the current format follows this list).
o  Most of the documentation so far has been in my earlier mails;
   I have not yet added it to cpusets.txt.
o  I still haven't looked at the memory side of things.
o  Most of the changes are in the cpusets code and almost none
   in the sched code (I'll tackle that next week).
o  Hopefully my earlier mails regarding the design have clarified
   many of the questions that were raised.
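
   As an example of the output format in question: if a cpuset owns
   CPUs 0-7 and an isolated child holds CPUs 4-7, reading the parent's
   cpus file with this patch would presumably show something like

       0-3[4-7]

   i.e. cpuset_sprintf_isolist() appends the isolated map in square
   brackets after the regular cpus list. (CPU numbers here are just
   for illustration.)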

So here goes version 0.2

-rw-r--r--    1 root     root        16548 Apr 21 20:54 cpuset.o.orig
-rw-r--r--    1 root     root        17548 Apr 21 22:09 cpuset.o.sd-v0.2

  Around a 6% increase in the size of cpuset.o

 include/linux/init.h  |    2
 include/linux/sched.h |    1
 kernel/cpuset.c       |  153 +++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched.c        |  111 ++++++++++++++++++++++++------------
 4 files changed, 216 insertions(+), 51 deletions(-)
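
For anyone who wants to try this out, here is a rough userspace sketch
(not part of the patch) of how the new cpu_isolated flag is meant to be
exercised. It assumes the cpuset filesystem is mounted at /dev/cpuset as
described in Documentation/cpusets.txt and that CPUs 4-7 and node 0 are
available in the parent cpuset; error handling is mostly omitted. Note
that a cpuset has to be made cpu_exclusive before it can be made
cpu_isolated.

#include <fcntl.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static void write_file(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0)
		return;
	write(fd, val, strlen(val));
	close(fd);
}

int main(void)
{
	/* Create a child of the top cpuset, give it CPUs 4-7 and node 0 */
	mkdir("/dev/cpuset/isolated", 0755);
	write_file("/dev/cpuset/isolated/cpus", "4-7");
	write_file("/dev/cpuset/isolated/mems", "0");

	/* Per validate_change(), an isolated cpuset must be exclusive */
	write_file("/dev/cpuset/isolated/cpu_exclusive", "1");

	/* Flipping cpu_isolated is what triggers the sched domain rebuild */
	write_file("/dev/cpuset/isolated/cpu_isolated", "1");

	return 0;
}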


diff -Naurp linux-2.6.12-rc1-mm1.orig/include/linux/init.h linux-2.6.12-rc1-mm1/include/linux/init.h
--- linux-2.6.12-rc1-mm1.orig/include/linux/init.h      2005-03-18 07:03:49.000000000 +0530
+++ linux-2.6.12-rc1-mm1/include/linux/init.h   2005-04-21 21:54:06.000000000 +0530
@@ -217,7 +217,7 @@ void __init parse_early_param(void);
 #define __initdata_or_module __initdata
 #endif /*CONFIG_MODULES*/
 
-#ifdef CONFIG_HOTPLUG
+#if defined(CONFIG_HOTPLUG) || defined(CONFIG_CPUSETS)
 #define __devinit
 #define __devinitdata
 #define __devexit
diff -Naurp linux-2.6.12-rc1-mm1.orig/include/linux/sched.h linux-2.6.12-rc1-mm1/include/linux/sched.h
--- linux-2.6.12-rc1-mm1.orig/include/linux/sched.h     2005-04-21 21:50:26.000000000 +0530
+++ linux-2.6.12-rc1-mm1/include/linux/sched.h  2005-04-21 21:53:57.000000000 +0530
@@ -155,6 +155,7 @@ typedef struct task_struct task_t;
 extern void sched_init(void);
 extern void sched_init_smp(void);
 extern void init_idle(task_t *idle, int cpu);
+extern void rebuild_sched_domains(cpumask_t span1, cpumask_t span2);
 
 extern cpumask_t nohz_cpu_mask;
 
diff -Naurp linux-2.6.12-rc1-mm1.orig/kernel/cpuset.c linux-2.6.12-rc1-mm1/kernel/cpuset.c
--- linux-2.6.12-rc1-mm1.orig/kernel/cpuset.c   2005-04-21 21:50:26.000000000 +0530
+++ linux-2.6.12-rc1-mm1/kernel/cpuset.c        2005-04-21 22:00:36.000000000 +0530
@@ -57,7 +57,13 @@
 
 struct cpuset {
        unsigned long flags;            /* "unsigned long" so bitops work */
-       cpumask_t cpus_allowed;         /* CPUs allowed to tasks in cpuset */
+       /*
+        * CPUs allowed to tasks in this cpuset, excluding CPUs
+        * that belong to any isolated child
+        */
+       cpumask_t cpus_allowed;
+
+       cpumask_t isolated_map;         /* CPUs associated with isolated children */
        nodemask_t mems_allowed;        /* Memory Nodes allowed to tasks */
 
        atomic_t count;                 /* count tasks using this cpuset */
@@ -82,6 +88,7 @@ struct cpuset {
 /* bits in struct cpuset flags field */
 typedef enum {
        CS_CPU_EXCLUSIVE,
+       CS_CPU_ISOLATED,
        CS_MEM_EXCLUSIVE,
        CS_REMOVED,
        CS_NOTIFY_ON_RELEASE
@@ -93,6 +100,11 @@ static inline int is_cpu_exclusive(const
        return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_cpu_isolated(const struct cpuset *cs)
+{
+       return !!test_bit(CS_CPU_ISOLATED, &cs->flags);
+}
+
 static inline int is_mem_exclusive(const struct cpuset *cs)
 {
        return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
@@ -127,8 +139,10 @@ static inline int notify_on_release(cons
 static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
 
 static struct cpuset top_cpuset = {
-       .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
+       .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_CPU_ISOLATED) | 
+                 (1 << CS_MEM_EXCLUSIVE)),
        .cpus_allowed = CPU_MASK_ALL,
+       .isolated_map = CPU_MASK_NONE,
        .mems_allowed = NODE_MASK_ALL,
        .count = ATOMIC_INIT(0),
        .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
@@ -543,9 +557,14 @@ static void refresh_mems(void)
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-       return  cpus_subset(p->cpus_allowed, q->cpus_allowed) &&
+       cpumask_t all_map;
+
+       cpus_or(all_map, q->cpus_allowed, q->isolated_map);
+
+       return  cpus_subset(p->cpus_allowed, all_map) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
+               is_cpu_isolated(p) <= is_cpu_isolated(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
 }
 
@@ -587,6 +606,11 @@ static int validate_change(const struct 
        if (!is_cpuset_subset(trial, par))
                return -EACCES;
 
+       /* An isolated cpuset has to be exclusive */
+       if ((is_cpu_isolated(trial) && !is_cpu_exclusive(cur)) 
+          || (!is_cpu_exclusive(trial) && is_cpu_isolated(cur)))
+               return -EINVAL;
+
        /* If either I or some sibling (!= me) is exclusive, we can't overlap */
        list_for_each_entry(c, &par->children, sibling) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
@@ -602,9 +626,56 @@ static int validate_change(const struct 
        return 0;
 }
 
+static void update_cpu_domains(struct cpuset *cs, cpumask_t old_map)
+{
+       struct cpuset *par = cs->parent, t, old_parent;
+       cpumask_t all_map, span;
+
+       cpus_or(all_map, cs->cpus_allowed, cs->isolated_map);
+
+       /* If cpuset empty or top_cpuset, return */
+       if (cpus_empty(all_map) || par == NULL)
+               return;
+
+       t = old_parent = *par;
+       /* If cpuset no longer isolated, return cpus back to parent */
+       if (is_removed(cs) || (!is_cpu_isolated(cs))) {
+               cpus_or(t.cpus_allowed, t.cpus_allowed, cs->cpus_allowed);
+               cpus_andnot(t.isolated_map, t.isolated_map, cs->cpus_allowed);
+               span = CPU_MASK_NONE;
+       } else {
+               /* Are we removing CPUs from an isolated cpuset? */
+               if (cpus_subset(cs->cpus_allowed, old_map)) {
+                       cpus_or(t.cpus_allowed, par->cpus_allowed, old_map);
+                       cpus_andnot(t.isolated_map, par->isolated_map, old_map);
+               }
+               cpus_andnot(t.cpus_allowed, t.cpus_allowed, cs->cpus_allowed);
+               cpus_or(t.isolated_map, t.isolated_map, cs->cpus_allowed);
+               span = cs->cpus_allowed;
+       }
+
+       /* If no change in both cpus_allowed and isolated_map, just return */
+       if ((cpus_equal(t.cpus_allowed, old_parent.cpus_allowed)
+            && cpus_equal(t.isolated_map, old_parent.isolated_map)))
+               return;
+
+       /* Make the change */
+       par->cpus_allowed = t.cpus_allowed;
+       par->isolated_map = t.isolated_map;
+
+       /* If sched domain same as before, we are done */
+       if (cpus_equal(cs->cpus_allowed, old_parent.cpus_allowed))
+               return;
+
+       lock_cpu_hotplug();
+       rebuild_sched_domains(par->cpus_allowed, span);
+       unlock_cpu_hotplug();
+}
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
        struct cpuset trialcs;
+       cpumask_t old_map = cs->cpus_allowed;
        int retval;
 
        trialcs = *cs;
@@ -615,9 +686,21 @@ static int update_cpumask(struct cpuset 
        if (cpus_empty(trialcs.cpus_allowed))
                return -ENOSPC;
        retval = validate_change(cs, &trialcs);
-       if (retval == 0)
+       if (retval < 0)
+               return retval;
+       if (!is_cpu_isolated(cs)) {
                cs->cpus_allowed = trialcs.cpus_allowed;
-       return retval;
+               return 0;
+       }
+       /*
+        * If the current isolated cpuset has isolated children,
+        * disallow changes to its cpu mask
+        */
+       if (!cpus_empty(cs->isolated_map))
+               return -EBUSY;
+       cs->cpus_allowed = trialcs.cpus_allowed;
+       update_cpu_domains(cs, old_map);
+       return 0;
 }
 
 static int update_nodemask(struct cpuset *cs, char *buf)
@@ -652,25 +735,28 @@ static int update_nodemask(struct cpuset
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
        int turning_on;
-       struct cpuset trialcs;
+       struct cpuset trialcs, oldcs;
        int err;
 
        turning_on = (simple_strtoul(buf, NULL, 10) != 0);
 
-       trialcs = *cs;
+       trialcs = oldcs = *cs;
        if (turning_on)
                set_bit(bit, &trialcs.flags);
        else
                clear_bit(bit, &trialcs.flags);
 
        err = validate_change(cs, &trialcs);
-       if (err == 0) {
-               if (turning_on)
-                       set_bit(bit, &cs->flags);
-               else
-                       clear_bit(bit, &cs->flags);
-       }
-       return err;
+       if (err < 0)
+               return err;
+       if (turning_on)
+               set_bit(bit, &cs->flags);
+       else
+               clear_bit(bit, &cs->flags);
+
+       if (is_cpu_isolated(cs) != is_cpu_isolated(&oldcs))
+               update_cpu_domains(cs, cs->cpus_allowed);
+       return 0;
 }
 
 static int attach_task(struct cpuset *cs, char *buf)
@@ -735,6 +821,7 @@ typedef enum {
        FILE_CPULIST,
        FILE_MEMLIST,
        FILE_CPU_EXCLUSIVE,
+       FILE_CPU_ISOLATED,
        FILE_MEM_EXCLUSIVE,
        FILE_NOTIFY_ON_RELEASE,
        FILE_TASKLIST,
@@ -780,6 +867,9 @@ static ssize_t cpuset_common_file_write(
        case FILE_CPU_EXCLUSIVE:
                retval = update_flag(CS_CPU_EXCLUSIVE, cs, buffer);
                break;
+       case FILE_CPU_ISOLATED:
+               retval = update_flag(CS_CPU_ISOLATED, cs, buffer);
+               break;
        case FILE_MEM_EXCLUSIVE:
                retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
                break;
@@ -843,6 +933,26 @@ static int cpuset_sprintf_cpulist(char *
        return cpulist_scnprintf(page, PAGE_SIZE, mask);
 }
 
+static int cpuset_sprintf_isolist(char *page, struct cpuset *cs)
+{
+       cpumask_t mask = CPU_MASK_NONE;
+       char *tmp = page;
+
+       down(&cpuset_sem);
+       if (!cpus_empty(cs->isolated_map))
+               mask = cs->isolated_map;
+       up(&cpuset_sem);
+
+       if (cpus_empty(mask))
+               return 0;
+       
+       *tmp++ = '[';   
+       tmp += cpulist_scnprintf(tmp, PAGE_SIZE, mask);
+       *tmp++ = ']';
+
+       return (tmp-page);
+}
+
 static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
 {
        nodemask_t mask;
@@ -874,6 +984,7 @@ static ssize_t cpuset_common_file_read(s
        switch (type) {
        case FILE_CPULIST:
                s += cpuset_sprintf_cpulist(s, cs);
+               s += cpuset_sprintf_isolist(s, cs);
                break;
        case FILE_MEMLIST:
                s += cpuset_sprintf_memlist(s, cs);
@@ -881,6 +992,9 @@ static ssize_t cpuset_common_file_read(s
        case FILE_CPU_EXCLUSIVE:
                *s++ = is_cpu_exclusive(cs) ? '1' : '0';
                break;
+       case FILE_CPU_ISOLATED:
+               *s++ = is_cpu_isolated(cs) ? '1' : '0';
+               break;
        case FILE_MEM_EXCLUSIVE:
                *s++ = is_mem_exclusive(cs) ? '1' : '0';
                break;
@@ -1205,6 +1319,11 @@ static struct cftype cft_cpu_exclusive =
        .private = FILE_CPU_EXCLUSIVE,
 };
 
+static struct cftype cft_cpu_isolated = {
+       .name = "cpu_isolated",
+       .private = FILE_CPU_ISOLATED,
+};
+
 static struct cftype cft_mem_exclusive = {
        .name = "mem_exclusive",
        .private = FILE_MEM_EXCLUSIVE,
@@ -1225,6 +1344,8 @@ static int cpuset_populate_dir(struct de
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
                return err;
+       if ((err = cpuset_add_file(cs_dentry, &cft_cpu_isolated)) < 0)
+               return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
                return err;
        if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
@@ -1258,6 +1379,7 @@ static long cpuset_create(struct cpuset 
        if (notify_on_release(parent))
                set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
        cs->cpus_allowed = CPU_MASK_NONE;
+       cs->isolated_map = CPU_MASK_NONE;
        cs->mems_allowed = NODE_MASK_NONE;
        atomic_set(&cs->count, 0);
        INIT_LIST_HEAD(&cs->sibling);
@@ -1319,6 +1441,8 @@ static int cpuset_rmdir(struct inode *un
        spin_lock(&cs->dentry->d_lock);
        parent = cs->parent;
        set_bit(CS_REMOVED, &cs->flags);
+       if (is_cpu_isolated(cs))
+               update_cpu_domains(cs, cs->cpus_allowed);
        list_del(&cs->sibling); /* delete my sibling from parent->children */
        if (list_empty(&parent->children))
                check_for_release(parent);
@@ -1343,6 +1467,7 @@ int __init cpuset_init(void)
        int err;
 
        top_cpuset.cpus_allowed = CPU_MASK_ALL;
+       top_cpuset.isolated_map = CPU_MASK_NONE;
        top_cpuset.mems_allowed = NODE_MASK_ALL;
 
        atomic_inc(&cpuset_mems_generation);
diff -Naurp linux-2.6.12-rc1-mm1.orig/kernel/sched.c linux-2.6.12-rc1-mm1/kernel/sched.c
--- linux-2.6.12-rc1-mm1.orig/kernel/sched.c    2005-04-21 21:50:26.000000000 +0530
+++ linux-2.6.12-rc1-mm1/kernel/sched.c 2005-04-21 21:53:24.000000000 +0530
@@ -4895,40 +4895,41 @@ static void check_sibling_maps(void)
 }
 #endif
 
-/*
- * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
- */
-static void __devinit arch_init_sched_domains(void)
+static void attach_domains(cpumask_t cpu_map)
 {
        int i;
-       cpumask_t cpu_default_map;
 
-#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
-       check_sibling_maps();
+       /* Attach the domains */
+       for_each_cpu_mask(i, cpu_map) {
+               struct sched_domain *sd;
+#ifdef CONFIG_SCHED_SMT
+               sd = &per_cpu(cpu_domains, i);
+#else
+               sd = &per_cpu(phys_domains, i);
 #endif
-       /*
-        * Setup mask for cpus without special case scheduling requirements.
-        * For now this just excludes isolated cpus, but could be used to
-        * exclude other special cases in the future.
-        */
-       cpus_complement(cpu_default_map, cpu_isolated_map);
-       cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
+               cpu_attach_domain(sd, i);
+       }
+}
+
+static void build_sched_domains(cpumask_t cpu_map)
+{
+       int i;
 
        /*
-        * Set up domains. Isolated domains just stay on the dummy domain.
+        * Set up domains.
         */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, cpu_map) {
                int group;
                struct sched_domain *sd = NULL, *p;
                cpumask_t nodemask = node_to_cpumask(cpu_to_node(i));
 
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, cpu_map);
 
 #ifdef CONFIG_NUMA
                sd = &per_cpu(node_domains, i);
                group = cpu_to_node_group(i);
                *sd = SD_NODE_INIT;
-               sd->span = cpu_default_map;
+               sd->span = cpu_map;
                sd->groups = &sched_group_nodes[group];
 #endif
 
@@ -4946,7 +4947,7 @@ static void __devinit arch_init_sched_do
                group = cpu_to_cpu_group(i);
                *sd = SD_SIBLING_INIT;
                sd->span = cpu_sibling_map[i];
-               cpus_and(sd->span, sd->span, cpu_default_map);
+               cpus_and(sd->span, sd->span, cpu_map);
                sd->parent = p;
                sd->groups = &sched_group_cpus[group];
 #endif
@@ -4956,7 +4957,7 @@ static void __devinit arch_init_sched_do
        /* Set up CPU (sibling) groups */
        for_each_online_cpu(i) {
                cpumask_t this_sibling_map = cpu_sibling_map[i];
-               cpus_and(this_sibling_map, this_sibling_map, cpu_default_map);
+               cpus_and(this_sibling_map, this_sibling_map, cpu_map);
                if (i != first_cpu(this_sibling_map))
                        continue;
 
@@ -4969,7 +4970,7 @@ static void __devinit arch_init_sched_do
        for (i = 0; i < MAX_NUMNODES; i++) {
                cpumask_t nodemask = node_to_cpumask(i);
 
-               cpus_and(nodemask, nodemask, cpu_default_map);
+               cpus_and(nodemask, nodemask, cpu_map);
                if (cpus_empty(nodemask))
                        continue;
 
@@ -4979,12 +4980,12 @@ static void __devinit arch_init_sched_do
 
 #ifdef CONFIG_NUMA
        /* Set up node groups */
-       init_sched_build_groups(sched_group_nodes, cpu_default_map,
+       init_sched_build_groups(sched_group_nodes, cpu_map,
                                        &cpu_to_node_group);
 #endif
 
        /* Calculate CPU power for physical packages and nodes */
-       for_each_cpu_mask(i, cpu_default_map) {
+       for_each_cpu_mask(i, cpu_map) {
                int power;
                struct sched_domain *sd;
 #ifdef CONFIG_SCHED_SMT
@@ -5006,17 +5007,54 @@ static void __devinit arch_init_sched_do
                }
 #endif
        }
+}
 
-       /* Attach the domains */
-       for_each_online_cpu(i) {
-               struct sched_domain *sd;
-#ifdef CONFIG_SCHED_SMT
-               sd = &per_cpu(cpu_domains, i);
-#else
-               sd = &per_cpu(phys_domains, i);
+void rebuild_sched_domains(cpumask_t span1, cpumask_t span2)
+{
+       unsigned long flags;
+       cpumask_t change_map;
+       int i;
+
+       cpus_or(change_map, span1, span2);
+
+       local_irq_save(flags);
+
+       for_each_cpu_mask(i, change_map)
+               spin_lock(&cpu_rq(i)->lock);
+
+       if (!cpus_empty(span1))
+               build_sched_domains(span1);
+       if (!cpus_empty(span2))
+               build_sched_domains(span2);
+
+       for_each_cpu_mask(i, change_map)
+               spin_unlock(&cpu_rq(i)->lock);
+
+       attach_domains(change_map);
+
+       local_irq_restore(flags);
+}
+
+/*
+ * Set up scheduler domains and groups.  Callers must hold the hotplug lock.
+ */
+static void __devinit arch_init_sched_domains(void)
+{
+       cpumask_t cpu_default_map;
+
+#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA)
+       check_sibling_maps();
 #endif
-               cpu_attach_domain(sd, i);
-       }
+       /*
+        * Setup mask for cpus without special case scheduling requirements.
+        * For now this just excludes isolated cpus, but could be used to
+        * exclude other special cases in the future.
+        */
+       cpus_complement(cpu_default_map, cpu_isolated_map);
+       cpus_and(cpu_default_map, cpu_default_map, cpu_online_map);
+
+       build_sched_domains(cpu_default_map);
+       attach_domains(cpu_default_map);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5046,13 +5084,13 @@ static int update_sched_domains(struct n
                                unsigned long action, void *hcpu)
 {
        int i;
+       cpumask_t temp_map, hotcpu = cpumask_of_cpu((long)hcpu);
 
        switch (action) {
        case CPU_UP_PREPARE:
        case CPU_DOWN_PREPARE:
-               for_each_online_cpu(i)
-                       cpu_attach_domain(&sched_domain_dummy, i);
-               arch_destroy_sched_domains();
+               cpus_andnot(temp_map, cpu_online_map, hotcpu);
+               rebuild_sched_domains(temp_map, CPU_MASK_NONE);
                return NOTIFY_OK;
 
        case CPU_UP_CANCELED:
@@ -5068,7 +5106,8 @@ static int update_sched_domains(struct n
        }
 
        /* The hotplug lock is already held by cpu_up/cpu_down */
-       arch_init_sched_domains();
+       cpus_or(temp_map, cpu_online_map, hotcpu);
+       rebuild_sched_domains(temp_map, CPU_MASK_NONE);
 
        return NOTIFY_OK;
 }
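
To illustrate the new interface added to sched.c above (this fragment is
not part of the patch and the function name is made up):
rebuild_sched_domains() takes the two cpu spans that should each end up
as an independent sched domain partition, builds domains for each
non-empty span, and then attaches domains over their union. A
hypothetical in-kernel caller splitting an 8-way box into partitions
0-3 and 4-7 would look roughly like this:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/sched.h>

/* Hypothetical caller: split an 8-CPU box into partitions 0-3 and 4-7 */
static void example_split_domains(void)
{
	cpumask_t rest = CPU_MASK_NONE;
	cpumask_t isolated = CPU_MASK_NONE;
	int cpu;

	for (cpu = 0; cpu < 8; cpu++) {
		if (cpu < 4)
			cpu_set(cpu, rest);
		else
			cpu_set(cpu, isolated);
	}

	/* Same locking as update_cpu_domains() in the cpuset patch */
	lock_cpu_hotplug();
	rebuild_sched_domains(rest, isolated);
	unlock_cpu_hotplug();
}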
