Remove stop_machine from module unloading by replacing module_ref
with atomic_t. Note that this can cause a performance regression
on big-SMP machines because of direct memory access. On those machines,
you can lock down all modules. Since the lockdown skips reference
counting, it will be more scalable than per-cpu module_ref counters.

Signed-off-by: Masami Hiramatsu <[email protected]>
Cc: Rusty Russell <[email protected]>
---
 include/linux/module.h        |   16 ------
 include/trace/events/module.h |    2 -
 kernel/module.c               |  108 +++++++++++++++--------------------------
 3 files changed, 41 insertions(+), 85 deletions(-)

diff --git a/include/linux/module.h b/include/linux/module.h
index 670cb2e..3ebe049 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -211,20 +211,6 @@ enum module_state {
        MODULE_STATE_UNFORMED,  /* Still setting it up. */
 };
 
-/**
- * struct module_ref - per cpu module reference counts
- * @incs: number of module get on this cpu
- * @decs: number of module put on this cpu
- *
- * We force an alignment on 8 or 16 bytes, so that alloc_percpu()
- * put @incs/@decs in same cache line, with no extra memory cost,
- * since alloc_percpu() is fine grained.
- */
-struct module_ref {
-       unsigned long incs;
-       unsigned long decs;
-} __attribute((aligned(2 * sizeof(unsigned long))));
-
 struct module {
        enum module_state state;
 
@@ -368,7 +354,7 @@ struct module {
        /* Destruction function. */
        void (*exit)(void);
 
-       struct module_ref __percpu *refptr;
+       atomic_t refcnt;
 #endif
 
 #ifdef CONFIG_CONSTRUCTORS
diff --git a/include/trace/events/module.h b/include/trace/events/module.h
index 7c5cbfe..81c4c18 100644
--- a/include/trace/events/module.h
+++ b/include/trace/events/module.h
@@ -80,7 +80,7 @@ DECLARE_EVENT_CLASS(module_refcnt,
 
        TP_fast_assign(
                __entry->ip     = ip;
-               __entry->refcnt = __this_cpu_read(mod->refptr->incs) - __this_cpu_read(mod->refptr->decs);
+               __entry->refcnt = atomic_read(&mod->refcnt);
                __assign_str(name, mod->name);
        ),
 
diff --git a/kernel/module.c b/kernel/module.c
index 85ffc1d..7af6ff7 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -42,7 +42,6 @@
 #include <linux/vermagic.h>
 #include <linux/notifier.h>
 #include <linux/sched.h>
-#include <linux/stop_machine.h>
 #include <linux/device.h>
 #include <linux/string.h>
 #include <linux/mutex.h>
@@ -98,7 +97,7 @@
  * 1) List of modules (also safely readable with preempt_disable),
  * 2) module_use links,
  * 3) module_addr_min/module_addr_max.
- * (delete uses stop_machine/add uses RCU list operations). */
+ * (delete and add use RCU list operations). */
 DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
@@ -628,18 +627,26 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
 
 EXPORT_TRACEPOINT_SYMBOL(module_get);
 
+/*
+ * MODULE_REF_BASE must be 1, since we use atomic_inc_not_zero() for
+ * recovering refcnt (see try_release_module_ref() ).
+ */
+#define MODULE_REF_BASE        1
+
 /* Init the unload section of the module. */
 static int module_unload_init(struct module *mod)
 {
-       mod->refptr = alloc_percpu(struct module_ref);
-       if (!mod->refptr)
-               return -ENOMEM;
+       /*
+        * Initialize reference counter to MODULE_REF_BASE.
+        * refcnt == 0 means module is going.
+        */
+       atomic_set(&mod->refcnt, MODULE_REF_BASE);
 
        INIT_LIST_HEAD(&mod->source_list);
        INIT_LIST_HEAD(&mod->target_list);
 
        /* Hold reference count during initialization. */
-       raw_cpu_write(mod->refptr->incs, 1);
+       atomic_inc(&mod->refcnt);
 
        return 0;
 }
@@ -721,8 +728,6 @@ static void module_unload_free(struct module *mod)
                kfree(use);
        }
        mutex_unlock(&module_mutex);
-
-       free_percpu(mod->refptr);
 }
 
 #ifdef CONFIG_MODULE_FORCE_UNLOAD
@@ -740,60 +745,38 @@ static inline int try_force_unload(unsigned int flags)
 }
 #endif /* CONFIG_MODULE_FORCE_UNLOAD */
 
-struct stopref
+/* Try to release refcount of module, 0 means success. */
+static int try_release_module_ref(struct module *mod)
 {
-       struct module *mod;
-       int flags;
-       int *forced;
-};
+       int ret;
 
-/* Whole machine is stopped with interrupts off when this runs. */
-static int __try_stop_module(void *_sref)
-{
-       struct stopref *sref = _sref;
+       /* Try to decrement refcnt which we set at loading */
+       ret = atomic_sub_return(MODULE_REF_BASE, &mod->refcnt);
+       if (ret)
+               /* Someone can put this right now, recover with checking */
+               ret = atomic_inc_not_zero(&mod->refcnt);
+
+       return ret;
+}
 
+static int try_stop_module(struct module *mod, int flags, int *forced)
+{
        /* If it's not unused, quit unless we're forcing. */
-       if (module_is_locked(sref->mod) || module_refcount(sref->mod) != 0) {
-               if (!(*sref->forced = try_force_unload(sref->flags)))
+       if (module_is_locked(mod) || try_release_module_ref(mod) != 0) {
+               *forced = try_force_unload(flags);
+               if (!(*forced))
                        return -EWOULDBLOCK;
        }
 
        /* Mark it as dying. */
-       sref->mod->state = MODULE_STATE_GOING;
-       return 0;
-}
+       mod->state = MODULE_STATE_GOING;
 
-static int try_stop_module(struct module *mod, int flags, int *forced)
-{
-       struct stopref sref = { mod, flags, forced };
-
-       return stop_machine(__try_stop_module, &sref, NULL);
+       return 0;
 }
 
 unsigned long module_refcount(struct module *mod)
 {
-       unsigned long incs = 0, decs = 0;
-       int cpu;
-
-       for_each_possible_cpu(cpu)
-               decs += per_cpu_ptr(mod->refptr, cpu)->decs;
-       /*
-        * ensure the incs are added up after the decs.
-        * module_put ensures incs are visible before decs with smp_wmb.
-        *
-        * This 2-count scheme avoids the situation where the refcount
-        * for CPU0 is read, then CPU0 increments the module refcount,
-        * then CPU1 drops that refcount, then the refcount for CPU1 is
-        * read. We would record a decrement but not its corresponding
-        * increment so we would see a low count (disaster).
-        *
-        * Rare situation? But module_refcount can be preempted, and we
-        * might be tallying up 4096+ CPUs. So it is not impossible.
-        */
-       smp_rmb();
-       for_each_possible_cpu(cpu)
-               incs += per_cpu_ptr(mod->refptr, cpu)->incs;
-       return incs - decs;
+       return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE;
 }
 EXPORT_SYMBOL(module_refcount);
 
@@ -935,10 +918,8 @@ static struct module_attribute modinfo_refcnt =
 void __module_get(struct module *module)
 {
        if (module) {
-               preempt_disable();
-               __this_cpu_inc(module->refptr->incs);
+               atomic_inc(&module->refcnt);
                trace_module_get(module, _RET_IP_);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(__module_get);
@@ -947,21 +928,14 @@ bool try_module_get(struct module *module)
 {
        bool ret = true;
 
-       if (module) {
-               if (module_is_locked(module))
-                       goto end;
-
-               preempt_disable();
-
-               if (likely(module_is_live(module))) {
-                       __this_cpu_inc(module->refptr->incs);
+       if (module && !module_is_locked(module)) {
+               if (module_is_live(module) &&
+                   atomic_inc_not_zero(&module->refcnt) != 0)
                        trace_module_get(module, _RET_IP_);
-               } else
+               else
                        ret = false;
-
-               preempt_enable();
        }
-end:
+
        return ret;
 }
 EXPORT_SYMBOL(try_module_get);
@@ -969,12 +943,8 @@ EXPORT_SYMBOL(try_module_get);
 void module_put(struct module *module)
 {
        if (module && !module_is_locked(module)) {
-               preempt_disable();
-               smp_wmb(); /* see comment in module_refcount */
-               __this_cpu_inc(module->refptr->decs);
-
+               atomic_dec(&module->refcnt);
                trace_module_put(module, _RET_IP_);
-               preempt_enable();
        }
 }
 EXPORT_SYMBOL(module_put);


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to