On Wed, Mar 01, 2017 at 01:28:43PM +0100, Peter Zijlstra wrote:
> On Wed, Mar 01, 2017 at 02:43:23PM +0900, Byungchul Park wrote:

> > It's an optimization, but very essential and important optimization.

Since its not for correctness, please put it in a separate patch with a
good Changelog, the below would make a good beginning of that.

Also, I feel, the source comments can be improved.

> >           in hlocks[]
> >           ------------
> >           A gen_id (4) --+
> >                          | previous gen_id
> >           B gen_id (3) <-+
> >           C gen_id (3)
> >           D gen_id (2)
> > oldest -> E gen_id (1)
> > 
> >           in xhlocks[]
> >           ------------
> >        ^  A gen_id (4) prev_gen_id (3: B's gen id)
> >        |  B gen_id (3) prev_gen_id (3: C's gen id)
> >        |  C gen_id (3) prev_gen_id (2: D's gen id)
> >        |  D gen_id (2) prev_gen_id (1: E's gen id)
> >        |  E gen_id (1) prev_gen_id (NA)
> > 
> > Let's consider the case that the gen id of xlock to commit is 3.
> > 
> > In this case, it's engough to generate 'the xlock -> C'. 'the xlock -> B'
> > and 'the xlock -> A' are unnecessary since it's covered by 'C -> B' and
> > 'B -> A' which are already generated by original lockdep.
> > 
> > I use the prev_gen_id to avoid adding this kind of redundant
> > dependencies. In other words, xhlock->prev_gen_id >= xlock->hlock.gen_id
> > means that the previous lock in hlocks[] is able to handle the
> > dependency on its commit stage.
> > 
> 
> Aah, I completely missed it was against held_locks.
> 
> Hurm.. it feels like this is solving a problem we shouldn't be solving
> though.
> 
> That is, ideally we'd already be able to (quickly) tell if a relation
> exists or not, but given how the whole chain_hash stuff is build now, it
> looks like we cannot.
> 
> 
> Let me think about this a bit more.

OK, so neither this nor the chain-hash completely avoid redundant
dependencies. The only way to do that is doing graph-walks for every
proposed link.

Now, we already do a ton of __bfs() walks in check_prev_add(), so one
more might not hurt too much [*].

Esp. with the chain-hash avoiding all the obvious duplicate work, this
might just work.

diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1..7baea89 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -1860,6 +1860,17 @@ check_prev_add(struct task_struct *curr, struct 
held_lock *prev,
                }
        }
 
+       /*
+        * Is the <prev> -> <next> redundant?
+        */
+       this.class = hlock_class(prev);
+       this.parent = NULL;
+       ret = check_noncircular(&this, hlock_class(next), &target_entry);
+       if (!ret) /* exists, redundant */
+               return 2;
+       if (ret < 0)
+               return print_bfs_bug(ret);
+
        if (!*stack_saved) {
                if (!save_trace(&trace))
                        return 0;



[*] A while ago someone, and I cannot find the email just now, asked if
we could not implement the RECLAIM_FS inversion stuff with a 'fake' lock
like we use for other things like workqueues etc. I think this should be
possible which allows reducing the 'irq' states and will reduce the
amount of __bfs() lookups we do.

Removing the 1 IRQ state, would result in 4 less __bfs() walks if I'm
not mistaken, more than making up for the 1 we'd have to add to detect
redundant links.


 include/linux/lockdep.h         | 11 +-----
 include/linux/sched.h           |  1 -
 kernel/locking/lockdep.c        | 87 +----------------------------------------
 kernel/locking/lockdep_states.h |  1 -
 mm/internal.h                   | 40 +++++++++++++++++++
 mm/page_alloc.c                 | 13 ++++--
 mm/slab.h                       |  7 +++-
 mm/slob.c                       |  8 +++-
 mm/vmscan.c                     | 13 +++---
 9 files changed, 71 insertions(+), 110 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 1e327bb..6ba1a65 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -29,7 +29,7 @@ extern int lock_stat;
  * We'd rather not expose kernel/lockdep_states.h this wide, but we do need
  * the total number of states... :-(
  */
-#define XXX_LOCK_USAGE_STATES          (1+3*4)
+#define XXX_LOCK_USAGE_STATES          (1+2*4)
 
 /*
  * NR_LOCKDEP_CACHING_CLASSES ... Number of classes
@@ -361,10 +361,6 @@ static inline void lock_set_subclass(struct lockdep_map 
*lock,
        lock_set_class(lock, lock->name, lock->key, subclass, ip);
 }
 
-extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
-extern void lockdep_clear_current_reclaim_state(void);
-extern void lockdep_trace_alloc(gfp_t mask);
-
 struct pin_cookie { unsigned int val; };
 
 #define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
@@ -373,7 +369,7 @@ extern struct pin_cookie lock_pin_lock(struct lockdep_map 
*lock);
 extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
 extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
-# define INIT_LOCKDEP                          .lockdep_recursion = 0, 
.lockdep_reclaim_gfp = 0,
+# define INIT_LOCKDEP                          .lockdep_recursion = 0,
 
 #define lockdep_depth(tsk)     (debug_locks ? (tsk)->lockdep_depth : 0)
 
@@ -413,9 +409,6 @@ static inline void lockdep_on(void)
 # define lock_release(l, n, i)                 do { } while (0)
 # define lock_set_class(l, n, k, s, i)         do { } while (0)
 # define lock_set_subclass(l, s, i)            do { } while (0)
-# define lockdep_set_current_reclaim_state(g)  do { } while (0)
-# define lockdep_clear_current_reclaim_state() do { } while (0)
-# define lockdep_trace_alloc(g)                        do { } while (0)
 # define lockdep_info()                                do { } while (0)
 # define lockdep_init_map(lock, name, key, sub) \
                do { (void)(name); (void)(key); } while (0)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..0fa8a8f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -806,7 +806,6 @@ struct task_struct {
        int                             lockdep_depth;
        unsigned int                    lockdep_recursion;
        struct held_lock                held_locks[MAX_LOCK_DEPTH];
-       gfp_t                           lockdep_reclaim_gfp;
 #endif
 
 #ifdef CONFIG_UBSAN
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index a95e5d1..1051600 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -343,14 +343,12 @@ EXPORT_SYMBOL(lockdep_on);
 #if VERBOSE
 # define HARDIRQ_VERBOSE       1
 # define SOFTIRQ_VERBOSE       1
-# define RECLAIM_VERBOSE       1
 #else
 # define HARDIRQ_VERBOSE       0
 # define SOFTIRQ_VERBOSE       0
-# define RECLAIM_VERBOSE       0
 #endif
 
-#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE || RECLAIM_VERBOSE
+#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
 /*
  * Quick filtering for interesting events:
  */
@@ -2553,14 +2551,6 @@ static int SOFTIRQ_verbose(struct lock_class *class)
        return 0;
 }
 
-static int RECLAIM_FS_verbose(struct lock_class *class)
-{
-#if RECLAIM_VERBOSE
-       return class_filter(class);
-#endif
-       return 0;
-}
-
 #define STRICT_READ_CHECKS     1
 
 static int (*state_verbose_f[])(struct lock_class *class) = {
@@ -2856,51 +2846,6 @@ void trace_softirqs_off(unsigned long ip)
                debug_atomic_inc(redundant_softirqs_off);
 }
 
-static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
-{
-       struct task_struct *curr = current;
-
-       if (unlikely(!debug_locks))
-               return;
-
-       /* no reclaim without waiting on it */
-       if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
-               return;
-
-       /* this guy won't enter reclaim */
-       if ((curr->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
-               return;
-
-       /* We're only interested __GFP_FS allocations for now */
-       if (!(gfp_mask & __GFP_FS))
-               return;
-
-       /*
-        * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
-        */
-       if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
-               return;
-
-       mark_held_locks(curr, RECLAIM_FS);
-}
-
-static void check_flags(unsigned long flags);
-
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-       unsigned long flags;
-
-       if (unlikely(current->lockdep_recursion))
-               return;
-
-       raw_local_irq_save(flags);
-       check_flags(flags);
-       current->lockdep_recursion = 1;
-       __lockdep_trace_alloc(gfp_mask, flags);
-       current->lockdep_recursion = 0;
-       raw_local_irq_restore(flags);
-}
-
 static int mark_irqflags(struct task_struct *curr, struct held_lock *hlock)
 {
        /*
@@ -2946,22 +2891,6 @@ static int mark_irqflags(struct task_struct *curr, 
struct held_lock *hlock)
                }
        }
 
-       /*
-        * We reuse the irq context infrastructure more broadly as a general
-        * context checking code. This tests GFP_FS recursion (a lock taken
-        * during reclaim for a GFP_FS allocation is held over a GFP_FS
-        * allocation).
-        */
-       if (!hlock->trylock && (curr->lockdep_reclaim_gfp & __GFP_FS)) {
-               if (hlock->read) {
-                       if (!mark_lock(curr, hlock, 
LOCK_USED_IN_RECLAIM_FS_READ))
-                                       return 0;
-               } else {
-                       if (!mark_lock(curr, hlock, LOCK_USED_IN_RECLAIM_FS))
-                                       return 0;
-               }
-       }
-
        return 1;
 }
 
@@ -3020,10 +2949,6 @@ static inline int separate_irq_context(struct 
task_struct *curr,
        return 0;
 }
 
-void lockdep_trace_alloc(gfp_t gfp_mask)
-{
-}
-
 #endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
 
 /*
@@ -3859,16 +3784,6 @@ void lock_unpin_lock(struct lockdep_map *lock, struct 
pin_cookie cookie)
 }
 EXPORT_SYMBOL_GPL(lock_unpin_lock);
 
-void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
-{
-       current->lockdep_reclaim_gfp = gfp_mask;
-}
-
-void lockdep_clear_current_reclaim_state(void)
-{
-       current->lockdep_reclaim_gfp = 0;
-}
-
 #ifdef CONFIG_LOCK_STAT
 static int
 print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
index 995b0cc..35ca09f 100644
--- a/kernel/locking/lockdep_states.h
+++ b/kernel/locking/lockdep_states.h
@@ -6,4 +6,3 @@
  */
 LOCKDEP_STATE(HARDIRQ)
 LOCKDEP_STATE(SOFTIRQ)
-LOCKDEP_STATE(RECLAIM_FS)
diff --git a/mm/internal.h b/mm/internal.h
index ccfc2a2..88b9107 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -15,6 +15,8 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/tracepoint-defs.h>
+#include <linux/lockdep.h>
+#include <linux/sched/mm.h>
 
 /*
  * The set of flags that only affect watermark checking and reclaim
@@ -498,4 +500,42 @@ extern const struct trace_print_flags pageflag_names[];
 extern const struct trace_print_flags vmaflag_names[];
 extern const struct trace_print_flags gfpflag_names[];
 
+
+#ifdef CONFIG_LOCKDEP
+extern struct lockdep_map __fs_reclaim_map;
+
+static inline bool __need_fs_reclaim(gfp_t gfp_mask)
+{
+       gfp_mask = memalloc_noio_flags(gfp_mask);
+
+       /* no reclaim without waiting on it */
+       if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
+               return false;
+
+       /* this guy won't enter reclaim */
+       if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC))
+               return false;
+
+       /* We're only interested __GFP_FS allocations for now */
+       if (!(gfp_mask & __GFP_FS))
+               return false;
+
+       return true;
+}
+
+static inline void fs_reclaim_acquire(gfp_t gfp_mask)
+{
+       if (__need_fs_reclaim(gfp_mask))
+               lock_map_acquire(&__fs_reclaim_map);
+}
+static inline void fs_reclaim_release(gfp_t gfp_mask)
+{
+       if (__need_fs_reclaim(gfp_mask))
+               lock_map_release(&__fs_reclaim_map);
+}
+#else
+static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
+static inline void fs_reclaim_release(gfp_t gfp_mask) { }
+#endif
+
 #endif /* __MM_INTERNAL_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eaa64d2..85ea8bf 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3387,6 +3387,12 @@ should_compact_retry(struct alloc_context *ac, unsigned 
int order, int alloc_fla
 }
 #endif /* CONFIG_COMPACTION */
 
+
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __fs_reclaim_map =
+       STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
+#endif
+
 /* Perform direct synchronous page reclaim */
 static int
 __perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -3400,7 +3406,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
        /* We now go into synchronous reclaim */
        cpuset_memory_pressure_bump();
        current->flags |= PF_MEMALLOC;
-       lockdep_set_current_reclaim_state(gfp_mask);
+       fs_reclaim_acquire(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        current->reclaim_state = &reclaim_state;
 
@@ -3408,7 +3414,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order,
                                                                ac->nodemask);
 
        current->reclaim_state = NULL;
-       lockdep_clear_current_reclaim_state();
+       fs_reclaim_release(gfp_mask);
        current->flags &= ~PF_MEMALLOC;
 
        cond_resched();
@@ -3913,7 +3919,8 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, 
unsigned int order,
                        *alloc_flags |= ALLOC_CPUSET;
        }
 
-       lockdep_trace_alloc(gfp_mask);
+       fs_reclaim_acquire(gfp_mask);
+       fs_reclaim_release(gfp_mask);
 
        might_sleep_if(gfp_mask & __GFP_DIRECT_RECLAIM);
 
diff --git a/mm/slab.h b/mm/slab.h
index 65e7c3f..753f552 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -44,6 +44,8 @@ struct kmem_cache {
 #include <linux/kmemleak.h>
 #include <linux/random.h>
 
+#include "internal.h"
+
 /*
  * State of the slab allocator.
  *
@@ -428,7 +430,10 @@ static inline struct kmem_cache 
*slab_pre_alloc_hook(struct kmem_cache *s,
                                                     gfp_t flags)
 {
        flags &= gfp_allowed_mask;
-       lockdep_trace_alloc(flags);
+
+       fs_reclaim_acquire(flags);
+       fs_reclaim_release(flags);
+
        might_sleep_if(gfpflags_allow_blocking(flags));
 
        if (should_failslab(s, flags))
diff --git a/mm/slob.c b/mm/slob.c
index eac04d43..3e32280 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -73,6 +73,8 @@
 #include <linux/atomic.h>
 
 #include "slab.h"
+#include "internal.h"
+
 /*
  * slob_block has a field 'units', which indicates size of block if +ve,
  * or offset of next block if -ve (in SLOB_UNITs).
@@ -432,7 +434,8 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, 
unsigned long caller)
 
        gfp &= gfp_allowed_mask;
 
-       lockdep_trace_alloc(gfp);
+       fs_reclaim_acquire(gfp);
+       fs_reclaim_release(gfp);
 
        if (size < PAGE_SIZE - align) {
                if (!size)
@@ -538,7 +541,8 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t 
flags, int node)
 
        flags &= gfp_allowed_mask;
 
-       lockdep_trace_alloc(flags);
+       fs_reclaim_acquire(flags);
+       fs_reclaim_release(flags);
 
        if (c->size < PAGE_SIZE) {
                b = slob_alloc(c->size, flags, c->align, node);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bc8031e..2f57e36 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3418,8 +3418,6 @@ static int kswapd(void *p)
        };
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
-       lockdep_set_current_reclaim_state(GFP_KERNEL);
-
        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;
@@ -3475,7 +3473,9 @@ static int kswapd(void *p)
                 */
                trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
                                                alloc_order);
+               fs_reclaim_acquire(GFP_KERNEL);
                reclaim_order = balance_pgdat(pgdat, alloc_order, 
classzone_idx);
+               fs_reclaim_release(GFP_KERNEL);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
 
@@ -3485,7 +3485,6 @@ static int kswapd(void *p)
 
        tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
        current->reclaim_state = NULL;
-       lockdep_clear_current_reclaim_state();
 
        return 0;
 }
@@ -3550,14 +3549,14 @@ unsigned long shrink_all_memory(unsigned long 
nr_to_reclaim)
        unsigned long nr_reclaimed;
 
        p->flags |= PF_MEMALLOC;
-       lockdep_set_current_reclaim_state(sc.gfp_mask);
+       fs_reclaim_acquire(sc.gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
        p->reclaim_state = NULL;
-       lockdep_clear_current_reclaim_state();
+       fs_reclaim_release(sc.gfp_mask);
        p->flags &= ~PF_MEMALLOC;
 
        return nr_reclaimed;
@@ -3741,7 +3740,7 @@ static int __node_reclaim(struct pglist_data *pgdat, 
gfp_t gfp_mask, unsigned in
         * and RECLAIM_UNMAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
-       lockdep_set_current_reclaim_state(gfp_mask);
+       fs_reclaim_acquire(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
@@ -3756,8 +3755,8 @@ static int __node_reclaim(struct pglist_data *pgdat, 
gfp_t gfp_mask, unsigned in
        }
 
        p->reclaim_state = NULL;
+       fs_reclaim_release(gfp_mask);
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
-       lockdep_clear_current_reclaim_state();
        return sc.nr_reclaimed >= nr_pages;
 }
 

Reply via email to