[PATCH v3 3/5] rcu/tree: Clean up dynticks counter usage

2020-05-04 Thread Joel Fernandes (Google)
The dynticks counters are confusing due to the crowbar writes of
DYNTICK_IRQ_NONIDLE, whose purpose is to detect half-interrupts (i.e. we
see rcu_irq_enter() but not rcu_irq_exit() due to a usermode upcall) and,
if so, reset the dynticks_nmi_nesting counter. This patch gets rid of
DYNTICK_IRQ_NONIDLE while keeping the code working, fully functional, and
less confusing. The confusion recently even led to patches forgetting that
DYNTICK_IRQ_NONIDLE was written to, which wasted a lot of time.

The patch has the following changes:

(1) Use dynticks_nesting instead of dynticks_nmi_nesting for determining
the outermost "EQS exit". This is needed so that rcu_nmi_enter_common()
can detect whether we have already EQS-exited, such as because of a
syscall. Currently we rely on a forced write of DYNTICK_IRQ_NONIDLE from
rcu_eqs_exit() for this purpose. This is one purpose of the
DYNTICK_IRQ_NONIDLE write (the other being detecting half-interrupts).
However, we do not need to do that: dynticks_nesting already tells us that
we have EQS-exited, so just use it, removing the dependence on
dynticks_nmi_nesting for this purpose.

(2) Keep dynticks_nmi_nesting around because:

  (a) rcu_is_cpu_rrupt_from_idle() needs to be able to detect the first
  interrupt nesting level.

  (b) We need to detect half-interrupts until we are sure they are not an
  issue. However, the comparison against DYNTICK_IRQ_NONIDLE is changed to
  a comparison with 0.

(3) Since we got rid of DYNTICK_IRQ_NONIDLE, we can also do cheaper
comparisons with zero in the code that keeps the tick on in
rcu_nmi_enter_common().

In the next patch, both of the concerns of (2) will be addressed and
then we can get rid of dynticks_nmi_nesting, however one step at a time.
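To make (1) concrete, below is a small userspace model (illustration only,
not kernel code; the variable names mirror the rcu_data fields and the
logic is deliberately simplified): once dynticks_nesting is non-zero, the
interrupt entry path can tell that the EQS has already been exited without
consulting a crowbarred dynticks_nmi_nesting value.

#include <assert.h>
#include <stdio.h>

static long dynticks_nesting;     /* 0 => in EQS (idle/user), > 0 => not */
static long dynticks_nmi_nesting; /* irq/NMI nesting, still kept for now */

static void eqs_exit_model(void)  /* e.g. syscall entry on a nohz_full CPU */
{
	dynticks_nesting = 1;
}

static void irq_enter_model(void) /* models the rcu_nmi_enter_common() idea */
{
	if (dynticks_nesting == 0)
		dynticks_nesting = 1;  /* outermost EQS exit done by the irq */
	dynticks_nmi_nesting++;
}

int main(void)
{
	eqs_exit_model();                  /* EQS already exited by a syscall */
	irq_enter_model();                 /* an interrupt then arrives...    */
	assert(dynticks_nesting == 1);     /* ...and sees we already exited   */
	assert(dynticks_nmi_nesting == 1); /* first interrupt nesting level   */
	printf("nesting=%ld nmi_nesting=%ld\n",
	       dynticks_nesting, dynticks_nmi_nesting);
	return 0;
}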

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/rcu.h  |  4 
 kernel/rcu/tree.c | 58 +++
 2 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index cf66a3ccd7573..b9f64abc48b85 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,10 +12,6 @@
 
 #include 
 
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE	((LONG_MAX / 2) + 1)
-
-
 /*
  * Grace-period counter management.
  */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index eb7a4d90b3b91..d6df8abdcc21f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -84,7 +84,6 @@
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
-   .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
.dynticks = ATOMIC_INIT(1),
 };
 static struct rcu_state rcu_state = {
@@ -553,17 +552,19 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
 /*
  * Enter an RCU extended quiescent state, which can be either the
  * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
  */
 static void rcu_eqs_enter(bool user)
 {
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
-   WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
-   WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
+   /*
+* Entering usermode/idle from interrupt is not handled. These would
+* mean usermode upcalls or idle exit happened from interrupts. Remove
+* the warning by 2020.
+*/
+   if (WARN_ON_ONCE(rdp->dynticks_nmi_nesting != 0))
+   WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 rdp->dynticks_nesting == 0);
if (rdp->dynticks_nesting != 1) {
@@ -641,26 +642,29 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 * (We are exiting an NMI handler, so RCU better be paying attention
 * to us!)
 */
+   WARN_ON_ONCE(rdp->dynticks_nesting <= 0);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
 
+   WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
+  rdp->dynticks_nmi_nesting - 1);
/*
 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
 * leave it in non-RCU-idle state.
 */
-   if (rdp->dynticks_nmi_nesting != 1) {
+   if (rdp->dynticks_nesting != 1) {
trace_rcu_dyntick(TPS("StillNonIdle"), TPS("IRQ"),
- rdp->dynticks_nmi_nesting,
- rdp->dynticks_nmi_nesting - 2, atomic_read(&rdp->dynticks));
-   WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
-  rdp->dynticks_nmi_nesting - 2);
+ rdp->dynticks_nesting,
+   

[PATCH v3 5/5] trace: events: rcu: Change description of rcu_dyntick trace event

2020-05-04 Thread Joel Fernandes (Google)
From: Madhuparna Bhowmik 

The different strings used for describing the polarity are
Start, End and StillNonIdle. Since StillIdle is not used in any
rcu_dyntick tracepoint, it can be removed from the description, and
StillNonIdle, which is used in a few rcu_dyntick tracepoints, can be
added.

Similarly, USER, IDLE and IRQ are used for describing context in
the rcu_dyntick tracepoints. Since "KERNEL" is not used for any
of the rcu_dyntick tracepoints, remove it from the description.

Signed-off-by: Madhuparna Bhowmik 
Signed-off-by: Joel Fernandes (Google) 
---
 include/trace/events/rcu.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 164c43b72ca29..02dcd119f3263 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -437,9 +437,9 @@ TRACE_EVENT_RCU(rcu_fqs,
 /*
  * Tracepoint for dyntick-idle entry/exit events.  These take 2 strings
  * as argument:
- * polarity: "Start", "End", "StillIdle" for entering, exiting or still being
- * in dyntick-idle mode.
- * context: "USER" or "KERNEL" or "IRQ".
+ * polarity: "Start", "End", "StillNonIdle" for entering, exiting or still not
+ *           being in dyntick-idle mode.
+ * context: "USER" or "IDLE" or "IRQ".
  * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context.
  *
  * These events also take a pair of numbers, which indicate the nesting
-- 
2.26.2.526.g744177e7f7-goog



[PATCH v3 2/5] rcu/tree: Add better tracing for dyntick-idle

2020-05-04 Thread Joel Fernandes (Google)
The dyntick-idle traces are a bit confusing. This patch makes them simpler
and adds some missing cases, such as EQS-enter due to user vs idle mode.

Following are the changes:
(1) Add a new context field to trace_rcu_dyntick tracepoint. This
context field can be "USER", "IDLE" or "IRQ".

(2) Remove the "++=" and "--=" strings and replace them with
   "StillNonIdle". This is much easier on the eyes, and the -- and ++
   are easily apparent in the dynticks_nesting counters we are printing
   anyway.
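For illustration only (the values are made up), a tiny userspace program
using the new TP_printk() format string from the hunk below shows how the
added context string appears next to the polarity string:

#include <stdio.h>

int main(void)
{
	/* polarity, context, oldnesting, newnesting, dynticks (hypothetical) */
	printf("%s %s %lx %lx %#3x\n", "Start", "USER", 1UL, 0UL, 0x1u & 0xfffu);
	printf("%s %s %lx %lx %#3x\n", "StillNonIdle", "IRQ", 3UL, 1UL, 0x5u & 0xfffu);
	return 0;
}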

Signed-off-by: Joel Fernandes (Google) 
---
 include/trace/events/rcu.h | 29 -
 kernel/rcu/tree.c  | 20 +---
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index f9a7811148e2a..164c43b72ca29 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -435,26 +435,28 @@ TRACE_EVENT_RCU(rcu_fqs,
 #endif /* #if defined(CONFIG_TREE_RCU) */
 
 /*
- * Tracepoint for dyntick-idle entry/exit events.  These take a string
- * as argument: "Start" for entering dyntick-idle mode, "Startirq" for
- * entering it from irq/NMI, "End" for leaving it, "Endirq" for leaving it
- * to irq/NMI, "--=" for events moving towards idle, and "++=" for events
- * moving away from idle.
+ * Tracepoint for dyntick-idle entry/exit events.  These take 2 strings
+ * as argument:
+ * polarity: "Start", "End", "StillIdle" for entering, exiting or still being
+ * in dyntick-idle mode.
+ * context: "USER" or "KERNEL" or "IRQ".
+ * NMIs nested in IRQs are inferred with dynticks_nesting > 1 in IRQ context.
  *
  * These events also take a pair of numbers, which indicate the nesting
  * depth before and after the event of interest, and a third number that is
- * the ->dynticks counter.  Note that task-related and interrupt-related
- * events use two separate counters, and that the "++=" and "--=" events
- * for irq/NMI will change the counter by two, otherwise by one.
+ * the ->dynticks counter. During NMI nesting within IRQs, the dynticks_nesting
+ * counter changes by two, otherwise one.
  */
 TRACE_EVENT_RCU(rcu_dyntick,
 
-   TP_PROTO(const char *polarity, long oldnesting, long newnesting, int dynticks),
+   TP_PROTO(const char *polarity, const char *context, long oldnesting,
+long newnesting, int dynticks),
 
-   TP_ARGS(polarity, oldnesting, newnesting, dynticks),
+   TP_ARGS(polarity, context, oldnesting, newnesting, dynticks),
 
TP_STRUCT__entry(
__field(const char *, polarity)
+   __field(const char *, context)
__field(long, oldnesting)
__field(long, newnesting)
__field(int, dynticks)
@@ -462,14 +464,15 @@ TRACE_EVENT_RCU(rcu_dyntick,
 
TP_fast_assign(
__entry->polarity = polarity;
+   __entry->context = context;
__entry->oldnesting = oldnesting;
__entry->newnesting = newnesting;
__entry->dynticks = dynticks;
),
 
-   TP_printk("%s %lx %lx %#3x", __entry->polarity,
- __entry->oldnesting, __entry->newnesting,
- __entry->dynticks & 0xfff)
+   TP_printk("%s %s %lx %lx %#3x", __entry->polarity,
+   __entry->context, __entry->oldnesting, __entry->newnesting,
+   __entry->dynticks & 0xfff)
 );
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1ec7b1d4a03c4..eb7a4d90b3b91 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -573,7 +573,8 @@ static void rcu_eqs_enter(bool user)
}
 
lockdep_assert_irqs_disabled();
-   trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, 
atomic_read(>dynticks));
+   trace_rcu_dyntick(TPS("Start"), (user ? TPS("USER") : TPS("IDLE")),
+ rdp->dynticks_nesting, 0, 
atomic_read(>dynticks));
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && 
!is_idle_task(current));
rdp = this_cpu_ptr(_data);
do_nocb_deferred_wakeup(rdp);
@@ -648,15 +649,17 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 * leave it in non-RCU-idle state.
 */
if (rdp->dynticks_nmi_nesting != 1) {
-   trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, 
rdp->dynticks_nmi_nesting - 2,
- atomic_read(>dynticks));
+   trace_rcu_dyntick(TPS("StillNonIdle"), TPS("IRQ"),
+ rdp->dynticks_nmi_nesting,
+ rdp->

[PATCH v3 4/5] rcu/tree: Remove dynticks_nmi_nesting counter

2020-05-04 Thread Joel Fernandes (Google)
The dynticks_nmi_nesting counter serves 3 purposes:

  (a) rcu_is_cpu_rrupt_from_idle() needs to be able to detect the first
  interrupt nesting level.

  (b) We need to detect half-interrupts until we are sure they are not an
  issue (the comparison against DYNTICK_IRQ_NONIDLE was already changed to
  a comparison with 0 in the previous patch).

  (c) When a quiescent state report is needed from a nohz_full CPU,
  the nesting counter detects that we are in a first-level interrupt.

For (a), we can just use dynticks_nesting == 1 to determine this. Only the
outermost interrupt that interrupted an RCU-idle state can set it to 1.

For (b), this warning condition has not occurred for several kernel
releases. We still keep the warning, but change it to use in_irq()
instead of the nesting counter. In a later year, we can remove the
warning.

For (c), the nest check is not really necessary since forced_tick would
have been set to true in the outermost interrupt, so the nested/NMI
interrupts will check forced_tick anyway, and bail.
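As a quick illustration of the counting rule that the updated
Data-Structures.rst below describes (the initial transition adds one,
nested transitions add two, so a nesting level of five is represented by
nine), here is a userspace model; it is a sketch only, not kernel code:

#include <assert.h>

static long dynticks_nesting;	/* models rcu_data.dynticks_nesting */

static void irq_enter_model(void)	/* irq/NMI entry */
{
	dynticks_nesting += (dynticks_nesting == 0) ? 1 : 2;
}

static void irq_exit_model(void)	/* irq/NMI exit */
{
	dynticks_nesting -= (dynticks_nesting == 1) ? 1 : 2;
}

int main(void)
{
	int i;

	irq_enter_model();
	/* dynticks_nesting == 1 is what a rcu_is_cpu_rrupt_from_idle()-style
	 * check can test for: the first-level interrupt taken from idle. */
	assert(dynticks_nesting == 1);

	for (i = 0; i < 4; i++)
		irq_enter_model();
	assert(dynticks_nesting == 9);	/* nesting level five -> value nine */

	for (i = 0; i < 5; i++)
		irq_exit_model();
	assert(dynticks_nesting == 0);	/* back to dyntick-idle */
	return 0;
}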

Signed-off-by: Joel Fernandes (Google) 
---
 .../Data-Structures/Data-Structures.rst   | 31 ---
 Documentation/RCU/stallwarn.rst   |  6 +--
 kernel/rcu/tree.c | 54 +++
 kernel/rcu/tree.h |  4 +-
 kernel/rcu/tree_stall.h   |  4 +-
 5 files changed, 36 insertions(+), 63 deletions(-)

diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst 
b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
index 4a48e20a46f2b..a5a907f434a1a 100644
--- a/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
+++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.rst
@@ -936,10 +936,9 @@ This portion of the rcu_data structure is declared as 
follows:
 ::
 
  1   long dynticks_nesting;
- 2   long dynticks_nmi_nesting;
- 3   atomic_t dynticks;
- 4   bool rcu_need_heavy_qs;
- 5   bool rcu_urgent_qs;
+ 2   atomic_t dynticks;
+ 3   bool rcu_need_heavy_qs;
+ 4   bool rcu_urgent_qs;
 
 These fields in the rcu_data structure maintain the per-CPU dyntick-idle
 state for the corresponding CPU. The fields may be accessed only from
@@ -948,26 +947,14 @@ the corresponding CPU (and from tracing) unless otherwise 
stated.
 The ``->dynticks_nesting`` field counts the nesting depth of process
 execution, so that in normal circumstances this counter has value zero
 or one. NMIs, irqs, and tracers are counted by the
-``->dynticks_nmi_nesting`` field. Because NMIs cannot be masked, changes
+``->dynticks_nesting`` field as well. Because NMIs cannot be masked, changes
 to this variable have to be undertaken carefully using an algorithm
 provided by Andy Lutomirski. The initial transition from idle adds one,
 and nested transitions add two, so that a nesting level of five is
-represented by a ``->dynticks_nmi_nesting`` value of nine. This counter
+represented by a ``->dynticks_nesting`` value of nine. This counter
 can therefore be thought of as counting the number of reasons why this
-CPU cannot be permitted to enter dyntick-idle mode, aside from
-process-level transitions.
-
-However, it turns out that when running in non-idle kernel context, the
-Linux kernel is fully capable of entering interrupt handlers that never
-exit and perhaps also vice versa. Therefore, whenever the
-``->dynticks_nesting`` field is incremented up from zero, the
-``->dynticks_nmi_nesting`` field is set to a large positive number, and
-whenever the ``->dynticks_nesting`` field is decremented down to zero,
-the the ``->dynticks_nmi_nesting`` field is set to zero. Assuming that
-the number of misnested interrupts is not sufficient to overflow the
-counter, this approach corrects the ``->dynticks_nmi_nesting`` field
-every time the corresponding CPU enters the idle loop from process
-context.
+CPU cannot be permitted to enter dyntick-idle mode. It counts both the
+process-level and interrupt transitions.
 
 The ``->dynticks`` field counts the corresponding CPU's transitions to
 and from either dyntick-idle or user mode, so that this counter has an
@@ -1000,7 +987,9 @@ code.
 +---+
 | Because this would fail in the presence of interrupts whose handlers  |
 | never return and of handlers that manage to return from a made-up |
-| interrupt.|
+| interrupt. NOTE: The counters have now been combined however  |
+| a temporary warning has been left to make sure this condition never   |
+| occurs.   |
 +---+
 
 Additional fields are present for some special-purpose builds, and are
diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst
index 08bc9aec4606a..d7042d893b167 100644
--- 

[PATCH v3 1/5] Revert b8c17e6664c4 ("rcu: Maintain special bits at bottom of ->dynticks counter")

2020-05-04 Thread Joel Fernandes (Google)
This code is unused and can now be removed. The revert was straightforward.

Tested with rcutorture on all TREE configurations.
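For reference, a userspace sketch (not kernel code) of what ->dynticks
looks like after this revert: with the control bit gone, it is a plain
per-transition counter whose low bit says whether RCU is watching,
matching the "& 0x1" checks in the hunks below.

#include <assert.h>
#include <stdatomic.h>

static atomic_int dynticks = 1;	/* boot value: RCU is watching */

static int in_eqs(void)		/* models rcu_dynticks_curr_cpu_in_eqs() */
{
	return !(atomic_load(&dynticks) & 0x1);
}

static void eqs_enter(void) { atomic_fetch_add(&dynticks, 1); }
static void eqs_exit(void)  { atomic_fetch_add(&dynticks, 1); }

int main(void)
{
	assert(!in_eqs());	/* odd: watching */
	eqs_enter();
	assert(in_eqs());	/* even: idle/user, RCU not watching */
	eqs_exit();
	assert(!in_eqs());
	return 0;
}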

Link: 
http://lore.kernel.org/r/CALCETrWNPOOdTrFabTDd=h7+wc6xj9rjceg6ol1s0rtv5pf...@mail.gmail.com
Suggested-by: Andy Lutomirski 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rcutiny.h |  3 --
 kernel/rcu/tree.c   | 93 +++--
 2 files changed, 24 insertions(+), 72 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 3465ba704a111..dbcddc7b26b94 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,9 +14,6 @@
 
 #include  /* for HZ */
 
-/* Never flag non-existent other CPUs! */
-static inline bool rcu_eqs_special_set(int cpu) { return false; }
-
 static inline unsigned long get_state_synchronize_rcu(void)
 {
return 0;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6d39485f7f517..1ec7b1d4a03c4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -82,20 +82,10 @@
 
 /* Data structures. */
 
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control.  Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
-#ifndef rcu_eqs_special_exit
-#define rcu_eqs_special_exit() do { } while (0)
-#endif
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
-   .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+   .dynticks = ATOMIC_INIT(1),
 };
 static struct rcu_state rcu_state = {
	.level = { &rcu_state.node[0] },
@@ -245,21 +235,18 @@ void rcu_softirq_qs(void)
 static void rcu_dynticks_eqs_enter(void)
 {
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-   int seq;
+   int special;
+
+   rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
 
/*
-* CPUs seeing atomic_add_return() must see prior RCU read-side
+* CPUs seeing atomic_inc_return() must see prior RCU read-side
 * critical sections, and we also must force ordering with the
 * next idle sojourn.
 */
-   rcu_dynticks_task_trace_enter();  // Before ->dynticks update!
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+   special = atomic_inc_return(&rdp->dynticks);
// RCU is no longer watching.  Better be in extended quiescent state!
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_CTR));
-   /* Better not have special action (TLB flush) pending! */
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_MASK));
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
 }
 
 /*
@@ -270,24 +257,18 @@ static void rcu_dynticks_eqs_enter(void)
 static void rcu_dynticks_eqs_exit(void)
 {
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-   int seq;
+   int special;
 
/*
-* CPUs seeing atomic_add_return() must see prior idle sojourns,
+* CPUs seeing atomic_inc_return() must see prior idle sojourns,
 * and we also must force ordering with the next RCU read-side
 * critical section.
 */
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+   special = atomic_inc_return(&rdp->dynticks);
// RCU is now watching.  Better not be in an extended quiescent state!
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
+
rcu_dynticks_task_trace_exit();  // After ->dynticks update!
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-!(seq & RCU_DYNTICK_CTRL_CTR));
-   if (seq & RCU_DYNTICK_CTRL_MASK) {
-   atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
-   smp_mb__after_atomic(); /* _exit after clearing mask. */
-   /* Prefer duplicate flushes to losing a flush. */
-   rcu_eqs_special_exit();
-   }
 }
 
 /*
@@ -304,9 +285,9 @@ static void rcu_dynticks_eqs_online(void)
 {
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
-   if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+   if (atomic_read(&rdp->dynticks) & 0x1)
	return;
-   atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+   atomic_add(0x1, &rdp->dynticks);
 }
 
 /*
@@ -318,7 +299,7 @@ static bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
 
-   return !(atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+   return !(atomic_read(&rdp->dynticks) & 0x1);
 }
 
 /*
@@ -329,7 +310,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
 {
	int snap = atomic_add_return(0, &rdp->dynticks);
 
-   return snap & ~RCU_DYNTICK_CTRL_MASK;
+   return snap;
 }
 
 /*
@@ -338,7 +

[PATCH v3 0/5] RCU dyntick nesting counter cleanups for rcu -dev

2020-05-04 Thread Joel Fernandes (Google)
These patches clean up the usage of the dynticks nesting counters, simplifying
the code while preserving the use cases.

It is a much-needed simplification that makes the code less confusing and
prevents future bugs such as those that arise from forgetting that the
dynticks_nmi_nesting counter is not a simple counter and can be "crowbarred" in
common situations.

rcutorture testing with all TREE RCU configurations succeeds with
CONFIG_RCU_EQS_DEBUG=y and CONFIG_PROVE_LOCKING=y.

v1->v2:
- Rebase on v5.6-rc6

v2->v3:
- Rebase on rcu/dev with adjustments for tasks-RCU.

Joel Fernandes (Google) (4):
Revert b8c17e6664c4 ("rcu: Maintain special bits at bottom of
->dynticks counter")
rcu/tree: Add better tracing for dyntick-idle
rcu/tree: Clean up dynticks counter usage
rcu/tree: Remove dynticks_nmi_nesting counter

Madhuparna Bhowmik (1):
trace: events: rcu: Change description of rcu_dyntick trace event

.../Data-Structures/Data-Structures.rst   |  31 +--
Documentation/RCU/stallwarn.rst   |   6 +-
include/linux/rcutiny.h   |   3 -
include/trace/events/rcu.h|  29 +--
kernel/rcu/rcu.h  |   4 -
kernel/rcu/tree.c | 199 +++---
kernel/rcu/tree.h |   4 +-
kernel/rcu/tree_stall.h   |   4 +-
8 files changed, 110 insertions(+), 170 deletions(-)

--
2.26.2.526.g744177e7f7-goog



[tip: perf/core] perf_event: Add support for LSM and SELinux checks

2019-10-18 Thread tip-bot2 for Joel Fernandes (Google)
The following commit has been merged into the perf/core branch of tip:

Commit-ID: da97e18458fb42d7c00fac5fd1c56a3896ec666e
Gitweb:
https://git.kernel.org/tip/da97e18458fb42d7c00fac5fd1c56a3896ec666e
Author:Joel Fernandes (Google) 
AuthorDate:Mon, 14 Oct 2019 13:03:08 -04:00
Committer: Peter Zijlstra 
CommitterDate: Thu, 17 Oct 2019 21:31:55 +02:00

perf_event: Add support for LSM and SELinux checks

In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl.  This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on the single value thus making the control very limited and
   coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
   all processes get access to perf_event_open(2) opening the door to
   security issues.

This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.

5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked to keep it consistent with the
   sysctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to
   perf_event_paranoid sysctl == 3 patch carried in Android and several other
   distros but was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD. Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.

5. perf_event_write: Called from the ioctl(2) syscalls for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.
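To show how the pieces fit together, here is a userspace model (an
assumption-laden sketch, not the kernel implementation: the helper is
simplified, the PERF_SECURITY_KERNEL value is illustrative, and the LSM
hook is stubbed) of how a perf_allow_kernel()-style helper layers the new
LSM check on top of the old paranoid-sysctl check:

#include <errno.h>
#include <stdio.h>

#define PERF_SECURITY_KERNEL 2			/* illustrative value */

static int sysctl_perf_event_paranoid = -1;	/* as suggested above */
static int has_cap_sys_admin;

static int security_perf_event_open(int type)	/* stand-in for the LSM hook */
{
	(void)type;
	return 0;				/* policy allows */
}

static int perf_allow_kernel(void)
{
	/* Old behaviour: one global knob plus CAP_SYS_ADMIN. */
	if (sysctl_perf_event_paranoid > 1 && !has_cap_sys_admin)
		return -EACCES;

	/* New behaviour: defer the fine-grained decision to the LSM. */
	return security_perf_event_open(PERF_SECURITY_KERNEL);
}

int main(void)
{
	printf("perf_allow_kernel() -> %d\n", perf_allow_kernel());
	return 0;
}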

Suggested-by: Peter Zijlstra 
Co-developed-by: Peter Zijlstra 
Signed-off-by: Joel Fernandes (Google) 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: James Morris 
Cc: Arnaldo Carvalho de Melo 
Cc: rost...@goodmis.org
Cc: Yonghong Song 
Cc: Kees Cook 
Cc: Ingo Molnar 
Cc: Alexei Starovoitov 
Cc: je...@google.com
Cc: Jiri Olsa 
Cc: Daniel Borkmann 
Cc: primi...@google.com
Cc: Song Liu 
Cc: rsavit...@google.com
Cc: Namhyung Kim 
Cc: Matthew Garrett 
Link: https://lkml.kernel.org/r/20191014170308.70668-1-j...@joelfernandes.org
---
 arch/powerpc/perf/core-book3s.c | 18 +++
 arch/x86/events/intel/bts.c |  8 +--
 arch/x86/events/intel/core.c|  5 +-
 arch/x86/events/intel/p4.c  |  5 +-
 include/linux/lsm_hooks.h   | 15 ++-
 include/linux/perf_event.h  | 36 ---
 include/linux/security.h| 38 ++-
 kernel/events/core.c| 57 ++-
 kernel/trace/trace_event_perf.c | 15 --
 security/security.c | 27 +++-
 security/selinux/hooks.c| 69 -
 security/selinux/include/classmap.h |  2 +-
 security/selinux/include/objsec.h   |  6 +-
 13 files changed, 261 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index ca92e01..4860462 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -96,7 +96,7 @@ static inline unsigned long perf_ip_adjust(struct pt_regs 
*regs)
 {
return 0;
 }
-static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { }
+static inline void perf_get_data_addr(struct perf_event *event, struct pt_regs 
*regs, u64 *addrp) { }
 static inline u32 perf_get_misc_flags(struct pt_regs *regs)
 {
return 0;
@@ -127,7 +127,7 @@ static unsigned long ebb_switch_in(bool ebb, struct 
cpu_hw_events *cpuhw)
 static inline void power_pmu_bhrb_enable(struct perf_event *event) {}
 static inline void power_pmu_bhrb_disable(struct perf_event *event) {}
 static void power_pmu_sched_task(struct perf_event_context *ctx, bool 
sched_in) {}
-static inline void power_pmu_bhrb_read(struct cpu_hw_events *cpuhw) {}
+static inline void power_pmu_bhrb_read(struct

[PATCH v2] perf_event: Add support for LSM and SELinux checks

2019-10-14 Thread Joel Fernandes (Google)
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl.  This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on the single value thus making the control very limited and
   coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
   all processes get access to perf_event_open(2) opening the door to
   security issues.

This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.

5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked to keep it consistent with the
   sysctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to
   perf_event_paranoid sysctl == 3 patch carried in Android and several other
   distros but was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD. Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.

5. perf_event_write: Called from the ioctl(2) syscalls for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.

Cc: Peter Zijlstra 
Cc: rost...@goodmis.org
Cc: primi...@google.com
Cc: rsavit...@google.com
Cc: je...@google.com
Cc: kernel-t...@android.com
Acked-by: James Morris 
Co-developed-by: Peter Zijlstra 
Suggested-by: Peter Zijlstra 
Signed-off-by: Joel Fernandes (Google) 
---

Changes since v1:
 o Fixes from Peter Ziljstra.
 o Added Ack from James Morris and Co-developed-by tag for Peter.
Changes since RFC:
 o Small nits, style changes (James Morris).
 o Consolidation of code (Peter Zijlstra).


 arch/x86/events/intel/bts.c |  8 ++--
 arch/x86/events/intel/core.c|  5 ++-
 arch/x86/events/intel/p4.c  |  5 ++-
 include/linux/lsm_hooks.h   | 15 +++
 include/linux/perf_event.h  | 28 +---
 include/linux/security.h| 39 +++-
 include/uapi/linux/perf_event.h |  9 
 kernel/events/core.c| 57 +++-
 kernel/trace/trace_event_perf.c | 15 ---
 security/security.c | 27 +++
 security/selinux/hooks.c| 69 +
 security/selinux/include/classmap.h |  2 +
 security/selinux/include/objsec.h   |  6 ++-
 13 files changed, 255 insertions(+), 30 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..38de4a7f6752 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
 * Note that the default paranoia setting permits unprivileged
 * users to profile the kernel.
 */
-   if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-   !capable(CAP_SYS_ADMIN))
-   return -EACCES;
+   if (event->attr.exclude_kernel) {
+   ret = perf_allow_kernel(&event->attr);
+   if (ret)
+   return ret;
+   }
 
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..32967a9e9962 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (x86_pmu.version < 3)
return -EINVAL;
 
-   if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-   return -EACCES;
+   ret = perf_allow_cpu(&event->attr);
+   if (ret)
+   return ret;
 
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
 
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..a4cc66005ce8 100

[PATCH] perf_event: Add support for LSM and SELinux checks

2019-10-11 Thread Joel Fernandes (Google)
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl.  This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on the single value thus making the control very limited and
   coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
   all processes get access to perf_event_open(2) opening the door to
   security issues.

This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.

5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked to keep it consistent with the
   sysctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to
   perf_event_paranoid sysctl == 3 patch carried in Android and several other
   distros but was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD. Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) and mmap(2) syscalls for the event.

5. perf_event_write: Called from the ioctl(2) syscalls for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.

Suggested-by: Peter Zijlstra 
Cc: Peter Zijlstra 
Cc: rost...@goodmis.org
Cc: primi...@google.com
Cc: rsavit...@google.com
Cc: je...@google.com
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 
---
Changes since RFC:
 o Small nits, style changes (James Morris).
 o Consolidation of code (Peter Zijlstra).


 arch/x86/events/intel/bts.c |  8 ++--
 arch/x86/events/intel/core.c|  5 ++-
 arch/x86/events/intel/p4.c  |  5 ++-
 include/linux/lsm_hooks.h   | 15 +++
 include/linux/perf_event.h  | 28 +---
 include/linux/security.h| 39 +++-
 include/uapi/linux/perf_event.h |  9 
 kernel/events/core.c| 52 +-
 kernel/trace/trace_event_perf.c | 15 ---
 security/security.c | 27 +++
 security/selinux/hooks.c| 69 +
 security/selinux/include/classmap.h |  2 +
 security/selinux/include/objsec.h   |  6 ++-
 13 files changed, 250 insertions(+), 30 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..38de4a7f6752 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -549,9 +549,11 @@ static int bts_event_init(struct perf_event *event)
 * Note that the default paranoia setting permits unprivileged
 * users to profile the kernel.
 */
-   if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
-   !capable(CAP_SYS_ADMIN))
-   return -EACCES;
+   if (event->attr.exclude_kernel) {
+   ret = perf_allow_kernel(&event->attr);
+   if (ret)
+   return ret;
+   }
 
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..32967a9e9962 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3315,8 +3315,9 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (x86_pmu.version < 3)
return -EINVAL;
 
-   if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
-   return -EACCES;
+   ret = perf_allow_cpu(&event->attr);
+   if (ret)
+   return ret;
 
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
 
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..a4cc66005ce8 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,8 +776,9 @@ static int p4_validate_raw_event(struct perf_event *event)
 * the user n

[PATCH RFC] perf_event: Add support for LSM and SELinux checks

2019-10-09 Thread Joel Fernandes (Google)
In current mainline, the degree of access to perf_event_open(2) system
call depends on the perf_event_paranoid sysctl.  This has a number of
limitations:

1. The sysctl is only a single value. Many types of accesses are controlled
   based on the single value thus making the control very limited and
   coarse grained.
2. The sysctl is global, so if the sysctl is changed, then that means
   all processes get access to perf_event_open(2) opening the door to
   security issues.

This patch adds LSM and SELinux access checking which will be used in
Android to access perf_event_open(2) for the purposes of attaching BPF
programs to tracepoints, perf profiling and other operations from
userspace. These operations are intended for production systems.

5 new LSM hooks are added:
1. perf_event_open: This controls access during the perf_event_open(2)
   syscall itself. The hook is called from all the places that the
   perf_event_paranoid sysctl is checked to keep it consistent with the
   sysctl. The hook gets passed a 'type' argument which controls CPU,
   kernel and tracepoint accesses (in this context, CPU, kernel and
   tracepoint have the same semantics as the perf_event_paranoid sysctl).
   Additionally, I added an 'open' type which is similar to
   perf_event_paranoid sysctl == 3 patch carried in Android and several other
   distros but was rejected in mainline [1] in 2016.

2. perf_event_alloc: This allocates a new security object for the event
   which stores the current SID within the event. It will be useful when
   the perf event's FD is passed through IPC to another process which may
   try to read the FD. Appropriate security checks will limit access.

3. perf_event_free: Called when the event is closed.

4. perf_event_read: Called from the read(2) system call path for the event.

5. perf_event_write: Called from the read(2) system call path for the event.

[1] https://lwn.net/Articles/696240/

Since Peter had suggested LSM hooks in 2016 [1], I am adding his
Suggested-by tag below.

To use this patch, we set the perf_event_paranoid sysctl to -1 and then
apply selinux checking as appropriate (default deny everything, and then
add policy rules to give access to domains that need it). In the future
we can remove the perf_event_paranoid sysctl altogether.

Suggested-by: Peter Zijlstra 
Cc: Peter Zijlstra 
Cc: rost...@goodmis.org
Cc: primi...@google.com
Cc: rsavit...@google.com
Cc: je...@google.com
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 

---
 arch/x86/events/intel/bts.c |  5 +++
 arch/x86/events/intel/core.c|  5 +++
 arch/x86/events/intel/p4.c  |  5 +++
 include/linux/lsm_hooks.h   | 15 +++
 include/linux/perf_event.h  |  3 ++
 include/linux/security.h| 39 +++-
 include/uapi/linux/perf_event.h | 13 ++
 kernel/events/core.c| 59 +---
 kernel/trace/trace_event_perf.c | 15 ++-
 security/security.c | 33 ++
 security/selinux/hooks.c| 69 +
 security/selinux/include/classmap.h |  2 +
 security/selinux/include/objsec.h   |  6 ++-
 13 files changed, 259 insertions(+), 10 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 5ee3fed881d3..9796fc094dad 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -553,6 +554,10 @@ static int bts_event_init(struct perf_event *event)
!capable(CAP_SYS_ADMIN))
return -EACCES;
 
+   ret = security_perf_event_open(&event->attr, PERF_SECURITY_KERNEL);
+   if (ret)
+   return ret;
+
if (x86_add_exclusive(x86_lbr_exclusive_bts))
return -EBUSY;
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 27ee47a7be66..75b6b9b239ae 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -11,6 +11,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -3318,6 +3319,10 @@ static int intel_pmu_hw_config(struct perf_event *event)
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
return -EACCES;
 
+   ret = security_perf_event_open(&event->attr, PERF_SECURITY_CPU);
+   if (ret)
+   return ret;
+
event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
 
return 0;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index dee579efb2b2..6ac1a0328710 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -8,6 +8,7 @@
  */
 
 #include 
+#include 
 
 #include 
 #include 
@@ -778,6 +779,10 @@ static int p4_validate_raw_event(struct perf_event *event)
if (p4_ht_active() && p4_event_bind_map[v].shared) {
if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 

[PATCH] Remove GP_REPLAY state from rcu_sync

2019-10-04 Thread Joel Fernandes (Google)
From: Joel Fernandes 

Please consider this an RFC for discussion only. I just want to discuss
why the GP_REPLAY state is needed at all.

Here's the intention AFAICS:
When rcu_sync_exit() has happened, the gp_state changes to GP_EXIT while
we wait for a grace period before transitioning to GP_IDLE. In the
meantime, if we receive another rcu_sync_exit(), then we want to wait
for another GP to account for that.

Drawing some timing diagrams, it looks like this:

Legend:
rse = rcu_sync_enter
rsx = rcu_sync_exit
i = GP_IDLE
x = GP_EXIT
r = GP_REPLAY
e = GP_ENTER
p = GP_PASSED
rx = GP_REPLAY changes to GP_EXIT

GP num = The GP we are on.

note: A GP passes between the states:
  e and p
  x and i
  x and rx
  rx and i

In a simple case, the timing and states look like:
time
-->
GP num 111222
GP state  ie px i
CPU0 : rsersx

However we can enter the replay state like this:
time
-->
GP num 1112333
GP state  ie px  r rxi
CPU0 : rsersx
CPU1 : rse rsx

Due to the second rse + rsx, we had to wait for another GP.

I believe the rationale is, if another rsx happens, another GP has to
happen.

But this is not always true if you consider the following events:

time
-->
GP num 11 2
GP state  ie px r  rx i
CPU0 : rsersx
CPU1 : rse rsx
CPU2 : rse rsx

Here, we had 3 grace periods that elapsed, 1 for the rcu_sync_enter(),
and 2 for the rcu_sync_exit(s).

However, we had 3 rcu_sync_exit()s, not 2. In other words, the
rcu_sync_exit() got batched.

So my point here is, rcu_sync_exit() does not really always cause a new
GP to happen and we can end up having the rcu_sync_exit()s as batched
and sharing the same grace period.

Then what is the point of the GP_REPLAY state at all if it does not
always wait for a new GP?  Taking a step back, why did we intend to have
to wait for a new GP if another rcu_sync_exit() comes while one is still
in progress?
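To make the batching point concrete, here is a userspace model (simplified,
with no real grace periods or locking) of one way several rcu_sync_exit()s
can share a single exit-side grace period without ever entering GP_REPLAY:

#include <stdio.h>

enum { GP_IDLE, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
static int gp_state = GP_PASSED;  /* after the rcu_sync_enter()s + one GP */
static int gp_count = 3;          /* three outstanding rcu_sync_enter()s  */

static void rcu_sync_exit_model(void)
{
	if (!--gp_count) {
		if (gp_state == GP_PASSED)
			gp_state = GP_EXIT;    /* queue a GP callback */
		else if (gp_state == GP_EXIT)
			gp_state = GP_REPLAY;  /* a later exit asks for one more GP */
	}
}

static void gp_callback_model(void)        /* runs after a grace period */
{
	if (gp_state == GP_REPLAY)
		gp_state = GP_EXIT;            /* requeue and wait again */
	else if (gp_state == GP_EXIT)
		gp_state = GP_IDLE;
}

int main(void)
{
	rcu_sync_exit_model();  /* CPU0: 3 -> 2, no state change */
	rcu_sync_exit_model();  /* CPU1: 2 -> 1, no state change */
	rcu_sync_exit_model();  /* CPU2: 1 -> 0, GP_PASSED -> GP_EXIT */
	gp_callback_model();    /* one GP later: GP_EXIT -> GP_IDLE */
	printf("final state %d (GP_IDLE=%d): replay never used\n",
	       gp_state, GP_IDLE);
	return 0;
}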

Cc: bris...@redhat.com
Cc: pet...@infradead.org
Cc: o...@redhat.com
Cc: paul...@kernel.org
Cc: r...@vger.kernel.org
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/sync.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index d4558ab7a07d..4f3aad67992c 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -10,7 +10,7 @@
 #include 
 #include 
 
-enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT, GP_REPLAY };
+enum { GP_IDLE = 0, GP_ENTER, GP_PASSED, GP_EXIT };
 
#define	rss_lock	gp_wait.lock
 
@@ -85,13 +85,6 @@ static void rcu_sync_func(struct rcu_head *rhp)
 */
WRITE_ONCE(rsp->gp_state, GP_PASSED);
	wake_up_locked(&rsp->gp_wait);
-   } else if (rsp->gp_state == GP_REPLAY) {
-   /*
-* A new rcu_sync_exit() has happened; requeue the callback to
-* catch a later GP.
-*/
-   WRITE_ONCE(rsp->gp_state, GP_EXIT);
-   rcu_sync_call(rsp);
} else {
/*
 * We're at least a GP after the last rcu_sync_exit(); eveybody
@@ -167,16 +160,13 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  */
 void rcu_sync_exit(struct rcu_sync *rsp)
 {
-   WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
-   WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
+   WARN_ON_ONCE(READ_ONCE(rsp->gp_state) < GP_PASSED);
 
	spin_lock_irq(&rsp->rss_lock);
if (!--rsp->gp_count) {
if (rsp->gp_state == GP_PASSED) {
WRITE_ONCE(rsp->gp_state, GP_EXIT);
rcu_sync_call(rsp);
-   } else if (rsp->gp_state == GP_EXIT) {
-   WRITE_ONCE(rsp->gp_state, GP_REPLAY);
}
}
	spin_unlock_irq(&rsp->rss_lock);
-- 
2.23.0.581.g78d2f28ef7-goog



[PATCH] MAINTAINERS: Add me for Linux Kernel memory consistency model (LKMM)

2019-10-02 Thread Joel Fernandes (Google)
Quite interested in the LKMM, I have submitted patches before and used
it a lot. I would like to be a part of the maintainers for this project.

Cc: Paul McKenney 
Suggested-by: Alan Stern 
Signed-off-by: Joel Fernandes (Google) 

---
 MAINTAINERS | 1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 296de2b51c83..ecf6d265a88d 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9473,6 +9473,7 @@ M:David Howells 
 M: Jade Alglave 
 M: Luc Maranget 
 M: "Paul E. McKenney" 
+M: Joel Fernandes 
 R: Akira Yokosawa 
 R: Daniel Lustig 
 L: linux-kernel@vger.kernel.org
-- 
2.23.0.444.g18eeb5a265-goog



[PATCH v3] mm: emit tracepoint when RSS changes

2019-10-01 Thread Joel Fernandes (Google)
Useful to track how RSS is changing per TGID to detect spikes in RSS and
memory hogs. Several Android teams have been using this patch in various
kernel trees for half a year now. Many reported to me it is really
useful so I'm posting it upstream.

Initial patch developed by Tim Murray. Changes I made from original patch:
o Prevent any additional space consumed by mm_struct.

Regarding the fact that the RSS may change too often, thus flooding the
traces - note that there is already some "hysteresis" here. That is, we
update the counter only after 64 page faults, due to SPLIT_RSS_ACCOUNTING.
However, during zapping or copying of a pte range, the RSS is updated
immediately, which can become noisy/flooding. In a previous discussion, we
agreed that BPF or ftrace can be used to rate-limit the signal if this
becomes an issue.

Also note that I added wrappers to trace_rss_stat to prevent compiler
errors where linux/mm.h is included from tracing code, causing errors
such as:
  CC  kernel/trace/power-traces.o
In file included from ./include/trace/define_trace.h:102,
 from ./include/trace/events/kmem.h:342,
 from ./include/linux/mm.h:31,
 from ./include/linux/ring_buffer.h:5,
 from ./include/linux/trace_events.h:6,
 from ./include/trace/events/power.h:12,
 from kernel/trace/power-traces.c:15:
./include/trace/trace_events.h:113:22: error: field ‘ent’ has incomplete type
   struct trace_entry ent;\

Link: http://lore.kernel.org/r/20190903200905.198642-1-j...@joelfernandes.org
Acked-by: Michal Hocko 
Co-developed-by: Tim Murray 
Signed-off-by: Tim Murray 
Signed-off-by: Joel Fernandes (Google) 

---

v2->v3: Removed optimization for rate limitting and we can do so from
tracing code.
Added Michal's ack after private discussion.

v1->v2: Added more commit message.

Cc: carmenjack...@google.com
Cc: mayankgu...@google.com
Cc: dan...@google.com
Cc: rost...@goodmis.org
Cc: minc...@kernel.org
Cc: a...@linux-foundation.org
Cc: kernel-t...@android.com

 include/linux/mm.h  | 14 +++---
 include/trace/events/kmem.h | 21 +
 mm/memory.c |  6 ++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..fb8619c5a87d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1671,19 +1671,27 @@ static inline unsigned long get_mm_counter(struct 
mm_struct *mm, int member)
return (unsigned long)val;
 }
 
+void mm_trace_rss_stat(int member, long count);
+
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-   atomic_long_add(value, &mm->rss_stat.count[member]);
+   long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_inc(&mm->rss_stat.count[member]);
+   long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_dec(&mm->rss_stat.count[member]);
+   long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count);
 }
 
 /* Optimized variant when page is already known not to be PageAnon */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index eb57e3037deb..8b88e04fafbf 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -315,6 +315,27 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->change_ownership)
 );
 
+TRACE_EVENT(rss_stat,
+
+   TP_PROTO(int member,
+   long count),
+
+   TP_ARGS(member, count),
+
+   TP_STRUCT__entry(
+   __field(int, member)
+   __field(long, size)
+   ),
+
+   TP_fast_assign(
+   __entry->member = member;
+   __entry->size = (count << PAGE_SHIFT);
+   ),
+
+   TP_printk("member=%d size=%ldB",
+   __entry->member,
+   __entry->size)
+   );
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..4b31ac2fef42 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -140,6 +142,10 @@ static int __init init_zero_pfn(void)
 }
 core_initcall(init_zero_pfn);
 
+void mm_trace_rss_stat(int member, long count)
+{
+   trace_rss_stat(member, count);
+}
 
 #if defined(SPLIT_RSS_COUNTING)
 
-- 
2.23.0.444.g18eeb5a265-goog


[PATCH] binder: Fix comment headers on binder_alloc_prepare_to_free()

2019-09-30 Thread Joel Fernandes (Google)
binder_alloc_buffer_lookup() doesn't exist and is named
"binder_alloc_prepare_to_free()". Correct the code comments to reflect
this.

Signed-off-by: Joel Fernandes (Google) 

---
 drivers/android/binder_alloc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c
index 6d79a1b0d446..d42a8b2f636a 100644
--- a/drivers/android/binder_alloc.c
+++ b/drivers/android/binder_alloc.c
@@ -156,7 +156,7 @@ static struct binder_buffer 
*binder_alloc_prepare_to_free_locked(
 }
 
 /**
- * binder_alloc_buffer_lookup() - get buffer given user ptr
+ * binder_alloc_prepare_to_free() - get buffer given user ptr
  * @alloc: binder_alloc for this proc
  * @user_ptr:  User pointer to buffer data
  *
-- 
2.23.0.444.g18eeb5a265-goog



[PATCH v2] mm: emit tracepoint when RSS changes by threshold

2019-09-03 Thread Joel Fernandes (Google)
Useful to track how RSS is changing per TGID to detect spikes in RSS and
memory hogs. Several Android teams have been using this patch in various
kernel trees for half a year now. Many reported to me it is really
useful so I'm posting it upstream.

Initial patch developed by Tim Murray. Changes I made from original patch:
o Prevent any additional space consumed by mm_struct.
o Keep overhead low by checking if tracing is enabled.
o Add some noise reduction and lower overhead by emitting only on
  threshold changes.
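As a worked example of the threshold check added in mm_trace_rss_stat()
below (userspace, illustration only): with a 128-page threshold, the event
fires only when the counter crosses a 128-page boundary.

#include <stdio.h>

#define TRACE_MM_COUNTER_THRESHOLD 128

static int crosses_threshold(long count, long value)
{
	long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);

	/* Compare the "bucket" before and after the update. */
	return (count & thresh_mask) != ((count - value) & thresh_mask);
}

int main(void)
{
	printf("%d\n", crosses_threshold(127, 1));   /* 126 -> 127: no trace */
	printf("%d\n", crosses_threshold(128, 1));   /* 127 -> 128: trace    */
	printf("%d\n", crosses_threshold(131, 1));   /* 130 -> 131: no trace */
	printf("%d\n", crosses_threshold(120, -16)); /* 136 -> 120: trace    */
	return 0;
}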

Co-developed-by: Tim Murray 
Signed-off-by: Tim Murray 
Signed-off-by: Joel Fernandes (Google) 

---

v1->v2: Added more commit message.

Cc: carmenjack...@google.com
Cc: mayankgu...@google.com
Cc: dan...@google.com
Cc: rost...@goodmis.org
Cc: minc...@kernel.org
Cc: a...@linux-foundation.org
Cc: kernel-t...@android.com

 include/linux/mm.h  | 14 +++---
 include/trace/events/kmem.h | 21 +
 mm/memory.c | 20 
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..823aaf759bdb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1671,19 +1671,27 @@ static inline unsigned long get_mm_counter(struct 
mm_struct *mm, int member)
return (unsigned long)val;
 }
 
+void mm_trace_rss_stat(int member, long count, long value);
+
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-   atomic_long_add(value, &mm->rss_stat.count[member]);
+   long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, value);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_inc(&mm->rss_stat.count[member]);
+   long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, 1);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_dec(&mm->rss_stat.count[member]);
+   long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, -1);
 }
 
 /* Optimized variant when page is already known not to be PageAnon */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index eb57e3037deb..8b88e04fafbf 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -315,6 +315,27 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->change_ownership)
 );
 
+TRACE_EVENT(rss_stat,
+
+   TP_PROTO(int member,
+   long count),
+
+   TP_ARGS(member, count),
+
+   TP_STRUCT__entry(
+   __field(int, member)
+   __field(long, size)
+   ),
+
+   TP_fast_assign(
+   __entry->member = member;
+   __entry->size = (count << PAGE_SHIFT);
+   ),
+
+   TP_printk("member=%d size=%ldB",
+   __entry->member,
+   __entry->size)
+   );
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..9d81322c24a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -140,6 +142,24 @@ static int __init init_zero_pfn(void)
 }
 core_initcall(init_zero_pfn);
 
+/*
+ * This threshold is the boundary in the value space, that the counter has to
+ * advance before we trace it. Should be a power of 2. It is to reduce unwanted
+ * trace overhead. The counter is in units of number of pages.
+ */
+#define TRACE_MM_COUNTER_THRESHOLD 128
+
+void mm_trace_rss_stat(int member, long count, long value)
+{
+   long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
+
+   if (!trace_rss_stat_enabled())
+   return;
+
+   /* Threshold roll-over, trace it */
+   if ((count & thresh_mask) != ((count - value) & thresh_mask))
+   trace_rss_stat(member, count);
+}
 
 #if defined(SPLIT_RSS_COUNTING)
 
-- 
2.23.0.187.g17f5b7556c-goog


[PATCH] mm: emit tracepoint when RSS changes by threshold

2019-09-03 Thread Joel Fernandes (Google)
Useful to track how RSS is changing per TGID. Several Android teams have
been using this patch in various kernel trees for half a year now. Many
reported to me it is really useful.

Initial patch developed by Tim Murray. Changes I made from original patch:
o Prevent any additional space consumed by mm_struct.
o Keep overhead low by checking if tracing is enabled.
o Add some noise reduction and lower overhead by emitting only on
  threshold changes.

Co-developed-by: Tim Murray 
Signed-off-by: Tim Murray 
Signed-off-by: Joel Fernandes (Google) 

---

Cc: carmenjack...@google.com
Cc: mayankgu...@google.com
Cc: dan...@google.com
Cc: rost...@goodmis.org
Cc: minc...@kernel.org
Cc: a...@linux-foundation.org
Cc: kernel-t...@android.com

 include/linux/mm.h  | 14 +++---
 include/trace/events/kmem.h | 21 +
 mm/memory.c | 20 
 3 files changed, 52 insertions(+), 3 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0334ca97c584..823aaf759bdb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1671,19 +1671,27 @@ static inline unsigned long get_mm_counter(struct 
mm_struct *mm, int member)
return (unsigned long)val;
 }
 
+void mm_trace_rss_stat(int member, long count, long value);
+
 static inline void add_mm_counter(struct mm_struct *mm, int member, long value)
 {
-   atomic_long_add(value, &mm->rss_stat.count[member]);
+   long count = atomic_long_add_return(value, &mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, value);
 }
 
 static inline void inc_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_inc(&mm->rss_stat.count[member]);
+   long count = atomic_long_inc_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, 1);
 }
 
 static inline void dec_mm_counter(struct mm_struct *mm, int member)
 {
-   atomic_long_dec(&mm->rss_stat.count[member]);
+   long count = atomic_long_dec_return(&mm->rss_stat.count[member]);
+
+   mm_trace_rss_stat(member, count, -1);
 }
 
 /* Optimized variant when page is already known not to be PageAnon */
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index eb57e3037deb..8b88e04fafbf 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -315,6 +315,27 @@ TRACE_EVENT(mm_page_alloc_extfrag,
__entry->change_ownership)
 );
 
+TRACE_EVENT(rss_stat,
+
+   TP_PROTO(int member,
+   long count),
+
+   TP_ARGS(member, count),
+
+   TP_STRUCT__entry(
+   __field(int, member)
+   __field(long, size)
+   ),
+
+   TP_fast_assign(
+   __entry->member = member;
+   __entry->size = (count << PAGE_SHIFT);
+   ),
+
+   TP_printk("member=%d size=%ldB",
+   __entry->member,
+   __entry->size)
+   );
 #endif /* _TRACE_KMEM_H */
 
 /* This part must be outside protection */
diff --git a/mm/memory.c b/mm/memory.c
index e2bb51b6242e..9d81322c24a3 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -72,6 +72,8 @@
 #include 
 #include 
 
+#include 
+
 #include 
 #include 
 #include 
@@ -140,6 +142,24 @@ static int __init init_zero_pfn(void)
 }
 core_initcall(init_zero_pfn);
 
+/*
+ * This threshold is the boundary in the value space, that the counter has to
+ * advance before we trace it. Should be a power of 2. It is to reduce unwanted
+ * trace overhead. The counter is in units of number of pages.
+ */
+#define TRACE_MM_COUNTER_THRESHOLD 128
+
+void mm_trace_rss_stat(int member, long count, long value)
+{
+   long thresh_mask = ~(TRACE_MM_COUNTER_THRESHOLD - 1);
+
+   if (!trace_rss_stat_enabled())
+   return;
+
+   /* Threshold roll-over, trace it */
+   if ((count & thresh_mask) != ((count - value) & thresh_mask))
+   trace_rss_stat(member, count);
+}
 
 #if defined(SPLIT_RSS_COUNTING)
 
-- 
2.23.0.187.g17f5b7556c-goog


[PATCH 1/2] pci: Convert to use built-in RCU list checking

2019-08-30 Thread Joel Fernandes (Google)
With CONFIG_PROVE_RCU_LIST, list_for_each_entry_rcu() must be passed a
lockdep expression when SRCU or locking is used for protection. It can only
check regular RCU protection automatically; all other forms of protection
need to be passed in as a lockdep expression.
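
For illustration only, the general pattern looks like this; the item
structure, list, and lock below are hypothetical and not part of this patch:

  #include <linux/list.h>
  #include <linux/rculist.h>
  #include <linux/spinlock.h>

  struct item {
          int val;
          struct list_head node;
  };

  static LIST_HEAD(item_list);            /* updated under item_lock */
  static DEFINE_SPINLOCK(item_lock);

  static int sum_items(void)
  {
          struct item *it;
          int sum = 0;

          /* The fourth argument tells CONFIG_PROVE_RCU_LIST what protects
           * the list when we are not inside an rcu_read_lock() section.
           */
          list_for_each_entry_rcu(it, &item_list, node,
                                  lockdep_is_held(&item_lock))
                  sum += it->val;

          return sum;
  }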

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/pci/controller/vmd.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c
index 4575e0c6dc4b..127631d0c6da 100644
--- a/drivers/pci/controller/vmd.c
+++ b/drivers/pci/controller/vmd.c
@@ -718,7 +718,8 @@ static irqreturn_t vmd_irq(int irq, void *data)
int idx;
 
idx = srcu_read_lock(>srcu);
-   list_for_each_entry_rcu(vmdirq, >irq_list, node)
+   list_for_each_entry_rcu(vmdirq, >irq_list, node,
+   srcu_read_lock_held(>srcu))
generic_handle_irq(vmdirq->virq);
srcu_read_unlock(>srcu, idx);
 
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH 2/2] ipc/sem: Convert to use built-in RCU list checking

2019-08-30 Thread Joel Fernandes (Google)
With CONFIG_PROVE_RCU_LIST, list_for_each_entry_rcu() must be passed a
lockdep expression when SRCU or locking is used for protection. It can only
check regular RCU protection automatically; all other forms of protection
need to be passed in as a lockdep expression.

Signed-off-by: Joel Fernandes (Google) 
---
 ipc/sem.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ipc/sem.c b/ipc/sem.c
index 7da4504bcc7c..ec97a7072413 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1852,7 +1852,8 @@ static struct sem_undo *__lookup_undo(struct sem_undo_list *ulp, int semid)
 {
struct sem_undo *un;
 
-   list_for_each_entry_rcu(un, &ulp->list_proc, list_proc) {
+   list_for_each_entry_rcu(un, &ulp->list_proc, list_proc,
+   spin_is_locked(&ulp->lock)) {
if (un->semid == semid)
return un;
}
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH v2 -rcu dev 2/5] rcu/tree: Add multiple in-flight batches of kfree_rcu work

2019-08-30 Thread Joel Fernandes (Google)
During testing, it was observed that the amount of memory consumed due to
kfree_rcu() batching is 300-400MB. Previously we had only a single
head_free pointer pointing to the list of rcu_head(s) that are to be
freed after a grace period. Until this list is drained, we cannot queue
any more objects on it, since such objects may not be ready to be
reclaimed when the worker thread eventually gets to draining the
head_free list.

We can do better by maintaining multiple lists, as done by this patch.
Testing shows that memory consumption came down by around 100-150MB just
by adding another list. Adding more than one additional list did not
show any further improvement.
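
For illustration only, the core idea is an array of batch slots so that a new
batch can be queued while earlier ones are still waiting out their grace
period. A minimal user-space sketch of the slot selection (all names here are
made up):

  #include <stddef.h>
  #include <stdio.h>

  #define N_BATCHES 2                     /* mirrors KFREE_N_BATCHES */

  struct batch {
          void *head_free;                /* non-NULL while the batch is in flight */
  };

  static struct batch batches[N_BATCHES];

  /* Return the first free slot, or NULL if every batch is still in flight. */
  static struct batch *pick_free_batch(void)
  {
          for (size_t i = 0; i < N_BATCHES; i++)
                  if (!batches[i].head_free)
                          return &batches[i];
          return NULL;                    /* caller retries after KFREE_DRAIN_JIFFIES */
  }

  int main(void)
  {
          printf("first free slot: %p\n", (void *)pick_free_batch());
          return 0;
  }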

Suggested-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 80 +--
 1 file changed, 56 insertions(+), 24 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 68ebf0eb64c8..2e1772469de9 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2688,28 +2688,37 @@ EXPORT_SYMBOL_GPL(call_rcu);
 
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
+#define KFREE_N_BATCHES 2
+
+struct kfree_rcu_work {
+   /* The rcu_work node for queuing work with queue_rcu_work(). The work
+* is done after a grace period.
+*/
+   struct rcu_work rcu_work;
+
+   /* The list of objects that have now left ->head and are queued for
+* freeing after a grace period.
+*/
+   struct rcu_head *head_free;
+
+   struct kfree_rcu_cpu *krcp;
+};
 
 /*
  * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
  * kfree(s) is queued for freeing after a grace period, right away.
  */
 struct kfree_rcu_cpu {
-   /* The rcu_work node for queuing work with queue_rcu_work(). The work
-* is done after a grace period.
-*/
-   struct rcu_work rcu_work;
 
/* The list of objects being queued in a batch but are not yet
 * scheduled to be freed.
 */
struct rcu_head *head;
 
-   /* The list of objects that have now left ->head and are queued for
-* freeing after a grace period.
-*/
-   struct rcu_head *head_free;
+   /* Array of kfree_rcu_work structures, one per in-flight batch. */
+   struct kfree_rcu_work krw_arr[KFREE_N_BATCHES];
 
-   /* Protect concurrent access to this structure. */
+   /* Protect concurrent access to this structure and kfree_rcu_work. */
spinlock_t lock;
 
/* The delayed work that flushes ->head to ->head_free incase ->head
@@ -2717,7 +2726,7 @@ struct kfree_rcu_cpu {
 * is busy, ->head just continues to grow and we retry flushing later.
 */
struct delayed_work monitor_work;
-   int monitor_todo;   /* Is a delayed work pending execution? */
+   bool monitor_todo;  /* Is a delayed work pending execution? */
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
@@ -2730,12 +2739,15 @@ static void kfree_rcu_work(struct work_struct *work)
 {
unsigned long flags;
struct rcu_head *head, *next;
-   struct kfree_rcu_cpu *krcp = container_of(to_rcu_work(work),
-   struct kfree_rcu_cpu, rcu_work);
+   struct kfree_rcu_work *krwp = container_of(to_rcu_work(work),
+   struct kfree_rcu_work, rcu_work);
+   struct kfree_rcu_cpu *krcp;
+
+   krcp = krwp->krcp;
 
spin_lock_irqsave(>lock, flags);
-   head = krcp->head_free;
-   krcp->head_free = NULL;
+   head = krwp->head_free;
+   krwp->head_free = NULL;
spin_unlock_irqrestore(>lock, flags);
 
/*
@@ -2758,19 +2770,30 @@ static void kfree_rcu_work(struct work_struct *work)
  */
 static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
+   int i = 0;
+   struct kfree_rcu_work *krwp = NULL;
+
lockdep_assert_held(>lock);
+   while (i < KFREE_N_BATCHES) {
+   if (!krcp->krw_arr[i].head_free) {
+   krwp = &(krcp->krw_arr[i]);
+   break;
+   }
+   i++;
+   }
 
-   /* If a previous RCU batch work is already in progress, we cannot queue
+   /*
+* If both RCU batches are already in progress, we cannot queue
 * another one, just refuse the optimization and it will be retried
 * again in KFREE_DRAIN_JIFFIES time.
 */
-   if (krcp->head_free)
+   if (!krwp)
return false;
 
-   krcp->head_free = krcp->head;
+   krwp->head_free = krcp->head;
krcp->head = NULL;
-   INIT_RCU_WORK(>rcu_work, kfree_rcu_work);
-   queue_rcu_work(system_wq, >rcu_work);
+   INIT_RCU_WORK(>rcu_work, kfree_rcu_work);
+   queue_rcu_work(system_wq, >rcu_work);
 
return true;
 

[PATCH v2 -rcu dev 0/5] kfree_rcu() additions for -rcu

2019-08-30 Thread Joel Fernandes (Google)
Hi,

This is a series on top of the patch "rcu/tree: Add basic support for 
kfree_rcu() batching".

It adds performance tests, some clean ups and removal of "lazy" RCU callbacks.

Now that kfree_rcu() is handled separately from call_rcu(), we also get rid of
the kfree "lazy" handling from tree RCU, as suggested by Paul, since it will
now be unused.

Based on patch:
Link: http://lore.kernel.org/r/20190814160411.58591-1-j...@joelfernandes.org


v1 series:
https://lkml.org/lkml/2019/8/27/1315
https://lore.kernel.org/patchwork/project/lkml/list/?series=408218

Joel Fernandes (Google) (5):
rcu/rcuperf: Add kfree_rcu() performance Tests
rcu/tree: Add multiple in-flight batches of kfree_rcu work
rcu/tree: Add support for debug_objects debugging for kfree_rcu()
rcu: Remove kfree_rcu() special casing and lazy handling
rcu: Remove kfree_call_rcu_nobatch()

Documentation/RCU/stallwarn.txt   |  11 +-
.../admin-guide/kernel-parameters.txt |  13 ++
include/linux/rcu_segcblist.h |   2 -
include/linux/rcutiny.h   |   5 -
include/linux/rcutree.h   |   1 -
include/trace/events/rcu.h|  32 ++--
kernel/rcu/rcu.h  |  27 ---
kernel/rcu/rcu_segcblist.c|  25 +--
kernel/rcu/rcu_segcblist.h|  25 +--
kernel/rcu/rcuperf.c  | 173 +-
kernel/rcu/srcutree.c |   4 +-
kernel/rcu/tiny.c |  29 ++-
kernel/rcu/tree.c | 155 ++--
kernel/rcu/tree.h |   1 -
kernel/rcu/tree_plugin.h  |  48 ++---
kernel/rcu/tree_stall.h   |   6 +-
16 files changed, 343 insertions(+), 214 deletions(-)

--
2.23.0.187.g17f5b7556c-goog



[PATCH v2 -rcu dev 5/5] rcu: Remove kfree_call_rcu_nobatch()

2019-08-30 Thread Joel Fernandes (Google)
Now that kfree_rcu() special casing has been removed from tree RCU,
remove kfree_call_rcu_nobatch() since it is no longer needed.

Signed-off-by: Joel Fernandes (Google) 
---
 .../admin-guide/kernel-parameters.txt |  4 ---
 include/linux/rcutiny.h   |  5 ---
 include/linux/rcutree.h   |  1 -
 kernel/rcu/rcuperf.c  | 10 +-
 kernel/rcu/tree.c | 34 ---
 5 files changed, 15 insertions(+), 39 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 24fe8aefb12c..56be0e30100b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3909,10 +3909,6 @@
Number of loops doing rcuperf.kfree_alloc_num number
of allocations and frees.
 
-   rcuperf.kfree_no_batch= [KNL]
-   Use the non-batching (less efficient) version of 
kfree_rcu().
-   This is useful for comparing with the batched version.
-
rcuperf.nreaders= [KNL]
Set number of RCU readers.  The value -1 selects
N, where N is the number of CPUs.  A value
diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b7607e2667ae..37b6f0c2b79d 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,11 +39,6 @@ static inline void kfree_call_rcu(struct rcu_head *head, 
rcu_callback_t func)
call_rcu(head, func);
 }
 
-static inline void kfree_call_rcu_nobatch(struct rcu_head *head, 
rcu_callback_t func)
-{
-   call_rcu(head, func);
-}
-
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 961b7e05d141..0b68aa952f8b 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,7 +34,6 @@ static inline void rcu_virt_note_context_switch(int cpu)
 
 void synchronize_rcu_expedited(void);
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
-void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index c1e25fd10f2a..da94b89cd531 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -593,7 +593,6 @@ rcu_perf_shutdown(void *arg)
 torture_param(int, kfree_nthreads, -1, "Number of threads running loops of 
kfree_rcu().");
 torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees 
done in an iteration.");
 torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num 
allocations and frees.");
-torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version 
of kfree_rcu().");
 
 static struct task_struct **kfree_reader_tasks;
 static int kfree_nrealthreads;
@@ -632,14 +631,7 @@ kfree_perf_thread(void *arg)
if (!alloc_ptr)
return -ENOMEM;
 
-   if (!kfree_no_batch) {
-   kfree_rcu(alloc_ptr, rh);
-   } else {
-   rcu_callback_t cb;
-
-   cb = (rcu_callback_t)(unsigned 
long)offsetof(struct kfree_obj, rh);
-   kfree_call_rcu_nobatch(&(alloc_ptr->rh), cb);
-   }
+   kfree_rcu(alloc_ptr, rh);
}
 
cond_resched();
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index cd7bbc74ae20..72550343843f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2770,8 +2770,10 @@ static void kfree_rcu_work(struct work_struct *work)
rcu_lock_acquire(_callback_map);
trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
-   /* Could be possible to optimize with kfree_bulk in future */
-   kfree((void *)head - offset);
+   if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset))) {
+   /* Could be optimized with kfree_bulk() in future. */
+   kfree((void *)head - offset);
+   }
 
rcu_lock_release(_callback_map);
cond_resched_tasks_rcu_qs();
@@ -2857,16 +2859,6 @@ static void kfree_rcu_monitor(struct work_struct *work)
}
 }
 
-/*
- * This version of kfree_call_rcu does not do batching of kfree_rcu() requests.
- * Used only by rcuperf torture test for comparison with kfree_rcu_batch().
- */
-void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
-{
-   __call_rcu(head, func);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu_nobatch);
-
 /*
  * Queue a request for lazy invocation of kfree() after a grace period.
  *
@@ -2886,12 +2878,6 @@ void kfree_call_rcu(struct rcu_head *head, 
r

[PATCH v2 -rcu dev 3/5] rcu/tree: Add support for debug_objects debugging for kfree_rcu()

2019-08-30 Thread Joel Fernandes (Google)
Make use of RCU's debug_objects debugging support
(CONFIG_DEBUG_OBJECTS_RCU_HEAD) similar to call_rcu() and other flavors.
We queue the object during the kfree_rcu() call and dequeue it during
reclaim.

Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects
double kfree_rcu() calls.
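
For illustration (hypothetical code, not from this patch), the kind of bug
this now catches looks like the following:

  #include <linux/rcupdate.h>
  #include <linux/slab.h>

  struct foo {
          int x;
          struct rcu_head rh;
  };

  /* Buggy caller: the same object is handed to kfree_rcu() twice. With
   * CONFIG_DEBUG_OBJECTS_RCU_HEAD, debug_rcu_head_queue() in
   * kfree_call_rcu() flags the second call with a WARN_ONCE().
   */
  static void buggy_double_free(struct foo *p)
  {
          kfree_rcu(p, rh);
          kfree_rcu(p, rh);               /* double free, now detected */
  }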

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 2e1772469de9..de13805d1bd0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2757,6 +2757,7 @@ static void kfree_rcu_work(struct work_struct *work)
for (; head; head = next) {
next = head->next;
/* Could be possible to optimize with kfree_bulk in future */
+   debug_rcu_head_unqueue(head);
__rcu_reclaim(rcu_state.name, head);
cond_resched_tasks_rcu_qs();
}
@@ -2876,6 +2877,13 @@ void kfree_call_rcu(struct rcu_head *head, 
rcu_callback_t func)
if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
return kfree_call_rcu_nobatch(head, func);
 
+   if (debug_rcu_head_queue(head)) {
+   /* Probable double kfree_rcu() */
+   WARN_ONCE(1, "kfree_call_rcu(): Double-freed call. rcu_head 
%p\n",
+ head);
+   return;
+   }
+
head->func = func;
 
local_irq_save(flags);  /* For safely calling this_cpu_ptr(). */
-- 
2.23.0.187.g17f5b7556c-goog



[PATCH v2 -rcu dev 4/5] rcu: Remove kfree_rcu() special casing and lazy handling

2019-08-30 Thread Joel Fernandes (Google)
Remove kfree_rcu() special casing and lazy handling from RCU.
For Tiny RCU, the special handling is folded into the Tiny RCU code itself.

This results in a nice negative delta as well.

Suggested-by: Paul E. McKenney 
Signed-off-by: Joel Fernandes (Google) 
---
 Documentation/RCU/stallwarn.txt | 11 +++-
 include/linux/rcu_segcblist.h   |  2 --
 include/trace/events/rcu.h  | 32 +-
 kernel/rcu/rcu.h| 27 ---
 kernel/rcu/rcu_segcblist.c  | 25 +++--
 kernel/rcu/rcu_segcblist.h  | 25 ++---
 kernel/rcu/srcutree.c   |  4 +--
 kernel/rcu/tiny.c   | 29 +++-
 kernel/rcu/tree.c   | 41 +++-
 kernel/rcu/tree.h   |  1 -
 kernel/rcu/tree_plugin.h| 48 -
 kernel/rcu/tree_stall.h |  6 ++---
 12 files changed, 91 insertions(+), 160 deletions(-)

diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index f48f4621ccbc..a360a8796710 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -225,18 +225,13 @@ an estimate of the total number of RCU callbacks queued 
across all CPUs
 In kernels with CONFIG_RCU_FAST_NO_HZ, more information is printed
 for each CPU:
 
-   0: (64628 ticks this GP) idle=dd5/3fff/0 softirq=82/543 
last_accelerate: a345/d342 Nonlazy posted: ..D
+   0: (64628 ticks this GP) idle=dd5/3fff/0 softirq=82/543 
last_accelerate: a345/d342 dyntick_enabled: 1
 
 The "last_accelerate:" prints the low-order 16 bits (in hex) of the
 jiffies counter when this CPU last invoked rcu_try_advance_all_cbs()
 from rcu_needs_cpu() or last invoked rcu_accelerate_cbs() from
-rcu_prepare_for_idle().  The "Nonlazy posted:" indicates lazy-callback
-status, so that an "l" indicates that all callbacks were lazy at the start
-of the last idle period and an "L" indicates that there are currently
-no non-lazy callbacks (in both cases, "." is printed otherwise, as
-shown above) and "D" indicates that dyntick-idle processing is enabled
-("." is printed otherwise, for example, if disabled via the "nohz="
-kernel boot parameter).
+rcu_prepare_for_idle(). "dyntick_enabled: 1" indicates that dyntick-idle
+processing is enabled.
 
 If the grace period ends just as the stall warning starts printing,
 there will be a spurious stall-warning message, which will include
diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h
index 646759042333..b36afe7b22c9 100644
--- a/include/linux/rcu_segcblist.h
+++ b/include/linux/rcu_segcblist.h
@@ -22,7 +22,6 @@ struct rcu_cblist {
struct rcu_head *head;
struct rcu_head **tail;
long len;
-   long len_lazy;
 };
 
 #define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail =  }
@@ -73,7 +72,6 @@ struct rcu_segcblist {
 #else
long len;
 #endif
-   long len_lazy;
u8 enabled;
u8 offloaded;
 };
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..4ab16fcda895 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -481,16 +481,14 @@ TRACE_EVENT_RCU(rcu_dyntick,
  */
 TRACE_EVENT_RCU(rcu_callback,
 
-   TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen_lazy,
-long qlen),
+   TP_PROTO(const char *rcuname, struct rcu_head *rhp, long qlen),
 
-   TP_ARGS(rcuname, rhp, qlen_lazy, qlen),
+   TP_ARGS(rcuname, rhp, qlen),
 
TP_STRUCT__entry(
__field(const char *, rcuname)
__field(void *, rhp)
__field(void *, func)
-   __field(long, qlen_lazy)
__field(long, qlen)
),
 
@@ -498,13 +496,12 @@ TRACE_EVENT_RCU(rcu_callback,
__entry->rcuname = rcuname;
__entry->rhp = rhp;
__entry->func = rhp->func;
-   __entry->qlen_lazy = qlen_lazy;
__entry->qlen = qlen;
),
 
-   TP_printk("%s rhp=%p func=%ps %ld/%ld",
+   TP_printk("%s rhp=%p func=%ps %ld",
  __entry->rcuname, __entry->rhp, __entry->func,
- __entry->qlen_lazy, __entry->qlen)
+ __entry->qlen)
 );
 
 /*
@@ -518,15 +515,14 @@ TRACE_EVENT_RCU(rcu_callback,
 TRACE_EVENT_RCU(rcu_kfree_callback,
 
TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long 
offset,
-long qlen_lazy, long qlen),
+long qlen),
 
-   TP_ARGS(rcuname, rhp, offset, qlen_lazy, qlen),
+   TP_ARGS(rcuname, rhp, offset, qlen),
 
TP_STRUCT__entry(
__field(const char *, rcuname)
__field(void *, rhp)
__field(unsigned long, offset)
-   __field(long, qlen

[PATCH v2 -rcu dev 1/5] rcu/rcuperf: Add kfree_rcu() performance Tests

2019-08-30 Thread Joel Fernandes (Google)
This test runs kfree_rcu() in a loop to measure performance of the new
kfree_rcu() batching functionality.

The following table shows results when booting with arguments:
rcuperf.kfree_loops=2 rcuperf.kfree_alloc_num=8000
rcuperf.kfree_rcu_test=1 rcuperf.kfree_no_batch=X

rcuperf.kfree_no_batch=X    # Grace Periods    Test Duration (s)
  X=1 (old behavior)        9133               11.5
  X=0 (new behavior)        1732               12.5

On a 16 CPU system with the above boot parameters, we see that the total
number of grace periods that elapse during the test drops from 9133 when
not batching to 1732 when batching (a 5X improvement). The kfree_rcu()
flood itself slows down a bit when batching, though, as shown.

Note that the active memory consumption during the kfree_rcu() flood
does increase to around 200-250MB due to the batching (from around 50MB
without batching). However, this memory consumption is relatively
constant. In other words, the system is able to keep up with the
kfree_rcu() load. The memory consumption comes down considerably if
KFREE_DRAIN_JIFFIES is reduced from HZ/50 to HZ/80, so that batches are
drained more often. A later patch will reduce memory consumption further
by using multiple lists.

Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and
CONFIG_PROVE_RCU for realistic comparisons with/without batching.

Signed-off-by: Joel Fernandes (Google) 
---
 .../admin-guide/kernel-parameters.txt |  17 ++
 kernel/rcu/rcuperf.c  | 181 +-
 2 files changed, 190 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 79b983bedcaa..24fe8aefb12c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3896,6 +3896,23 @@
test until boot completes in order to avoid
interference.
 
+   rcuperf.kfree_rcu_test= [KNL]
+   Set to measure performance of kfree_rcu() flooding.
+
+   rcuperf.kfree_nthreads= [KNL]
+   The number of threads running loops of kfree_rcu().
+
+   rcuperf.kfree_alloc_num= [KNL]
+   Number of allocations and frees done in an iteration.
+
+   rcuperf.kfree_loops= [KNL]
+   Number of loops doing rcuperf.kfree_alloc_num number
+   of allocations and frees.
+
+   rcuperf.kfree_no_batch= [KNL]
+   Use the non-batching (less efficient) version of 
kfree_rcu().
+   This is useful for comparing with the batched version.
+
rcuperf.nreaders= [KNL]
Set number of RCU readers.  The value -1 selects
N, where N is the number of CPUs.  A value
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 5f884d560384..c1e25fd10f2a 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
  "Shutdown at end of performance tests.");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to 
disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?");
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
 static wait_queue_head_t shutdown_wq;
 static u64 t_rcu_perf_writer_started;
 static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
 static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 #define MAX_MEAS 1
@@ -378,10 +379,10 @@ rcu_perf_writer(void *arg)
if (atomic_inc_return(_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
if (gp_exp) {
-   b_rcu_perf_writer_started =
+   b_rcu_gp_test_started =
cur_ops->exp_completed() / 2;
} else {
-   b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+   b_rcu_gp_test_started = cur_ops->get_gp_seq();
}
}
 
@@ -429,10 +430,10 @@ rcu_perf_writer(void *arg)
PERFOUT_STRING("Test complete");
t_rcu_perf_writer_finished = t;
if (gp_exp) {
-   b_rcu_perf_writer_finished =
+   b_rcu_gp_test_finished =

[PATCH v2 -rcu dev 2/2] rcu/dyntick-idle: Add better tracing

2019-08-30 Thread Joel Fernandes (Google)
The dyntick-idle traces are a bit confusing. This patch makes them simpler
and adds some missing cases, such as EQS-enter due to user vs. idle mode.

Following are the changes:
(1) Add a new context field to trace_rcu_dyntick tracepoint. This
context field can be "USER", "IDLE" or "IRQ".

(2) Remove the "++=" and "--=" strings and replace them with
   "StillNonIdle". This is much easier on the eyes, and the -- and ++
   are easily apparent in the dynticks_nesting counters we are printing
   anyway.

Signed-off-by: Joel Fernandes (Google) 
---
 include/trace/events/rcu.h | 13 -
 kernel/rcu/tree.c  | 19 +--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..474c1f7e7104 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -449,12 +449,14 @@ TRACE_EVENT_RCU(rcu_fqs,
  */
 TRACE_EVENT_RCU(rcu_dyntick,
 
-   TP_PROTO(const char *polarity, long oldnesting, long newnesting, 
atomic_t dynticks),
+   TP_PROTO(const char *polarity, const char *context, long oldnesting,
+long newnesting, atomic_t dynticks),
 
-   TP_ARGS(polarity, oldnesting, newnesting, dynticks),
+   TP_ARGS(polarity, context, oldnesting, newnesting, dynticks),
 
TP_STRUCT__entry(
__field(const char *, polarity)
+   __field(const char *, context)
__field(long, oldnesting)
__field(long, newnesting)
__field(int, dynticks)
@@ -462,14 +464,15 @@ TRACE_EVENT_RCU(rcu_dyntick,
 
TP_fast_assign(
__entry->polarity = polarity;
+   __entry->context = context;
__entry->oldnesting = oldnesting;
__entry->newnesting = newnesting;
__entry->dynticks = atomic_read();
),
 
-   TP_printk("%s %lx %lx %#3x", __entry->polarity,
- __entry->oldnesting, __entry->newnesting,
- __entry->dynticks & 0xfff)
+   TP_printk("%s %s %lx %lx %#3x", __entry->polarity,
+   __entry->context, __entry->oldnesting, __entry->newnesting,
+   __entry->dynticks & 0xfff)
 );
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 417dd00b9e87..463407762b5a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -533,7 +533,8 @@ static void rcu_eqs_enter(bool user)
}
 
lockdep_assert_irqs_disabled();
-   trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, 
rdp->dynticks);
+   trace_rcu_dyntick(TPS("Start"), (user ? TPS("USER") : TPS("IDLE")),
+ rdp->dynticks_nesting, 0, rdp->dynticks);
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && 
!is_idle_task(current));
rdp = this_cpu_ptr(_data);
do_nocb_deferred_wakeup(rdp);
@@ -606,14 +607,17 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 * leave it in non-RCU-idle state.
 */
if (rdp->dynticks_nmi_nesting != 1) {
-   trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, 
rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+   trace_rcu_dyntick(TPS("StillNonIdle"), TPS("IRQ"),
+ rdp->dynticks_nmi_nesting,
+ rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
   rdp->dynticks_nmi_nesting - 2);
return;
}
 
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-   trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, 
rdp->dynticks);
+   trace_rcu_dyntick(TPS("Start"), TPS("IRQ"), rdp->dynticks_nmi_nesting,
+ 0, rdp->dynticks);
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
if (irq)
@@ -700,7 +704,8 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
-   trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+   trace_rcu_dyntick(TPS("End"), (user ? TPS("USER") : TPS("IDLE")),
+ rdp->dynticks_nesting, 1, rdp->dynticks);
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && 
!is_idle_task(current));
WRITE_ONCE(rdp->dynticks_nesting, 1);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting);
@@ -787,9 +792,11 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 

[PATCH v2 -rcu dev 1/2] Revert b8c17e6664c4 ("rcu: Maintain special bits at bottom of ->dynticks counter")

2019-08-30 Thread Joel Fernandes (Google)
This code is unused and can be removed now. Revert was straightforward.

Tested with light rcutorture.

Link: 
http://lore.kernel.org/r/CALCETrWNPOOdTrFabTDd=h7+wc6xj9rjceg6ol1s0rtv5pf...@mail.gmail.com
Suggested-by: Andy Lutomirski 
Signed-off-by: Joel Fernandes (Google) 


---
Only made some commit message changes in this since v1.

 include/linux/rcutiny.h |  3 --
 kernel/rcu/tree.c   | 82 ++---
 2 files changed, 19 insertions(+), 66 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b7607e2667ae..b3f689711289 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,9 +14,6 @@
 
 #include  /* for HZ */
 
-/* Never flag non-existent other CPUs! */
-static inline bool rcu_eqs_special_set(int cpu) { return false; }
-
 static inline unsigned long get_state_synchronize_rcu(void)
 {
return 0;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 68ebf0eb64c8..417dd00b9e87 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -69,20 +69,10 @@
 
 /* Data structures. */
 
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control.  Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
-#ifndef rcu_eqs_special_exit
-#define rcu_eqs_special_exit() do { } while (0)
-#endif
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
-   .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+   .dynticks = ATOMIC_INIT(1),
 };
 struct rcu_state rcu_state = {
.level = { _state.node[0] },
@@ -229,20 +219,15 @@ void rcu_softirq_qs(void)
 static void rcu_dynticks_eqs_enter(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
-   int seq;
+   int special;
 
/*
-* CPUs seeing atomic_add_return() must see prior RCU read-side
+* CPUs seeing atomic_inc_return() must see prior RCU read-side
 * critical sections, and we also must force ordering with the
 * next idle sojourn.
 */
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, >dynticks);
-   /* Better be in an extended quiescent state! */
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_CTR));
-   /* Better not have special action (TLB flush) pending! */
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_MASK));
+   special = atomic_inc_return(>dynticks);
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
 }
 
 /*
@@ -252,22 +237,15 @@ static void rcu_dynticks_eqs_enter(void)
 static void rcu_dynticks_eqs_exit(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
-   int seq;
+   int special;
 
/*
-* CPUs seeing atomic_add_return() must see prior idle sojourns,
+* CPUs seeing atomic_inc_return() must see prior idle sojourns,
 * and we also must force ordering with the next RCU read-side
 * critical section.
 */
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, >dynticks);
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-!(seq & RCU_DYNTICK_CTRL_CTR));
-   if (seq & RCU_DYNTICK_CTRL_MASK) {
-   atomic_andnot(RCU_DYNTICK_CTRL_MASK, >dynticks);
-   smp_mb__after_atomic(); /* _exit after clearing mask. */
-   /* Prefer duplicate flushes to losing a flush. */
-   rcu_eqs_special_exit();
-   }
+   special = atomic_inc_return(>dynticks);
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
 }
 
 /*
@@ -284,9 +262,9 @@ static void rcu_dynticks_eqs_online(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
 
-   if (atomic_read(>dynticks) & RCU_DYNTICK_CTRL_CTR)
+   if (atomic_read(>dynticks) & 0x1)
return;
-   atomic_add(RCU_DYNTICK_CTRL_CTR, >dynticks);
+   atomic_add(0x1, >dynticks);
 }
 
 /*
@@ -298,7 +276,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
 
-   return !(atomic_read(>dynticks) & RCU_DYNTICK_CTRL_CTR);
+   return !(atomic_read(>dynticks) & 0x1);
 }
 
 /*
@@ -309,7 +287,7 @@ int rcu_dynticks_snap(struct rcu_data *rdp)
 {
int snap = atomic_add_return(0, >dynticks);
 
-   return snap & ~RCU_DYNTICK_CTRL_MASK;
+   return snap;
 }
 
 /*
@@ -318,7 +296,7 @@ int rcu_dynticks_snap(struct rcu_data *rdp)
  */
 static bool rcu_dynticks_in_eqs(int snap)
 {
-   return !(snap & RCU_DYNTICK_CTRL_CTR);
+   return !(snap & 0x1);
 }
 
 /*
@@ -331,28 +309,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data 
*rdp, int snap)
return sna

[PATCH -rcu dev 1/2] Revert b8c17e6664c4 ("rcu: Maintain special bits at bottom of ->dynticks counter")

2019-08-30 Thread Joel Fernandes (Google)
This code is unused and can be removed now. Revert was straightforward.

Tested with light rcutorture.

Link: 
http://lore.kernel.org/r/CALCETrWNPOOdTrFabTDd=h7+wc6xj9rjceg6ol1s0rtv5pf...@mail.gmail.com
Suggested-by: Andy Lutomirski 
Signed-off-by: Joel Fernandes (Google) 

---
 include/linux/rcutiny.h |  3 --
 kernel/rcu/tree.c   | 82 ++---
 2 files changed, 19 insertions(+), 66 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index b7607e2667ae..b3f689711289 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -14,9 +14,6 @@
 
 #include  /* for HZ */
 
-/* Never flag non-existent other CPUs! */
-static inline bool rcu_eqs_special_set(int cpu) { return false; }
-
 static inline unsigned long get_state_synchronize_rcu(void)
 {
return 0;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 68ebf0eb64c8..417dd00b9e87 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -69,20 +69,10 @@
 
 /* Data structures. */
 
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control.  Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
-#ifndef rcu_eqs_special_exit
-#define rcu_eqs_special_exit() do { } while (0)
-#endif
-
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
-   .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+   .dynticks = ATOMIC_INIT(1),
 };
 struct rcu_state rcu_state = {
.level = { _state.node[0] },
@@ -229,20 +219,15 @@ void rcu_softirq_qs(void)
 static void rcu_dynticks_eqs_enter(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
-   int seq;
+   int special;
 
/*
-* CPUs seeing atomic_add_return() must see prior RCU read-side
+* CPUs seeing atomic_inc_return() must see prior RCU read-side
 * critical sections, and we also must force ordering with the
 * next idle sojourn.
 */
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, >dynticks);
-   /* Better be in an extended quiescent state! */
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_CTR));
-   /* Better not have special action (TLB flush) pending! */
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-(seq & RCU_DYNTICK_CTRL_MASK));
+   special = atomic_inc_return(>dynticks);
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
 }
 
 /*
@@ -252,22 +237,15 @@ static void rcu_dynticks_eqs_enter(void)
 static void rcu_dynticks_eqs_exit(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
-   int seq;
+   int special;
 
/*
-* CPUs seeing atomic_add_return() must see prior idle sojourns,
+* CPUs seeing atomic_inc_return() must see prior idle sojourns,
 * and we also must force ordering with the next RCU read-side
 * critical section.
 */
-   seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, >dynticks);
-   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
-!(seq & RCU_DYNTICK_CTRL_CTR));
-   if (seq & RCU_DYNTICK_CTRL_MASK) {
-   atomic_andnot(RCU_DYNTICK_CTRL_MASK, >dynticks);
-   smp_mb__after_atomic(); /* _exit after clearing mask. */
-   /* Prefer duplicate flushes to losing a flush. */
-   rcu_eqs_special_exit();
-   }
+   special = atomic_inc_return(>dynticks);
+   WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1));
 }
 
 /*
@@ -284,9 +262,9 @@ static void rcu_dynticks_eqs_online(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
 
-   if (atomic_read(>dynticks) & RCU_DYNTICK_CTRL_CTR)
+   if (atomic_read(>dynticks) & 0x1)
return;
-   atomic_add(RCU_DYNTICK_CTRL_CTR, >dynticks);
+   atomic_add(0x1, >dynticks);
 }
 
 /*
@@ -298,7 +276,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
 
-   return !(atomic_read(>dynticks) & RCU_DYNTICK_CTRL_CTR);
+   return !(atomic_read(>dynticks) & 0x1);
 }
 
 /*
@@ -309,7 +287,7 @@ int rcu_dynticks_snap(struct rcu_data *rdp)
 {
int snap = atomic_add_return(0, >dynticks);
 
-   return snap & ~RCU_DYNTICK_CTRL_MASK;
+   return snap;
 }
 
 /*
@@ -318,7 +296,7 @@ int rcu_dynticks_snap(struct rcu_data *rdp)
  */
 static bool rcu_dynticks_in_eqs(int snap)
 {
-   return !(snap & RCU_DYNTICK_CTRL_CTR);
+   return !(snap & 0x1);
 }
 
 /*
@@ -331,28 +309,6 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_data 
*rdp, int snap)
return snap != rcu_dynticks_snap(rdp);
 }
 
-/*
- * Set the spe

[PATCH -rcu dev 2/2] rcu/dyntick-idle: Add better tracing

2019-08-30 Thread Joel Fernandes (Google)
The dyntick-idle traces are a bit confusing. This patch makes them simpler
and adds some missing cases, such as EQS-enter due to user vs. idle mode.

Following are the changes:
(1) Add a new context field to trace_rcu_dyntick tracepoint. This
context field can be "USER", "IDLE" or "IRQ".

(2) Remove the "++=" and "--=" strings and replace them with
   "StillNonIdle". This is much easier on the eyes, and the -- and ++
   are easily apparent in the dynticks_nesting counters we are printing
   anyway.

This patch is based on the previous patches to simplify rcu_dyntick
counters [1] and with these traces, I have verified the counters are
working properly.

[1]
Link: https://lore.kernel.org/patchwork/patch/1120021/
Link: https://lore.kernel.org/patchwork/patch/1120022/

Signed-off-by: Joel Fernandes (Google) 
---
 include/trace/events/rcu.h | 13 -
 kernel/rcu/tree.c  | 19 +--
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 66122602bd08..474c1f7e7104 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -449,12 +449,14 @@ TRACE_EVENT_RCU(rcu_fqs,
  */
 TRACE_EVENT_RCU(rcu_dyntick,
 
-   TP_PROTO(const char *polarity, long oldnesting, long newnesting, 
atomic_t dynticks),
+   TP_PROTO(const char *polarity, const char *context, long oldnesting,
+long newnesting, atomic_t dynticks),
 
-   TP_ARGS(polarity, oldnesting, newnesting, dynticks),
+   TP_ARGS(polarity, context, oldnesting, newnesting, dynticks),
 
TP_STRUCT__entry(
__field(const char *, polarity)
+   __field(const char *, context)
__field(long, oldnesting)
__field(long, newnesting)
__field(int, dynticks)
@@ -462,14 +464,15 @@ TRACE_EVENT_RCU(rcu_dyntick,
 
TP_fast_assign(
__entry->polarity = polarity;
+   __entry->context = context;
__entry->oldnesting = oldnesting;
__entry->newnesting = newnesting;
__entry->dynticks = atomic_read();
),
 
-   TP_printk("%s %lx %lx %#3x", __entry->polarity,
- __entry->oldnesting, __entry->newnesting,
- __entry->dynticks & 0xfff)
+   TP_printk("%s %s %lx %lx %#3x", __entry->polarity,
+   __entry->context, __entry->oldnesting, __entry->newnesting,
+   __entry->dynticks & 0xfff)
 );
 
 /*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 417dd00b9e87..463407762b5a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -533,7 +533,8 @@ static void rcu_eqs_enter(bool user)
}
 
lockdep_assert_irqs_disabled();
-   trace_rcu_dyntick(TPS("Start"), rdp->dynticks_nesting, 0, 
rdp->dynticks);
+   trace_rcu_dyntick(TPS("Start"), (user ? TPS("USER") : TPS("IDLE")),
+ rdp->dynticks_nesting, 0, rdp->dynticks);
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && 
!is_idle_task(current));
rdp = this_cpu_ptr(_data);
do_nocb_deferred_wakeup(rdp);
@@ -606,14 +607,17 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 * leave it in non-RCU-idle state.
 */
if (rdp->dynticks_nmi_nesting != 1) {
-   trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, 
rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
+   trace_rcu_dyntick(TPS("StillNonIdle"), TPS("IRQ"),
+ rdp->dynticks_nmi_nesting,
+ rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
   rdp->dynticks_nmi_nesting - 2);
return;
}
 
/* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */
-   trace_rcu_dyntick(TPS("Startirq"), rdp->dynticks_nmi_nesting, 0, 
rdp->dynticks);
+   trace_rcu_dyntick(TPS("Start"), TPS("IRQ"), rdp->dynticks_nmi_nesting,
+ 0, rdp->dynticks);
WRITE_ONCE(rdp->dynticks_nmi_nesting, 0); /* Avoid store tearing. */
 
if (irq)
@@ -700,7 +704,8 @@ static void rcu_eqs_exit(bool user)
rcu_dynticks_task_exit();
rcu_dynticks_eqs_exit();
rcu_cleanup_after_idle();
-   trace_rcu_dyntick(TPS("End"), rdp->dynticks_nesting, 1, rdp->dynticks);
+   trace_rcu_dyntick(TPS("End"), (user ? TPS("USER") : TPS("IDLE")),
+ rdp->dynticks_nesting, 1, rdp->dynticks);
WARN_ON_ONC

[PATCH 3/5] rcu/tree: Add support for debug_objects debugging for kfree_rcu()

2019-08-27 Thread Joel Fernandes (Google)
Make use of RCU's debug_objects debugging support
(CONFIG_DEBUG_OBJECTS_RCU_HEAD) similar to call_rcu() and other flavors.
We queue the object during the kfree_rcu() call and dequeue it during
reclaim.

Tested that enabling CONFIG_DEBUG_OBJECTS_RCU_HEAD successfully detects
double kfree_rcu() calls.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9b9ae4db1c2d..64568f12641d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2757,6 +2757,7 @@ static void kfree_rcu_work(struct work_struct *work)
for (; head; head = next) {
next = head->next;
/* Could be possible to optimize with kfree_bulk in future */
+   debug_rcu_head_unqueue(head);
__rcu_reclaim(rcu_state.name, head);
cond_resched_tasks_rcu_qs();
}
@@ -2868,6 +2869,13 @@ void kfree_call_rcu(struct rcu_head *head, 
rcu_callback_t func)
if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING)
return kfree_call_rcu_nobatch(head, func);
 
+   if (debug_rcu_head_queue(head)) {
+   /* Probable double kfree_rcu() */
+   WARN_ONCE(1, "kfree_call_rcu(): Double-freed call. rcu_head 
%p\n",
+ head);
+   return;
+   }
+
head->func = func;
 
local_irq_save(flags);  /* For safely calling this_cpu_ptr(). */
-- 
2.23.0.187.g17f5b7556c-goog



[RFC v1 1/2] rcu/tree: Clean up dynticks counter usage

2019-08-26 Thread Joel Fernandes (Google)
The dynticks counter are confusing due to crowbar writes of
DYNTICK_IRQ_NONIDLE whose purpose is to detect half-interrupts (i.e. we
see rcu_irq_enter() but not rcu_irq_exit() due to a usermode upcall) and
if so then do a reset of the dyntick_nmi_nesting counters. This patch
tries to get rid of DYNTICK_IRQ_NONIDLE while still keeping the code
working, fully functional, and less confusing. The confusion recently
has even led to patches forgetting that DYNTICK_IRQ_NONIDLE was written
to which wasted lots of time.

The patch has the following changes:

(1) Use dynticks_nesting instead of dynticks_nmi_nesting for determining
outer most "EQS exit". This is needed to detect in
rcu_nmi_enter_common() if we have already EQS-exited, such as because of
a syscall. Currently we rely on a forced write of DYNTICK_IRQ_NONIDLE
from rcu_eqs_exit() for this purpose. This is one purpose of the
DYNTICK_IRQ_NONIDLE write (other than detecting half-interrupts).
However, we do not need to do that. dyntick_nesting already tells us that
we have EQS-exited so just use that thus removing the dependence of
dynticks_nmi_nesting for this purpose.

(2) Keep dynticks_nmi_nesting around because:

  (a) rcu_is_cpu_rrupt_from_idle() needs to be able to detect first
  interrupt nesting level.

  (b) We need to detect half-interrupts till we are sure they're not an
  issue. However, change the comparison to DYNTICK_IRQ_NONIDLE with 0.

(3) Since we got rid of DYNTICK_IRQ_NONIDLE, we also do cheaper
comparisons with zero instead for the code that keeps the tick on in
rcu_nmi_enter_common().

In the next patch, both of the concerns of (2) will be addressed and
then we can get rid of dynticks_nmi_nesting, however one step at a time.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/rcu.h  |  4 
 kernel/rcu/tree.c | 60 ---
 2 files changed, 36 insertions(+), 28 deletions(-)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index aeec70fda82c..046833f3784b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -12,10 +12,6 @@
 
 #include 
 
-/* Offset to allow distinguishing irq vs. task-based idle entry/exit. */
-#define DYNTICK_IRQ_NONIDLE((LONG_MAX / 2) + 1)
-
-
 /*
  * Grace-period counter management.
  */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 68ebf0eb64c8..255cd6835526 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -81,7 +81,7 @@
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
-   .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
+   .dynticks_nmi_nesting = 0,
.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 };
 struct rcu_state rcu_state = {
@@ -558,17 +558,18 @@ EXPORT_SYMBOL_GPL(rcutorture_get_gp_data);
 /*
  * Enter an RCU extended quiescent state, which can be either the
  * idle loop or adaptive-tickless usermode execution.
- *
- * We crowbar the ->dynticks_nmi_nesting field to zero to allow for
- * the possibility of usermode upcalls having messed up our count
- * of interrupt nesting level during the prior busy period.
  */
 static void rcu_eqs_enter(bool user)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
 
-   WARN_ON_ONCE(rdp->dynticks_nmi_nesting != DYNTICK_IRQ_NONIDLE);
-   WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
+   /* Entering usermode/idle from interrupt is not handled. These would
+* mean usermode upcalls or idle entry happened from interrupts. But,
+* reset the counter if we warn.
+*/
+   if (WARN_ON_ONCE(rdp->dynticks_nmi_nesting != 0))
+   WRITE_ONCE(rdp->dynticks_nmi_nesting, 0);
+
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
 rdp->dynticks_nesting == 0);
if (rdp->dynticks_nesting != 1) {
@@ -642,23 +643,27 @@ static __always_inline void rcu_nmi_exit_common(bool irq)
 * (We are exiting an NMI handler, so RCU better be paying attention
 * to us!)
 */
+   WARN_ON_ONCE(rdp->dynticks_nesting <= 0);
WARN_ON_ONCE(rdp->dynticks_nmi_nesting <= 0);
WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs());
 
+   WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
+  rdp->dynticks_nmi_nesting - 1);
/*
 * If the nesting level is not 1, the CPU wasn't RCU-idle, so
 * leave it in non-RCU-idle state.
 */
-   if (rdp->dynticks_nmi_nesting != 1) {
-   trace_rcu_dyntick(TPS("--="), rdp->dynticks_nmi_nesting, 
rdp->dynticks_nmi_nesting - 2, rdp->dynticks);
-   WRITE_ONCE(rdp->dynticks_nmi_nesting, /* No store tearing. */
-  rdp->dynticks_nmi_nesting - 2);
+   if (rdp->dynticks_nesting != 1) {
+   trace_rcu_dyntick(TPS("--="), rdp->dynticks_nesting,
+ rdp->dyn

[RFC v2] rcu/tree: Try to invoke_rcu_core() if in_irq() during unlock

2019-08-18 Thread Joel Fernandes (Google)
When we're in hard interrupt context in rcu_read_unlock_special(), we
can still benefit from invoke_rcu_core() doing wakeups of rcuc
threads when the !use_softirq parameter is passed.  This is safe
to do because:

1. We avoid the scheduler deadlock issues thanks to the deferred_qs bit
introduced in commit 23634ebc1d94 ("rcu: Check for wakeup-safe
conditions in rcu_read_unlock_special()") by checking for the same in
this patch.

2. in_irq() implies in_interrupt() which implies raising softirq will
not do any wake ups.

The rcuc thread which is awakened will run when the interrupt returns.

We also honor 25102de ("rcu: Only do rcu_read_unlock_special() wakeups
if expedited"), thus doing the rcuc awakening only when at least one of the
following is true:
  1. The critical section is blocking an expedited GP.
  2. We are on a nohz_full CPU.
If neither of these cases is true (exp == false), then the "else" block
will run to do the irq_work stuff.

This commit is based on a partial revert of d143b3d1cd89 ("rcu: Simplify
rcu_read_unlock_special() deferred wakeups") with an additional in_irq()
check added.

Signed-off-by: Joel Fernandes (Google) 

---
v1->v2: Some minor character encoding issues in changelog corrected.

Note that I am still testing this patch, but I sent an early RFC for your
feedback. Thanks!

 kernel/rcu/tree_plugin.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 2defc7fe74c3..f4b3055026dc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -621,6 +621,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
+   } else if (exp && in_irq() && !use_softirq &&
+  !t->rcu_read_unlock_special.b.deferred_qs) {
+   // Safe to awaken rcuc kthread which will be
+   // scheduled in from the interrupt return path.
+   invoke_rcu_core();
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting or NO_HZ_FULL, slow is OK.
-- 
2.23.0.rc1.153.gdeed80330f-goog



[RFC] rcu/tree: Try to invoke_rcu_core() if in_irq() during unlock

2019-08-18 Thread Joel Fernandes (Google)
When we're in hard interrupt context in rcu_read_unlock_special(), we
can still benefit from invoke_rcu_core() doing wakeups of rcuc
threads when the !use_softirq parameter is passed.  This is safe
to do because:

1. We avoid the scheduler deadlock issues thanks to the deferred_qs bit
introduced in commit 23634ebc1d94 ("rcu: Check for wakeup-safe
conditions in rcu_read_unlock_special()") by checking for the same in
this patch.

2. in_irq() implies in_interrupt() which implies raising softirq will
not do any wake ups.

The rcuc thread which is awakened will run when the interrupt returns.

We also honor 25102de ("rcu: Only do rcu_read_unlock_special() wakeups
if expedited"), thus doing the rcuc awakening only when at least one of the
following is true:
  1. The critical section is blocking an expedited GP.
  2. We are on a nohz_full CPU.
If neither of these cases is true (exp == false), then the "else" block
will run to do the irq_work stuff.

This commit is based on a partial revert of d143b3d1cd89 ("rcu: Simplify
rcu_read_unlock_special() deferred wakeups") with an additional in_irq()
check added.

Signed-off-by: Joel Fernandes (Google) 

---
 kernel/rcu/tree_plugin.h | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 2defc7fe74c3..f4b3055026dc 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -621,6 +621,11 @@ static void rcu_read_unlock_special(struct task_struct *t)
// Using softirq, safe to awaken, and we get
// no help from enabling irqs, unlike bh/preempt.
raise_softirq_irqoff(RCU_SOFTIRQ);
+   } else if (exp && in_irq() && !use_softirq &&
+  !t->rcu_read_unlock_special.b.deferred_qs) {
+   // Safe to awaken rcuc thread which will be
+   // scheduled in from the interrupt return path.
+   invoke_rcu_core();
} else {
// Enabling BH or preempt does reschedule, so...
// Also if no expediting or NO_HZ_FULL, slow is OK.
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH -rcu/dev] Please squash: fixup! rcu/tree: Add basic support for kfree_rcu() batching

2019-08-16 Thread Joel Fernandes (Google)
xchg() on a bool is causing build issues on riscv and arm32. Please squash
this fix into the -rcu dev branch to resolve the issue.

Fixes: -rcu dev commit 3cbd3aa7d9c7bdf ("rcu/tree: Add basic support for 
kfree_rcu() batching")

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4f7c3096d786..33192a58b39a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2717,7 +2717,7 @@ struct kfree_rcu_cpu {
 * is busy, ->head just continues to grow and we retry flushing later.
 */
struct delayed_work monitor_work;
-   bool monitor_todo;  /* Is a delayed work pending execution? */
+   int monitor_todo;   /* Is a delayed work pending execution? */
 };
 
 static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
@@ -2790,7 +2790,7 @@ static inline void kfree_rcu_drain_unlock(struct 
kfree_rcu_cpu *krcp,
/* Previous batch that was queued to RCU did not get free yet, let us
 * try again soon.
 */
-   if (!xchg(>monitor_todo, true))
+   if (!xchg(>monitor_todo, 1))
schedule_delayed_work(>monitor_work, KFREE_DRAIN_JIFFIES);
spin_unlock_irqrestore(>lock, flags);
 }
@@ -2806,7 +2806,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 monitor_work.work);
 
spin_lock_irqsave(>lock, flags);
-   if (xchg(>monitor_todo, false))
+   if (xchg(>monitor_todo, 0))
kfree_rcu_drain_unlock(krcp, flags);
else
spin_unlock_irqrestore(>lock, flags);
@@ -2858,7 +2858,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t 
func)
krcp->head = head;
 
/* Schedule monitor for timely drain after KFREE_DRAIN_JIFFIES. */
-   if (!xchg(>monitor_todo, true))
+   if (!xchg(>monitor_todo, 1))
schedule_delayed_work(>monitor_work, KFREE_DRAIN_JIFFIES);
 
spin_unlock(>lock);
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH v2 -rcu dev 2/3] rcu/tree: Fix issue where sometimes rcu_urgent_qs is not set on IPI

2019-08-15 Thread Joel Fernandes (Google)
Sometimes I see that rcu_urgent_qs is not set. This could happen when the
last IPI was a long time ago but the grace period has just started. Set
rcu_urgent_qs so the tick can indeed not be stopped.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 322b1b57967c..856d3c9f1955 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1091,6 +1091,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
   time_after(jiffies,
  READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+   WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH -rcu dev 2/3] rcu/tree: Fix issue where sometimes rcu_urgent_qs is not set on IPI

2019-08-15 Thread Joel Fernandes (Google)
Sometimes I see rcu_urgent_qs is not set. This could be when the last
IPI was a long time ago, however, the grace period just started. Set
rcu_urgent_qs so the tick can indeed be stopped.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 322b1b57967c..856d3c9f1955 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1091,6 +1091,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (tick_nohz_full_cpu(rdp->cpu) &&
   time_after(jiffies,
  READ_ONCE(rdp->last_fqs_resched) + jtsq * 3)) {
+   WRITE_ONCE(*ruqp, true);
resched_cpu(rdp->cpu);
WRITE_ONCE(rdp->last_fqs_resched, jiffies);
}
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH -rcu dev 3/3] RFC: rcu/tree: Read dynticks_nmi_nesting in advance

2019-08-15 Thread Joel Fernandes (Google)
I really cannot explain this patch, but without it, the "else if" block
just doesn't execute, thus causing the tick's dep mask to not be set,
which causes the tick to be turned off.

I tried various _ONCE() macros but the only thing that works is this
patch.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 856d3c9f1955..ac6bcf7614d7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -802,6 +802,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 {
struct rcu_data *rdp = this_cpu_ptr(_data);
long incby = 2;
+   int dnn = rdp->dynticks_nmi_nesting;
 
/* Complain about underflow. */
WARN_ON_ONCE(rdp->dynticks_nmi_nesting < 0);
@@ -826,7 +827,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
 
incby = 1;
} else if (tick_nohz_full_cpu(rdp->cpu) &&
-  !rdp->dynticks_nmi_nesting &&
+  !dnn &&
   rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
rdp->rcu_forced_tick = true;
tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH -rcu dev 1/3] rcu/tree: tick_dep_set/clear_cpu should accept bits instead of masks

2019-08-15 Thread Joel Fernandes (Google)
tick_dep_set_cpu() and the related tick_dep_*() helpers take a tick
dependency bit (TICK_DEP_BIT_*), not a mask (TICK_DEP_MASK_*), so passing
the mask sets the wrong dependency bit. This commit fixes the issue by
passing the bit values instead.
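
For illustration (the names and the bit value below are stand-ins, not taken
from include/linux/tick.h), passing the mask instead of the bit ends up
setting an unrelated, much higher bit once the tick code shifts internally:

  #include <stdio.h>

  /* Illustrative stand-ins for the kernel's tick-dependency definitions. */
  enum { EXAMPLE_TICK_DEP_BIT_RCU = 4 };
  #define EXAMPLE_TICK_DEP_MASK_RCU (1 << EXAMPLE_TICK_DEP_BIT_RCU)

  /* Mimics the shift the tick code applies to the value it is handed. */
  static unsigned int dep_mask_from(unsigned int bit)
  {
          return 1u << bit;
  }

  int main(void)
  {
          printf("passing the bit:  %#x\n", dep_mask_from(EXAMPLE_TICK_DEP_BIT_RCU));  /* 0x10 */
          printf("passing the mask: %#x\n", dep_mask_from(EXAMPLE_TICK_DEP_MASK_RCU)); /* 0x10000 */
          return 0;
  }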

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0512de9ead20..322b1b57967c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -829,7 +829,7 @@ static __always_inline void rcu_nmi_enter_common(bool irq)
   !rdp->dynticks_nmi_nesting &&
   rdp->rcu_urgent_qs && !rdp->rcu_forced_tick) {
rdp->rcu_forced_tick = true;
-   tick_dep_set_cpu(rdp->cpu, TICK_DEP_MASK_RCU);
+   tick_dep_set_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
}
trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
  rdp->dynticks_nmi_nesting,
@@ -898,7 +898,7 @@ void rcu_irq_enter_irqson(void)
 void rcu_disable_tick_upon_qs(struct rcu_data *rdp)
 {
if (tick_nohz_full_cpu(rdp->cpu) && rdp->rcu_forced_tick) {
-   tick_dep_clear_cpu(rdp->cpu, TICK_DEP_MASK_RCU);
+   tick_dep_clear_cpu(rdp->cpu, TICK_DEP_BIT_RCU);
rdp->rcu_forced_tick = false;
}
 }
@@ -2123,8 +2123,9 @@ int rcutree_dead_cpu(unsigned int cpu)
do_nocb_deferred_wakeup(per_cpu_ptr(_data, cpu));
 
// Stop-machine done, so allow nohz_full to disable tick.
-   for_each_online_cpu(c)
-   tick_dep_clear_cpu(c, TICK_DEP_MASK_RCU);
+   for_each_online_cpu(c) {
+   tick_dep_clear_cpu(c, TICK_DEP_BIT_RCU);
+   }
return 0;
 }
 
@@ -2175,8 +2176,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
rcu_nocb_unlock_irqrestore(rdp, flags);
 
/* Invoke callbacks. */
-   if (IS_ENABLED(CONFIG_NO_HZ_FULL))
-   tick_dep_set_task(current, TICK_DEP_MASK_RCU);
+   if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+   tick_dep_set_task(current, TICK_DEP_BIT_RCU);
+   }
rhp = rcu_cblist_dequeue();
for (; rhp; rhp = rcu_cblist_dequeue()) {
debug_rcu_head_unqueue(rhp);
@@ -2243,8 +2245,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Re-invoke RCU core processing if there are callbacks remaining. */
if (!offloaded && rcu_segcblist_ready_cbs(>cblist))
invoke_rcu_core();
-   if (IS_ENABLED(CONFIG_NO_HZ_FULL))
-   tick_dep_clear_task(current, TICK_DEP_MASK_RCU);
+   if (IS_ENABLED(CONFIG_NO_HZ_FULL)) {
+   tick_dep_clear_task(current, TICK_DEP_BIT_RCU);
+   }
 }
 
 /*
@@ -3118,8 +3121,9 @@ int rcutree_online_cpu(unsigned int cpu)
rcutree_affinity_setting(cpu, -1);
 
// Stop-machine done, so allow nohz_full to disable tick.
-   for_each_online_cpu(c)
-   tick_dep_clear_cpu(c, TICK_DEP_MASK_RCU);
+   for_each_online_cpu(c) {
+   tick_dep_clear_cpu(c, TICK_DEP_BIT_RCU);
+   }
return 0;
 }
 
@@ -3143,8 +3147,9 @@ int rcutree_offline_cpu(unsigned int cpu)
rcutree_affinity_setting(cpu, cpu);
 
// nohz_full CPUs need the tick for stop-machine to work quickly
-   for_each_online_cpu(c)
-   tick_dep_set_cpu(c, TICK_DEP_MASK_RCU);
+   for_each_online_cpu(c) {
+   tick_dep_set_cpu(c, TICK_DEP_BIT_RCU);
+   }
return 0;
 }
 
-- 
2.23.0.rc1.153.gdeed80330f-goog



[PATCH v4 1/2] rcu/tree: Add basic support for kfree_rcu() batching

2019-08-14 Thread Joel Fernandes (Google)
Recently a discussion about the stability and performance of a system
involving a high rate of kfree_rcu() calls surfaced on the list [1],
which led to another discussion about how to prepare for this situation.

This patch adds basic batching support for kfree_rcu(). It is "basic"
because we do none of the slab management, dynamic allocation, code
moving or any of the other things, some of which previous attempts did
[2]. These fancier improvements can be follow-up patches, and different
ideas are being discussed in that regard. This is an effort to start
simple and build up from there. In the future, an extension to use
kfree_bulk() and possibly per-slab batching could further improve
performance thanks to cache locality and slab-specific bulk-free
optimizations. By using an array of pointers, the worker thread
processing the work would also need to read less data, since it would no
longer need to deal with large lists of rcu_head structures.

Torture tests follow in the next patch and show roughly a 5x reduction
in the number of grace periods on a 16-CPU system. More details and test
data are in that patch.
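
For readers skimming the diff, here is a stripped-down sketch of the
batching scheme. The field names follow this patch, but locking and
other details are elided, so this is illustration only:

	/* Simplified per-CPU batching state -- not the actual code. */
	struct kfree_rcu_cpu {
		struct rcu_head *head;		/* queued by kfree_rcu(), no GP requested yet */
		struct rcu_head *head_free;	/* batch currently waiting for a grace period */
		struct rcu_work rcu_work;	/* kfree()s head_free once the GP has elapsed */
		struct delayed_work monitor_work; /* drains ->head after KFREE_DRAIN_JIFFIES */
		spinlock_t lock;
	};

	/* kfree_rcu() now only links the object onto ->head and arms the
	 * monitor; one grace period then covers the whole batch, which is
	 * where the reduction in grace periods comes from. */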

This patch has an implication for rcu_barrier(). Since kfree_rcu() calls
can be batched, they may not yet have been handed to the RCU machinery;
in fact, the monitor may not even have run yet to do the
queue_rcu_work(). There seems to be no easy way for rcu_barrier() to
wait for kfree_rcu()s that have already been issued. This means that a
kfree_rcu() followed by an rcu_barrier() does not imply that the memory
will be freed once rcu_barrier() returns, as illustrated in the sketch below.
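
Concretely, from the caller's point of view (illustrative only, not code
from this patch):

	kfree_rcu(p, rh);	/* p is only queued into the per-CPU batch     */
	rcu_barrier();		/* waits for callbacks already handed to RCU;  */
				/* the batched object may not be among them,   */
				/* so p may still be unfreed at this point     */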

Another implication is higher active memory usage (though not runaway
growth) while the kfree_rcu() flooding lasts, compared to the
non-batched case. More details about this are in the second patch, which
adds an rcuperf test.

Finally, in the near future we will get rid of the kfree_rcu() special
casing within RCU, such as in rcu_do_batch(), and switch everything to
just batching. Currently we don't do that because the timer subsystem is
not yet up and we cannot schedule the kfree_rcu() monitor while the
timer subsystem's locks are not initialized. That would also mean
getting rid of kfree_call_rcu_nobatch() entirely.

[1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-...@kernel.org
[2] https://lkml.org/lkml/2017/12/19/824

Cc: kernel-t...@android.com
Cc: kernel-t...@lge.com
Co-developed-by: Byungchul Park 
Signed-off-by: Byungchul Park 
Signed-off-by: Joel Fernandes (Google) 

---
v3->v4: Some corrections by Paul.
Used xchg in places to simplify code.

v2->v3: Just some code comment changes thanks to Byungchul.

RFCv1->PATCH v2: Removed limits on the ->head list, just let it grow.
   Dropped KFREE_MAX_JIFFIES to HZ/50 from HZ/20 to reduce OOM occurrence.
   Removed sleeps in rcuperf test, just using cond_resched() in loop.
   Better code comments ;)

 include/linux/rcutiny.h |   5 ++
 include/linux/rcutree.h |   1 +
 kernel/rcu/tree.c   | 194 ++--
 3 files changed, 194 insertions(+), 6 deletions(-)

diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 8e727f57d814..383f2481750f 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -39,6 +39,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
call_rcu(head, func);
 }
 
+static inline void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func)
+{
+   call_rcu(head, func);
+}
+
 void rcu_qs(void);
 
 static inline void rcu_softirq_qs(void)
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 735601ac27d3..7e38b39ec634 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -34,6 +34,7 @@ static inline void rcu_virt_note_context_switch(int cpu)
 
 void synchronize_rcu_expedited(void);
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func);
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
 
 void rcu_barrier(void);
 bool rcu_eqs_special_set(int cpu);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..1d1847cadea2 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2593,17 +2593,185 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+
+/* Maximum number of jiffies to wait before draining a batch. */
+#define KFREE_DRAIN_JIFFIES (HZ / 50)
+
 /*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
+ * Maximum number of kfree(s) to batch, if this limit is hit then the batch of
+ * kfree(s) is queued for freeing after a grace period, right away.
  */
-void kfree_call_rcu(struct rcu_head *he

[PATCH v4 2/2] rcuperf: Add kfree_rcu() performance Tests

2019-08-14 Thread Joel Fernandes (Google)
This test runs kfree_rcu in a loop to measure performance of the new
kfree_rcu batching functionality.

The following table shows results when booting with arguments:
rcuperf.kfree_loops=20 rcuperf.kfree_alloc_num=1000 rcuperf.kfree_rcu_test=1

In addition, rcuperf.kfree_no_batch is used to toggle the batching of
kfree_rcu()s for a test run.

rcuperf.kfree_no_batch   GPs    time (seconds)
 0 (default)             1732   15.9
 1                       9133   14.5

Note that the results are the same for the case:
1. Patch is not applied and rcuperf.kfree_no_batch=0
2. Patch is applied and rcuperf.kfree_no_batch=1

On a 16 CPU system with the above boot parameters, we see that the total
number of grace periods that elapse during the test drops from 9133 when
not batching to 1732 when batching (a 5X improvement). The kfree_rcu()
flood itself slows down a bit when batching, though, as shown. This is
likely due to rcuperf threads contending with the additional worker
threads that are now running both before (the monitor) and after (the
work done to kfree()) the grace period.

Note that the active memory consumption during the kfree_rcu() flood
does increase to around 300-400MB due to the batching (from around 50MB
without batching). However, this memory consumption is relatively
constant and is just an effect of the buffering. In other words, the
system is able to keep up with the kfree_rcu() load. The memory
consumption comes down to 200-300MB if KFREE_DRAIN_JIFFIES is
increased from HZ/50 to HZ/80.

Also, when running the test, please disable CONFIG_DEBUG_PREEMPT and
CONFIG_PROVE_RCU for realistic comparisons with/without batching.
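
For example, the two rows in the table above correspond to boot command
lines along the following, reusing the parameter values listed earlier
(illustrative, adjust as needed):

	rcuperf.kfree_rcu_test=1 rcuperf.kfree_loops=20 rcuperf.kfree_alloc_num=1000 rcuperf.kfree_no_batch=0
	rcuperf.kfree_rcu_test=1 rcuperf.kfree_loops=20 rcuperf.kfree_alloc_num=1000 rcuperf.kfree_no_batch=1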

Signed-off-by: Joel Fernandes (Google) 
---
 .../admin-guide/kernel-parameters.txt |  17 ++
 kernel/rcu/rcuperf.c  | 189 +-
 2 files changed, 198 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 7ccd158b3894..a9156ca5de24 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3895,6 +3895,23 @@
test until boot completes in order to avoid
interference.
 
+   rcuperf.kfree_rcu_test= [KNL]
+   Set to measure performance of kfree_rcu() flooding.
+
+   rcuperf.kfree_nthreads= [KNL]
+   The number of threads running loops of kfree_rcu().
+
+   rcuperf.kfree_alloc_num= [KNL]
+   Number of allocations and frees done in an iteration.
+
+   rcuperf.kfree_loops= [KNL]
+   Number of loops doing rcuperf.kfree_alloc_num number
+   of allocations and frees.
+
+   rcuperf.kfree_no_batch= [KNL]
+   Use the non-batching (slower) version of kfree_rcu.
+   This is useful for comparing with the batched version.
+
rcuperf.nreaders= [KNL]
Set number of RCU readers.  The value -1 selects
N, where N is the number of CPUs.  A value
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..70d6ac19cbff 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -86,6 +86,7 @@ torture_param(bool, shutdown, RCUPERF_SHUTDOWN,
  "Shutdown at end of performance tests.");
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to 
disable");
+torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu perf test?");
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
@@ -105,8 +106,8 @@ static atomic_t n_rcu_perf_writer_finished;
 static wait_queue_head_t shutdown_wq;
 static u64 t_rcu_perf_writer_started;
 static u64 t_rcu_perf_writer_finished;
-static unsigned long b_rcu_perf_writer_started;
-static unsigned long b_rcu_perf_writer_finished;
+static unsigned long b_rcu_gp_test_started;
+static unsigned long b_rcu_gp_test_finished;
 static DEFINE_PER_CPU(atomic_t, n_async_inflight);
 
 static int rcu_perf_writer_state;
@@ -379,10 +380,10 @@ rcu_perf_writer(void *arg)
if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
if (gp_exp) {
-   b_rcu_perf_writer_started =
+   b_rcu_gp_test_started =
cur_ops->exp_completed() / 2;
} else {
-   b_rcu_perf_writer_started = cur_ops->get_gp_seq();
+   b_rcu_gp_test_started = cur_ops->get_gp_seq();
}
}
 
@@ -435,10 +436,10 @@ rcu_perf_writer(void *arg)
PERFOUT_STRING("Test complete");
t_

[PATCH RFC v1 2/2] rcuperf: Add kfree_rcu performance Tests

2019-08-06 Thread Joel Fernandes (Google)
This test runs kfree_rcu() in a loop to measure the performance of the
new kfree_rcu(), with and without the patch.

To see improvement, run with boot parameters:
rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.perf_type=kfree

Without patch, test runs in 6.9 seconds.
With patch, test runs in 6.1 seconds (+13% improvement)

If it is desired to run the test but with the traditional (non-batched)
kfree_rcu, for example to compare results, then you could pass along the
rcuperf.kfree_no_batch=1 boot parameter.
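
In other words, a non-batched comparison run would boot with something
like the following (illustrative, reusing the parameters above):

	rcuperf.perf_type=kfree rcuperf.kfree_loops=2000 rcuperf.kfree_alloc_num=100 rcuperf.kfree_no_batch=1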

Cc: max.byungchul.p...@gmail.com
Cc: byungchul.p...@lge.com
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/rcuperf.c | 169 ++-
 1 file changed, 168 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 7a6890b23c5f..34658760da5e 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -89,7 +89,7 @@ torture_param(int, writer_holdoff, 0, "Holdoff (us) between 
GPs, zero to disable
 
 static char *perf_type = "rcu";
 module_param(perf_type, charp, 0444);
-MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, 
...)");
+MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, rcu_bh, 
kfree,...)");
 
 static int nrealreaders;
 static int nrealwriters;
@@ -592,6 +592,170 @@ rcu_perf_shutdown(void *arg)
return -EINVAL;
 }
 
+/*
+ * kfree_rcu performance tests: Start a kfree_rcu loop on all CPUs for number
+ * of iterations and measure total time for all iterations to complete.
+ */
+
+torture_param(int, kfree_nthreads, -1, "Number of RCU reader threads");
+torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done by a thread");
+torture_param(int, kfree_alloc_size, 16,  "Size of each allocation");
+torture_param(int, kfree_loops, 10, "Size of each allocation");
+torture_param(int, kfree_no_batch, 0, "Use the non-batching (slower) version of kfree_rcu");
+
+static struct task_struct **kfree_reader_tasks;
+static int kfree_nrealthreads;
+static atomic_t n_kfree_perf_thread_started;
+static atomic_t n_kfree_perf_thread_ended;
+
+#define KFREE_OBJ_BYTES 8
+
+struct kfree_obj {
+   char kfree_obj[KFREE_OBJ_BYTES];
+   struct rcu_head rh;
+};
+
+void kfree_call_rcu_nobatch(struct rcu_head *head, rcu_callback_t func);
+
+static int
+kfree_perf_thread(void *arg)
+{
+   int i, l = 0;
+   long me = (long)arg;
+   struct kfree_obj **alloc_ptrs;
+   u64 start_time, end_time;
+
+   VERBOSE_PERFOUT_STRING("kfree_perf_thread task started");
+   set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
+   set_user_nice(current, MAX_NICE);
+   atomic_inc(&n_kfree_perf_thread_started);
+
+   alloc_ptrs = (struct kfree_obj **)kmalloc(sizeof(struct kfree_obj *) * kfree_alloc_num,
+ GFP_KERNEL);
+   if (!alloc_ptrs)
+   return -ENOMEM;
+
+   start_time = ktime_get_mono_fast_ns();
+   do {
+   for (i = 0; i < kfree_alloc_num; i++) {
+   alloc_ptrs[i] = kmalloc(sizeof(struct kfree_obj), GFP_KERNEL);
+   if (!alloc_ptrs[i])
+   return -ENOMEM;
+   }
+
+   for (i = 0; i < kfree_alloc_num; i++) {
+   if (!kfree_no_batch) {
+   kfree_rcu(alloc_ptrs[i], rh);
+   } else {
+   rcu_callback_t cb;
+
+   cb = (rcu_callback_t)(unsigned long)offsetof(struct kfree_obj, rh);
+   kfree_call_rcu_nobatch(&(alloc_ptrs[i]->rh), cb);
+   }
+   }
+
+   schedule_timeout_uninterruptible(2);
+   } while (!torture_must_stop() && ++l < kfree_loops);
+
+   kfree(alloc_ptrs);
+
+   if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) {
+   end_time = ktime_get_mono_fast_ns();
+   pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d\n",
+  (unsigned long long)(end_time - start_time), kfree_loops);
+   if (shutdown) {
+   smp_mb(); /* Assign before wake. */
+   wake_up(&shutdown_wq);
+   }
+   }
+
+   torture_kthread_stopping("kfree_perf_thread");
+   return 0;
+}
+
+static void
+kfree_perf_cleanup(void)
+{
+   int i;
+
+   if (torture_cleanup_begin())
+   return;
+
+   if (kfree_reader_tasks) {
+   for (i = 0; i < kfree_nrealthreads; i++)
+   torture_stop_kthread(kfree_perf_thread,
+kfree_reader_tasks[i]);
+   kfree(kfree_reader_tasks);
+   }
+
+   torture_cleanup_end();

[PATCH RFC v1 1/2] rcu/tree: Add basic support for kfree_rcu batching

2019-08-06 Thread Joel Fernandes (Google)
Recently a discussion about the performance of a system involving a high
rate of kfree_rcu() calls surfaced on the list [1], which led to another
discussion about how to prepare for this situation.

This patch adds basic batching support for kfree_rcu. It is "basic"
because we do none of the slab management, dynamic allocation, code
moving or any of the other things, some of which previous attempts did
[2]. These fancier improvements can be follow-up patches, and several
ideas are being experimented with in that regard.

Torture tests follow in the next patch and show improvements of around
~13% with continuous flooding of kfree_rcu() calls on a 16 CPU system.

[1] http://lore.kernel.org/lkml/20190723035725-mutt-send-email-...@kernel.org
[2] https://lkml.org/lkml/2017/12/19/824

This is an effort just to start simple, and build up from there.

Cc: Rao Shoaib 
Cc: max.byungchul.p...@gmail.com
Cc: byungchul.p...@lge.com
Cc: kernel-t...@android.com
Cc: kernel-t...@lge.com
Co-developed-by: Byungchul Park 
Signed-off-by: Byungchul Park 
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree.c | 198 --
 1 file changed, 193 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a14e5fbbea46..bdbd483606ce 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2593,19 +2593,194 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func)
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+
+/* Maximum number of jiffies to wait before draining batch */
+#define KFREE_DRAIN_JIFFIES 50
+
+/*
+ * Maximum number of kfree(s) to batch, if limit is hit
+ * then RCU work is queued right away
+ */
+#define KFREE_MAX_BATCH		20ULL
+
+struct kfree_rcu_cpu {
+   /* The work done to free objects after GP */
+   struct rcu_work rcu_work;
+
+   /* The list of objects being queued */
+   struct rcu_head *head;
+   int kfree_batch_len;
+
+   /* The list of objects pending a free */
+   struct rcu_head *head_free;
+
+   /* Protect concurrent access to this structure */
+   spinlock_t lock;
+
+   /* The work done to monitor whether objects need free */
+   struct delayed_work monitor_work;
+   bool monitor_todo;
+};
+
+static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc);
+
+/* Free all heads after a grace period (worker function) */
+static void kfree_rcu_work(struct work_struct *work)
+{
+   unsigned long flags;
+   struct rcu_head *head, *next;
+   struct kfree_rcu_cpu *krc = container_of(to_rcu_work(work),
+   struct kfree_rcu_cpu, rcu_work);
+
+   spin_lock_irqsave(&krc->lock, flags);
+   head = krc->head_free;
+   krc->head_free = NULL;
+   spin_unlock_irqrestore(&krc->lock, flags);
+
+   /* The head must be detached and not referenced from anywhere */
+   for (; head; head = next) {
+   next = head->next;
+   head->next = NULL;
+   /* Could be possible to optimize with kfree_bulk in future */
+   __rcu_reclaim(rcu_state.name, head);
+   }
+}
+
+/*
+ * Schedule the kfree batch RCU work to run after GP.
+ *
+ * Either the batch reached its maximum size, or the monitor's
+ * time reached, either way schedule the batch work.
+ */
+static bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krc)
+{
+   lockdep_assert_held(&krc->lock);
+
+   /*
+* Someone already drained, probably before the monitor's worker
+* thread ran. Just return to avoid useless work.
+*/
+   if (!krc->head)
+   return true;
+
+   /*
+* If RCU batch work already in progress, we cannot
+* queue another one, just refuse the optimization.
+*/
+   if (krc->head_free)
+   return false;
+
+   krc->head_free = krc->head;
+   krc->head = NULL;
+   krc->kfree_batch_len = 0;
+   INIT_RCU_WORK(&krc->rcu_work, kfree_rcu_work);
+   queue_rcu_work(system_wq, &krc->rcu_work);
+
+   return true;
+}
+
+static void kfree_rcu_drain_unlock(struct kfree_rcu_cpu *krc,
+  unsigned long flags)
+{
+   struct rcu_head *head, *next;
+
+   /* It is time to do bulk reclaim after grace period */
+   krc->monitor_todo = false;
+   if (queue_kfree_rcu_work(krc)) {
+   spin_unlock_irqrestore(&krc->lock, flags);
+   return;
+   }
+
+   /*
+* Use non-batch regular call_rcu for kfree_rcu in case things are too
+* busy and batching of kfree_rcu could not be used.
+*/
+   head = krc->head;
+   krc->head = NULL;
+   krc->kfree_batch_len = 0;
+   spin_unlock_irqrestore(&krc->lock, flags);
+
+   for (; head; head = next) {
+   next = head->next;
+   head->next = NULL;
+   __call_rcu(head, head->func, -1, 1);
+   }
+}
+
+/*
+ * If enough

['PATCH v2' 4/7] docs: rcu: Correct links referring to titles

2019-08-01 Thread Joel Fernandes (Google)
Mauro's auto conversion broke these links; fix them.

Signed-off-by: Joel Fernandes (Google) 
---
 .../Tree-RCU-Memory-Ordering.rst  | 17 ++--
 .../RCU/Design/Requirements/Requirements.rst  | 90 ---
 2 files changed, 47 insertions(+), 60 deletions(-)

diff --git 
a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst 
b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
index 1011b5db1b3d..248b1222f918 100644
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
@@ -230,15 +230,14 @@ Tree RCU Grace Period Memory Ordering Components
 Tree RCU's grace-period memory-ordering guarantee is provided by a
 number of RCU components:
 
-#. `Callback Registry <#Callback%20Registry>`__
-#. `Grace-Period Initialization <#Grace-Period%20Initialization>`__
-#. `Self-Reported Quiescent
-   States <#Self-Reported%20Quiescent%20States>`__
-#. `Dynamic Tick Interface <#Dynamic%20Tick%20Interface>`__
-#. `CPU-Hotplug Interface <#CPU-Hotplug%20Interface>`__
-#. `Forcing Quiescent States <#Forcing%20Quiescent%20States>`__
-#. `Grace-Period Cleanup <#Grace-Period%20Cleanup>`__
-#. `Callback Invocation <#Callback%20Invocation>`__
+#. `Callback Registry`_
+#. `Grace-Period Initialization`_
+#. `Self-Reported Quiescent States`_
+#. `Dynamic Tick Interface`_
+#. `CPU-Hotplug Interface`_
+#. `Forcing Quiescent States`_
+#. `Grace-Period Cleanup`_
+#. `Callback Invocation`_
 
 Each of the following section looks at the corresponding component in
 detail.
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index 876e0038bb58..a33b5fb331b4 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -36,16 +36,14 @@ technologies in interesting new ways.
 All that aside, here are the categories of currently known RCU
 requirements:
 
-#. `Fundamental Requirements <#Fundamental%20Requirements>`__
-#. `Fundamental Non-Requirements <#Fundamental%20Non-Requirements>`__
-#. `Parallelism Facts of Life <#Parallelism%20Facts%20of%20Life>`__
-#. `Quality-of-Implementation
-   Requirements <#Quality-of-Implementation%20Requirements>`__
-#. `Linux Kernel Complications <#Linux%20Kernel%20Complications>`__
-#. `Software-Engineering
-   Requirements <#Software-Engineering%20Requirements>`__
-#. `Other RCU Flavors <#Other%20RCU%20Flavors>`__
-#. `Possible Future Changes <#Possible%20Future%20Changes>`__
+#. `Fundamental Requirements`_
+#. `Fundamental Non-Requirements`_
+#. `Parallelism Facts of Life`_
+#. `Quality-of-Implementation Requirements`_
+#. `Linux Kernel Complications`_
+#. `Software-Engineering Requirements`_
+#. `Other RCU Flavors`_
+#. `Possible Future Changes`_
 
 This is followed by a `summary <#Summary>`__, however, the answers to
 each quick quiz immediately follows the quiz. Select the big white space
@@ -57,13 +55,11 @@ Fundamental Requirements
 RCU's fundamental requirements are the closest thing RCU has to hard
 mathematical requirements. These are:
 
-#. `Grace-Period Guarantee <#Grace-Period%20Guarantee>`__
-#. `Publish-Subscribe Guarantee <#Publish-Subscribe%20Guarantee>`__
-#. `Memory-Barrier Guarantees <#Memory-Barrier%20Guarantees>`__
-#. `RCU Primitives Guaranteed to Execute
-   Unconditionally 
<#RCU%20Primitives%20Guaranteed%20to%20Execute%20Unconditionally>`__
-#. `Guaranteed Read-to-Write
-   Upgrade <#Guaranteed%20Read-to-Write%20Upgrade>`__
+#. `Grace-Period Guarantee`_
+#. `Publish/Subscribe Guarantee`_
+#. `Memory-Barrier Guarantees`_
+#. `RCU Primitives Guaranteed to Execute Unconditionally`_
+#. `Guaranteed Read-to-Write Upgrade`_
 
 Grace-Period Guarantee
 ~~
@@ -689,16 +685,11 @@ infinitely long, however, the following sections list a 
few
 non-guarantees that have caused confusion. Except where otherwise noted,
 these non-guarantees were premeditated.
 
-#. `Readers Impose Minimal
-   Ordering <#Readers%20Impose%20Minimal%20Ordering>`__
-#. `Readers Do Not Exclude
-   Updaters <#Readers%20Do%20Not%20Exclude%20Updaters>`__
-#. `Updaters Only Wait For Old
-   Readers <#Updaters%20Only%20Wait%20For%20Old%20Readers>`__
-#. `Grace Periods Don't Partition Read-Side Critical
-   Sections 
<#Grace%20Periods%20Don't%20Partition%20Read-Side%20Critical%20Sections>`__
-#. `Read-Side Critical Sections Don't Partition Grace
-   Periods 
<#Read-Side%20Critical%20Sections%20Don't%20Partition%20Grace%20Periods>`__
+#. `Readers Impose Minimal Ordering`_
+#. `Readers Do Not Exclude Updaters`_
+#. `Updaters Only Wait For Old Readers`_
+#. `Grace Periods Don't Partition Read-Side Critical Sections`_
+#. `Read-Side Critical Sections Don't Partition Grace Periods`_
 
 Readers Impose Minimal Ordering
 ~~~
@@ -

['PATCH v2' 7/7] Restore docs "rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock()"

2019-08-01 Thread Joel Fernandes (Google)
This restores the docs, now in ReST format.
---
 .../RCU/Design/Requirements/Requirements.rst  | 54 +++
 1 file changed, 54 insertions(+)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index 0b222469d7ce..fd5e2cbc4935 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1691,6 +1691,7 @@ follows:
 #. `Hotplug CPU`_
 #. `Scheduler and RCU`_
 #. `Tracing and RCU`_
+#. `Accesses to User Memory and RCU`_
 #. `Energy Efficiency`_
 #. `Scheduling-Clock Interrupts and RCU`_
 #. `Memory Efficiency`_
@@ -2004,6 +2005,59 @@ where RCU readers execute in environments in which 
tracing cannot be
 used. The tracing folks both located the requirement and provided the
 needed fix, so this surprise requirement was relatively painless.
 
+Accesses to User Memory and RCU
+~~~
+
+The kernel needs to access user-space memory, for example, to access data
+referenced by system-call parameters.  The ``get_user()`` macro does this job.
+
+However, user-space memory might well be paged out, which means that
+``get_user()`` might well page-fault and thus block while waiting for the
+resulting I/O to complete.  It would be a very bad thing for the compiler to
+reorder a ``get_user()`` invocation into an RCU read-side critical section.
+
+For example, suppose that the source code looked like this:
+
+  ::
+
+   1 rcu_read_lock();
+   2 p = rcu_dereference(gp);
+   3 v = p->value;
+   4 rcu_read_unlock();
+   5 get_user(user_v, user_p);
+   6 do_something_with(v, user_v);
+
+The compiler must not be permitted to transform this source code into
+the following:
+
+  ::
+
+   1 rcu_read_lock();
+   2 p = rcu_dereference(gp);
+   3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
+   4 v = p->value;
+   5 rcu_read_unlock();
+   6 do_something_with(v, user_v);
+
+If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel
+build, and if ``get_user()`` did page fault, the result would be a quiescent
+state in the middle of an RCU read-side critical section.  This misplaced
+quiescent state could result in line 4 being a use-after-free access,
+which could be bad for your kernel's actuarial statistics.  Similar examples
+can be constructed with the call to ``get_user()`` preceding the
+``rcu_read_lock()``.
+
+Unfortunately, ``get_user()`` doesn't have any particular ordering properties,
+and in some architectures the underlying ``asm`` isn't even marked
+``volatile``.  And even if it was marked ``volatile``, the above access to
+``p->value`` is not volatile, so the compiler would not have any reason to keep
+those two accesses in order.
+
+Therefore, the Linux-kernel definitions of ``rcu_read_lock()`` and
+``rcu_read_unlock()`` must act as compiler barriers, at least for outermost
+instances of ``rcu_read_lock()`` and ``rcu_read_unlock()`` within a nested set
+of RCU read-side critical sections.
+
 Energy Efficiency
 ~
 
-- 
2.22.0.770.g0f2c4a37fd-goog



['PATCH v2' 2/7] Revert docs from "treewide: Rename rcu_dereference_raw_notrace() to _check()"

2019-08-01 Thread Joel Fernandes (Google)
This reverts docs from commit 355e9972da81e803bbb825b76106ae9b358caf8e.
---
 Documentation/RCU/Design/Requirements/Requirements.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html
index bdbc84f1b949..5a9238a2883c 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2512,7 +2512,7 @@ disabled across the entire RCU read-side critical section.
 
 It is possible to use tracing on RCU code, but tracing itself
 uses RCU.
-For this reason, rcu_dereference_raw_check()
+For this reason, rcu_dereference_raw_notrace()
 is provided for use by tracing, which avoids the destructive
 recursion that could otherwise ensue.
 This API is also used by virtualization in some architectures,
-- 
2.22.0.770.g0f2c4a37fd-goog



[PATCH v2 0/7] Doc updates to /dev branch

2019-08-01 Thread Joel Fernandes (Google)
This series fixes the rcu/dev branch with the new ReST conversion patches.

Only changes are to documentation.

thanks,

 - Joel

Joel Fernandes (Google) (6):
Revert docs from "rcu: Restore barrier() to rcu_read_lock() and
rcu_read_unlock()"
Revert docs from "treewide: Rename rcu_dereference_raw_notrace() to
_check()"
docs: rcu: Correct links referring to titles
docs: rcu: Increase toctree to 3
Restore docs "treewide: Rename rcu_dereference_raw_notrace() to
_check()"
Restore docs "rcu: Restore barrier() to rcu_read_lock() and
rcu_read_unlock()"

Mauro Carvalho Chehab (1):
docs: rcu: convert some articles from html to ReST

.../Data-Structures/Data-Structures.html  | 1391 ---
.../Data-Structures/Data-Structures.rst   | 1163 ++
.../Expedited-Grace-Periods.html  |  668 
.../Expedited-Grace-Periods.rst   |  521 +++
.../Memory-Ordering/Tree-RCU-Diagram.html |9 -
.../Tree-RCU-Memory-Ordering.html |  704 
.../Tree-RCU-Memory-Ordering.rst  |  624 +++
.../RCU/Design/Requirements/Requirements.html | 3401 -
.../RCU/Design/Requirements/Requirements.rst  | 2704 +
Documentation/RCU/index.rst   |7 +-
Documentation/RCU/whatisRCU.txt   |4 +-
11 files changed, 5020 insertions(+), 6176 deletions(-)
delete mode 100644 Documentation/RCU/Design/Data-Structures/Data-Structures.html
create mode 100644 Documentation/RCU/Design/Data-Structures/Data-Structures.rst
delete mode 100644 
Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
create mode 100644 
Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
delete mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html
delete mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
create mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
delete mode 100644 Documentation/RCU/Design/Requirements/Requirements.html
create mode 100644 Documentation/RCU/Design/Requirements/Requirements.rst

--
2.22.0.770.g0f2c4a37fd-goog



['PATCH v2' 5/7] docs: rcu: Increase toctree to 3

2019-08-01 Thread Joel Fernandes (Google)
These documents are long and have various sections. Provide a good
toc nesting level.

Signed-off-by: Joel Fernandes (Google) 
---
 Documentation/RCU/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 94427dc1f23d..5c99185710fa 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -5,7 +5,7 @@ RCU concepts
 
 
 .. toctree::
-   :maxdepth: 1
+   :maxdepth: 3
 
rcu
listRCU
-- 
2.22.0.770.g0f2c4a37fd-goog



['PATCH v2' 6/7] Restore docs "treewide: Rename rcu_dereference_raw_notrace() to _check()"

2019-08-01 Thread Joel Fernandes (Google)
This restores the docs, now in ReST format.
---
 Documentation/RCU/Design/Requirements/Requirements.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index a33b5fb331b4..0b222469d7ce 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1997,7 +1997,7 @@ Tracing and RCU
 ~~~
 
 It is possible to use tracing on RCU code, but tracing itself uses RCU.
-For this reason, ``rcu_dereference_raw_notrace()`` is provided for use
+For this reason, ``rcu_dereference_raw_check()`` is provided for use
 by tracing, which avoids the destructive recursion that could otherwise
 ensue. This API is also used by virtualization in some architectures,
 where RCU readers execute in environments in which tracing cannot be
-- 
2.22.0.770.g0f2c4a37fd-goog



['PATCH v2' 1/7] Revert docs from "rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock()"

2019-08-01 Thread Joel Fernandes (Google)
This reverts docs from commit d6b9cd7dc8e041ee83cb1362fce59a3cdb1f2709.
---
 .../RCU/Design/Requirements/Requirements.html | 71 ---
 1 file changed, 71 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html
index 467251f7fef6..bdbc84f1b949 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2129,8 +2129,6 @@ Some of the relevant points of interest are as follows:
Hotplug CPU.
Scheduler and RCU.
Tracing and RCU.
-   
-Accesses to User Memory and RCU.
Energy Efficiency.

Scheduling-Clock Interrupts and RCU.
@@ -2523,75 +2521,6 @@ cannot be used.
 The tracing folks both located the requirement and provided the
 needed fix, so this surprise requirement was relatively painless.
 
-
-Accesses to User Memory and RCU
-
-
-The kernel needs to access user-space memory, for example, to access
-data referenced by system-call parameters.
-The get_user() macro does this job.
-
-
-However, user-space memory might well be paged out, which means
-that get_user() might well page-fault and thus block while
-waiting for the resulting I/O to complete.
-It would be a very bad thing for the compiler to reorder
-a get_user() invocation into an RCU read-side critical
-section.
-For example, suppose that the source code looked like this:
-
-
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
- 3 v = p-value;
- 4 rcu_read_unlock();
- 5 get_user(user_v, user_p);
- 6 do_something_with(v, user_v);
-
-
-
-
-The compiler must not be permitted to transform this source code into
-the following:
-
-
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
- 3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
- 4 v = p-value;
- 5 rcu_read_unlock();
- 6 do_something_with(v, user_v);
-
-
-
-
-If the compiler did make this transformation in a
-CONFIG_PREEMPT=n kernel build, and if get_user() did
-page fault, the result would be a quiescent state in the middle
-of an RCU read-side critical section.
-This misplaced quiescent state could result in line4 being
-a use-after-free access, which could be bad for your kernel's
-actuarial statistics.
-Similar examples can be constructed with the call to get_user()
-preceding the rcu_read_lock().
-
-
-Unfortunately, get_user() doesn't have any particular
-ordering properties, and in some architectures the underlying asm
-isn't even marked volatile.
-And even if it was marked volatile, the above access to
-p-value is not volatile, so the compiler would not have any
-reason to keep those two accesses in order.
-
-
-Therefore, the Linux-kernel definitions of rcu_read_lock()
-and rcu_read_unlock() must act as compiler barriers,
-at least for outermost instances of rcu_read_lock() and
-rcu_read_unlock() within a nested set of RCU read-side critical
-sections.
-
 Energy Efficiency
 
 
-- 
2.22.0.770.g0f2c4a37fd-goog



[PATCH 9/9] Revert "Revert "rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock()""

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit 43ddb98ebe7171ff1c6e11c1616fd03726d8e9bf while
adding the documentation that the original commit added but in ReST
format.
---
 .../RCU/Design/Requirements/Requirements.rst  | 54 +++
 kernel/rcu/tree_plugin.h  | 11 
 2 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index 0b222469d7ce..fd5e2cbc4935 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1691,6 +1691,7 @@ follows:
 #. `Hotplug CPU`_
 #. `Scheduler and RCU`_
 #. `Tracing and RCU`_
+#. `Accesses to User Memory and RCU`_
 #. `Energy Efficiency`_
 #. `Scheduling-Clock Interrupts and RCU`_
 #. `Memory Efficiency`_
@@ -2004,6 +2005,59 @@ where RCU readers execute in environments in which 
tracing cannot be
 used. The tracing folks both located the requirement and provided the
 needed fix, so this surprise requirement was relatively painless.
 
+Accesses to User Memory and RCU
+~~~
+
+The kernel needs to access user-space memory, for example, to access data
+referenced by system-call parameters.  The ``get_user()`` macro does this job.
+
+However, user-space memory might well be paged out, which means that
+``get_user()`` might well page-fault and thus block while waiting for the
+resulting I/O to complete.  It would be a very bad thing for the compiler to
+reorder a ``get_user()`` invocation into an RCU read-side critical section.
+
+For example, suppose that the source code looked like this:
+
+  ::
+
+   1 rcu_read_lock();
+   2 p = rcu_dereference(gp);
+   3 v = p->value;
+   4 rcu_read_unlock();
+   5 get_user(user_v, user_p);
+   6 do_something_with(v, user_v);
+
+The compiler must not be permitted to transform this source code into
+the following:
+
+  ::
+
+   1 rcu_read_lock();
+   2 p = rcu_dereference(gp);
+   3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
+   4 v = p->value;
+   5 rcu_read_unlock();
+   6 do_something_with(v, user_v);
+
+If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel
+build, and if ``get_user()`` did page fault, the result would be a quiescent
+state in the middle of an RCU read-side critical section.  This misplaced
+quiescent state could result in line 4 being a use-after-free access,
+which could be bad for your kernel's actuarial statistics.  Similar examples
+can be constructed with the call to ``get_user()`` preceding the
+``rcu_read_lock()``.
+
+Unfortunately, ``get_user()`` doesn't have any particular ordering properties,
+and in some architectures the underlying ``asm`` isn't even marked
+``volatile``.  And even if it was marked ``volatile``, the above access to
+``p->value`` is not volatile, so the compiler would not have any reason to keep
+those two accesses in order.
+
+Therefore, the Linux-kernel definitions of ``rcu_read_lock()`` and
+``rcu_read_unlock()`` must act as compiler barriers, at least for outermost
+instances of ``rcu_read_lock()`` and ``rcu_read_unlock()`` within a nested set
+of RCU read-side critical sections.
+
 Energy Efficiency
 ~
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index e1491d262892..379cb7e50a62 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -288,7 +288,6 @@ void rcu_note_context_switch(bool preempt)
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
 
-   barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
@@ -331,7 +330,6 @@ void rcu_note_context_switch(bool preempt)
if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
-   barrier(); /* Avoid RCU read-side critical sections leaking up. */
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
@@ -815,11 +813,6 @@ static void rcu_qs(void)
  * dyntick-idle quiescent state visible to other CPUs, which will in
  * some cases serve for expedited as well as normal grace periods.
  * Either way, register a lightweight quiescent state.
- *
- * The barrier() calls are redundant in the common case when this is
- * called externally, but just in case this is called from within this
- * file.
- *
  */
 void rcu_all_qs(void)
 {
@@ -834,14 +827,12 @@ void rcu_all_qs(void)
return;
}
this_cpu_write(rcu_data.rcu_urgent_qs, false);
-   barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();

[PATCH 3/9] Revert "treewide: Rename rcu_dereference_raw_notrace() to _check()"

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit 355e9972da81e803bbb825b76106ae9b358caf8e.
---
 Documentation/RCU/Design/Requirements/Requirements.html | 2 +-
 arch/powerpc/include/asm/kvm_book3s_64.h| 2 +-
 include/linux/rculist.h | 6 +++---
 include/linux/rcupdate.h| 2 +-
 kernel/trace/ftrace_internal.h  | 8 
 kernel/trace/trace.c| 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html
index bdbc84f1b949..5a9238a2883c 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2512,7 +2512,7 @@ disabled across the entire RCU read-side critical section.
 
 It is possible to use tracing on RCU code, but tracing itself
 uses RCU.
-For this reason, rcu_dereference_raw_check()
+For this reason, rcu_dereference_raw_notrace()
 is provided for use by tracing, which avoids the destructive
 recursion that could otherwise ensue.
 This API is also used by virtualization in some architectures,
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 04b2b927bb5a..bb7c8cc77f1a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -535,7 +535,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-   return rcu_dereference_raw_check(kvm->memslots[0]);
+   return rcu_dereference_raw_notrace(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 932296144131..e91ec9ddcd30 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define hlist_for_each_entry_rcu(pos, head, member)\
-   for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
+   for (pos = hlist_entry_safe 
(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
@@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * not do any RCU debugging or tracing.
  */
 #define hlist_for_each_entry_rcu_notrace(pos, head, member)
\
-   for (pos = 
hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
+   for (pos = hlist_entry_safe 
(rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
-   pos = 
hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
+   pos = 
hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
 
 /**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index bfcafbc1e301..8f7167478c1d 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -476,7 +476,7 @@ do {
  \
  * The no-tracing version of rcu_dereference_raw() must not call
  * rcu_read_lock_held().
  */
-#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
+#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
 
 /**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0456e0a3dab1..0515a2096f90 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
 
 /*
  * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw_check() is that elements removed from this list
+ * can use rcu_dereference_raw_notrace() is that elements removed from this 
list
  * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw_check() calls are needed to handle
+ * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
  * concurrent insertions into the ftrace_global_list.
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
 #define do_for_each_ftrace_op(op, list)\
-   op = rcu_dereference_raw_check(list);   \
+   op = rcu_dereference_raw_notrace(list); \
do
 
 /*
  * 

[PATCH 6/9] docs: rcu: Increase toctree to 3

2019-08-01 Thread Joel Fernandes (Google)
These documents are long and have various sections. Provide a good
toc nesting level.

Signed-off-by: Joel Fernandes (Google) 
---
 Documentation/RCU/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/RCU/index.rst b/Documentation/RCU/index.rst
index 94427dc1f23d..5c99185710fa 100644
--- a/Documentation/RCU/index.rst
+++ b/Documentation/RCU/index.rst
@@ -5,7 +5,7 @@ RCU concepts
 
 
 .. toctree::
-   :maxdepth: 1
+   :maxdepth: 3
 
rcu
listRCU
-- 
2.22.0.770.g0f2c4a37fd-goog



[PATCH 8/9] Revert "Revert "rcu: Add support for consolidated-RCU reader checking""

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit 24be1727c524b5874d5dc7828cd392cf86c3341e.
---
 include/linux/rculist.h  | 32 
 include/linux/rcupdate.h |  7 +
 kernel/rcu/Kconfig.debug | 11 +++
 kernel/rcu/update.c  | 65 ++--
 4 files changed, 88 insertions(+), 27 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 932296144131..4158b7212936 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,6 +40,24 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)(*((struct list_head __rcu **)(&(list)->next)))
 
+/*
+ * Check during list traversal that we are within an RCU reader
+ */
+
+#define check_arg_count_one(dummy)
+
+#ifdef CONFIG_PROVE_RCU_LIST
+#define __list_check_rcu(dummy, cond, extra...)
\
+   ({  \
+   check_arg_count_one(extra); \
+   RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),\
+"RCU-list traversed in non-reader section!");  \
+})
+#else
+#define __list_check_rcu(dummy, cond, extra...)
\
+   ({ check_arg_count_one(extra); })
+#endif
+
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -343,14 +361,16 @@ static inline void list_splice_tail_init_rcu(struct 
list_head *list,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the list_head within the struct.
+ * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member) \
-   for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
-   &pos->member != (head); \
+#define list_for_each_entry_rcu(pos, head, member, cond...)\
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+pos = list_entry_rcu((head)->next, typeof(*pos), member);  \
+   &pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
@@ -616,13 +636,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the hlist_node within the struct.
+ * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(pos, head, member)\
-   for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
+#define hlist_for_each_entry_rcu(pos, head, member, cond...)   \
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index bfcafbc1e301..80d6056f5855 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -221,6 +221,7 @@ int debug_lockdep_rcu_enabled(void);
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
 int rcu_read_lock_sched_held(void);
+int rcu_read_lock_any_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -241,6 +242,12 @@ static inline int rcu_read_lock_sched_held(void)
 {
return !preemptible();
 }
+
+static inline int rcu_read_lock_any_held(void)
+{
+   return !preemptible();
+}
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..4aa02eee8f6c 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
 config PROVE_RCU
def_bool PROVE_LOCKING
 
+config PROVE_RCU_LIST
+   bool "RCU list lockdep debugging"
+   depends on PROVE_RCU && RCU_EXPERT
+   default n
+   help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove 

[PATCH 5/9] docs: rcu: Correct links referring to titles

2019-08-01 Thread Joel Fernandes (Google)
Mauro's auto conversion broke these links; fix them.

Signed-off-by: Joel Fernandes (Google) 
---
 .../Tree-RCU-Memory-Ordering.rst  | 17 ++--
 .../RCU/Design/Requirements/Requirements.rst  | 90 ---
 2 files changed, 47 insertions(+), 60 deletions(-)

diff --git 
a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst 
b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
index 1011b5db1b3d..248b1222f918 100644
--- a/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
+++ b/Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
@@ -230,15 +230,14 @@ Tree RCU Grace Period Memory Ordering Components
 Tree RCU's grace-period memory-ordering guarantee is provided by a
 number of RCU components:
 
-#. `Callback Registry <#Callback%20Registry>`__
-#. `Grace-Period Initialization <#Grace-Period%20Initialization>`__
-#. `Self-Reported Quiescent
-   States <#Self-Reported%20Quiescent%20States>`__
-#. `Dynamic Tick Interface <#Dynamic%20Tick%20Interface>`__
-#. `CPU-Hotplug Interface <#CPU-Hotplug%20Interface>`__
-#. `Forcing Quiescent States <#Forcing%20Quiescent%20States>`__
-#. `Grace-Period Cleanup <#Grace-Period%20Cleanup>`__
-#. `Callback Invocation <#Callback%20Invocation>`__
+#. `Callback Registry`_
+#. `Grace-Period Initialization`_
+#. `Self-Reported Quiescent States`_
+#. `Dynamic Tick Interface`_
+#. `CPU-Hotplug Interface`_
+#. `Forcing Quiescent States`_
+#. `Grace-Period Cleanup`_
+#. `Callback Invocation`_
 
 Each of the following section looks at the corresponding component in
 detail.
diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index 876e0038bb58..a33b5fb331b4 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -36,16 +36,14 @@ technologies in interesting new ways.
 All that aside, here are the categories of currently known RCU
 requirements:
 
-#. `Fundamental Requirements <#Fundamental%20Requirements>`__
-#. `Fundamental Non-Requirements <#Fundamental%20Non-Requirements>`__
-#. `Parallelism Facts of Life <#Parallelism%20Facts%20of%20Life>`__
-#. `Quality-of-Implementation
-   Requirements <#Quality-of-Implementation%20Requirements>`__
-#. `Linux Kernel Complications <#Linux%20Kernel%20Complications>`__
-#. `Software-Engineering
-   Requirements <#Software-Engineering%20Requirements>`__
-#. `Other RCU Flavors <#Other%20RCU%20Flavors>`__
-#. `Possible Future Changes <#Possible%20Future%20Changes>`__
+#. `Fundamental Requirements`_
+#. `Fundamental Non-Requirements`_
+#. `Parallelism Facts of Life`_
+#. `Quality-of-Implementation Requirements`_
+#. `Linux Kernel Complications`_
+#. `Software-Engineering Requirements`_
+#. `Other RCU Flavors`_
+#. `Possible Future Changes`_
 
 This is followed by a `summary <#Summary>`__, however, the answers to
 each quick quiz immediately follows the quiz. Select the big white space
@@ -57,13 +55,11 @@ Fundamental Requirements
 RCU's fundamental requirements are the closest thing RCU has to hard
 mathematical requirements. These are:
 
-#. `Grace-Period Guarantee <#Grace-Period%20Guarantee>`__
-#. `Publish-Subscribe Guarantee <#Publish-Subscribe%20Guarantee>`__
-#. `Memory-Barrier Guarantees <#Memory-Barrier%20Guarantees>`__
-#. `RCU Primitives Guaranteed to Execute
-   Unconditionally 
<#RCU%20Primitives%20Guaranteed%20to%20Execute%20Unconditionally>`__
-#. `Guaranteed Read-to-Write
-   Upgrade <#Guaranteed%20Read-to-Write%20Upgrade>`__
+#. `Grace-Period Guarantee`_
+#. `Publish/Subscribe Guarantee`_
+#. `Memory-Barrier Guarantees`_
+#. `RCU Primitives Guaranteed to Execute Unconditionally`_
+#. `Guaranteed Read-to-Write Upgrade`_
 
 Grace-Period Guarantee
 ~~
@@ -689,16 +685,11 @@ infinitely long, however, the following sections list a 
few
 non-guarantees that have caused confusion. Except where otherwise noted,
 these non-guarantees were premeditated.
 
-#. `Readers Impose Minimal
-   Ordering <#Readers%20Impose%20Minimal%20Ordering>`__
-#. `Readers Do Not Exclude
-   Updaters <#Readers%20Do%20Not%20Exclude%20Updaters>`__
-#. `Updaters Only Wait For Old
-   Readers <#Updaters%20Only%20Wait%20For%20Old%20Readers>`__
-#. `Grace Periods Don't Partition Read-Side Critical
-   Sections 
<#Grace%20Periods%20Don't%20Partition%20Read-Side%20Critical%20Sections>`__
-#. `Read-Side Critical Sections Don't Partition Grace
-   Periods 
<#Read-Side%20Critical%20Sections%20Don't%20Partition%20Grace%20Periods>`__
+#. `Readers Impose Minimal Ordering`_
+#. `Readers Do Not Exclude Updaters`_
+#. `Updaters Only Wait For Old Readers`_
+#. `Grace Periods Don't Partition Read-Side Critical Sections`_
+#. `Read-Side Critical Sections Don't Partition Grace Periods`_
 
 Readers Impose Minimal Ordering
 ~~~
@@ -

[PATCH 7/9] Revert "Revert "treewide: Rename rcu_dereference_raw_notrace() to _check()""

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit 61d814760f1d2dffdc8db636f70bbef07c30acd5.
---
 Documentation/RCU/Design/Requirements/Requirements.rst | 2 +-
 arch/powerpc/include/asm/kvm_book3s_64.h   | 2 +-
 include/linux/rculist.h| 6 +++---
 include/linux/rcupdate.h   | 2 +-
 kernel/trace/ftrace_internal.h | 8 
 kernel/trace/trace.c   | 4 ++--
 6 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst 
b/Documentation/RCU/Design/Requirements/Requirements.rst
index a33b5fb331b4..0b222469d7ce 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.rst
+++ b/Documentation/RCU/Design/Requirements/Requirements.rst
@@ -1997,7 +1997,7 @@ Tracing and RCU
 ~~~
 
 It is possible to use tracing on RCU code, but tracing itself uses RCU.
-For this reason, ``rcu_dereference_raw_notrace()`` is provided for use
+For this reason, ``rcu_dereference_raw_check()`` is provided for use
 by tracing, which avoids the destructive recursion that could otherwise
 ensue. This API is also used by virtualization in some architectures,
 where RCU readers execute in environments in which tracing cannot be
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index bb7c8cc77f1a..04b2b927bb5a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -535,7 +535,7 @@ static inline void note_hpte_modification(struct kvm *kvm,
  */
 static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 {
-   return rcu_dereference_raw_notrace(kvm->memslots[0]);
+   return rcu_dereference_raw_check(kvm->memslots[0]);
 }
 
 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e91ec9ddcd30..932296144131 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -622,7 +622,7 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define hlist_for_each_entry_rcu(pos, head, member)\
-   for (pos = hlist_entry_safe 
(rcu_dereference_raw(hlist_first_rcu(head)),\
+   for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
@@ -642,10 +642,10 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * not do any RCU debugging or tracing.
  */
 #define hlist_for_each_entry_rcu_notrace(pos, head, member)
\
-	for (pos = hlist_entry_safe (rcu_dereference_raw_notrace(hlist_first_rcu(head)),\
+	for (pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
-	pos = hlist_entry_safe(rcu_dereference_raw_notrace(hlist_next_rcu(\
+	pos = hlist_entry_safe(rcu_dereference_raw_check(hlist_next_rcu(\
&(pos)->member)), typeof(*(pos)), member))
 
 /**
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 8f7167478c1d..bfcafbc1e301 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -476,7 +476,7 @@ do {									      \
  * The no-tracing version of rcu_dereference_raw() must not call
  * rcu_read_lock_held().
  */
-#define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
+#define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu)
 
 /**
  * rcu_dereference_protected() - fetch RCU pointer when updates prevented
diff --git a/kernel/trace/ftrace_internal.h b/kernel/trace/ftrace_internal.h
index 0515a2096f90..0456e0a3dab1 100644
--- a/kernel/trace/ftrace_internal.h
+++ b/kernel/trace/ftrace_internal.h
@@ -6,22 +6,22 @@
 
 /*
  * Traverse the ftrace_global_list, invoking all entries.  The reason that we
- * can use rcu_dereference_raw_notrace() is that elements removed from this list
+ * can use rcu_dereference_raw_check() is that elements removed from this list
  * are simply leaked, so there is no need to interact with a grace-period
- * mechanism.  The rcu_dereference_raw_notrace() calls are needed to handle
+ * mechanism.  The rcu_dereference_raw_check() calls are needed to handle
  * concurrent insertions into the ftrace_global_list.
  *
  * Silly Alpha and silly pointer-speculation compiler optimizations!
  */
 #define do_for_each_ftrace_op(op, list)\
-   op = rcu_dereference_raw_notrace(list); \
+   op = 

[PATCH 2/9] Revert "rcu: Add support for consolidated-RCU reader checking"

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit 50ad3f1f9b13c8a6f2ae79df4cecb2c21da1c7c8.
---
 include/linux/rculist.h  | 32 
 include/linux/rcupdate.h |  7 -
 kernel/rcu/Kconfig.debug | 11 ---
 kernel/rcu/update.c  | 65 ++--
 4 files changed, 27 insertions(+), 88 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index 4158b7212936..932296144131 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,24 +40,6 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)(*((struct list_head __rcu **)(&(list)->next)))
 
-/*
- * Check during list traversal that we are within an RCU reader
- */
-
-#define check_arg_count_one(dummy)
-
-#ifdef CONFIG_PROVE_RCU_LIST
-#define __list_check_rcu(dummy, cond, extra...)			\
-   ({  \
-   check_arg_count_one(extra); \
-   RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),\
-"RCU-list traversed in non-reader section!");  \
-})
-#else
-#define __list_check_rcu(dummy, cond, extra...)			\
-   ({ check_arg_count_one(extra); })
-#endif
-
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -361,16 +343,14 @@ static inline void list_splice_tail_init_rcu(struct 
list_head *list,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the list_head within the struct.
- * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member, cond...)\
-   for (__list_check_rcu(dummy, ## cond, 0),   \
-pos = list_entry_rcu((head)->next, typeof(*pos), member);  \
-		&pos->member != (head); \
+#define list_for_each_entry_rcu(pos, head, member) \
+   for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
+		&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
@@ -636,15 +616,13 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the hlist_node within the struct.
- * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(pos, head, member, cond...)   \
-   for (__list_check_rcu(dummy, ## cond, 0),   \
-pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
+#define hlist_for_each_entry_rcu(pos, head, member)\
+   for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 80d6056f5855..bfcafbc1e301 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -221,7 +221,6 @@ int debug_lockdep_rcu_enabled(void);
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
 int rcu_read_lock_sched_held(void);
-int rcu_read_lock_any_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -242,12 +241,6 @@ static inline int rcu_read_lock_sched_held(void)
 {
return !preemptible();
 }
-
-static inline int rcu_read_lock_any_held(void)
-{
-   return !preemptible();
-}
-
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 4aa02eee8f6c..5ec3ea4028e2 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,17 +8,6 @@ menu "RCU Debugging"
 config PROVE_RCU
def_bool PROVE_LOCKING
 
-config PROVE_RCU_LIST
-   bool "RCU list lockdep debugging"
-   depends on PROVE_RCU && RCU_EXPERT
-   default n
-   help
- Enable RCU lockdep checking for list usages. By default it is
- turned off since there are several list RCU users that still
- need to be converted to pass a lockdep expression. To prevent
- false-positive splats, we keep it default disabled but once all
- users are converted, we can remove 

[PATCH 0/9] Apply new rest conversion patches to /dev branch

2019-08-01 Thread Joel Fernandes (Google)
This series fixes the rcu/dev branch so it can apply the new ReST conversion 
patches.

Patches based on "00ec8f46465e  rcu/nohz: Make multi_cpu_stop() enable tick on
all online CPUs"

The easiest way to do this is to revert the patches that conflict, apply the
doc patches, and then re-apply the reverted patches. In the re-application,
the changes are converted to apply to the new ReST documentation.

No manual fix ups were done in this process, other than to documentation.

thanks,

 - Joel

And in the process I learnt about get_user() and compiler barriers ;-)

Joel Fernandes (Google) (8):
Revert "rcu: Restore barrier() to rcu_read_lock() and
rcu_read_unlock()"
Revert "rcu: Add support for consolidated-RCU reader checking"
Revert "treewide: Rename rcu_dereference_raw_notrace() to _check()"
docs: rcu: Correct links referring to titles
docs: rcu: Increase toctree to 3
Revert "Revert "treewide: Rename rcu_dereference_raw_notrace() to
_check()""
Revert "Revert "rcu: Add support for consolidated-RCU reader
checking""
Revert "Revert "rcu: Restore barrier() to rcu_read_lock() and
rcu_read_unlock()""

Mauro Carvalho Chehab (1):
docs: rcu: convert some articles from html to ReST

.../Data-Structures/Data-Structures.html  | 1391 ---
.../Data-Structures/Data-Structures.rst   | 1163 ++
.../Expedited-Grace-Periods.html  |  668 
.../Expedited-Grace-Periods.rst   |  521 +++
.../Memory-Ordering/Tree-RCU-Diagram.html |9 -
.../Tree-RCU-Memory-Ordering.html |  704 
.../Tree-RCU-Memory-Ordering.rst  |  624 +++
.../RCU/Design/Requirements/Requirements.html | 3401 -
.../RCU/Design/Requirements/Requirements.rst  | 2704 +
Documentation/RCU/index.rst   |7 +-
Documentation/RCU/whatisRCU.txt   |4 +-
11 files changed, 5020 insertions(+), 6176 deletions(-)
delete mode 100644 Documentation/RCU/Design/Data-Structures/Data-Structures.html
create mode 100644 Documentation/RCU/Design/Data-Structures/Data-Structures.rst
delete mode 100644 
Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.html
create mode 100644 
Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst
delete mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Diagram.html
delete mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.html
create mode 100644 
Documentation/RCU/Design/Memory-Ordering/Tree-RCU-Memory-Ordering.rst
delete mode 100644 Documentation/RCU/Design/Requirements/Requirements.html
create mode 100644 Documentation/RCU/Design/Requirements/Requirements.rst

--
2.22.0.770.g0f2c4a37fd-goog



[PATCH 1/9] Revert "rcu: Restore barrier() to rcu_read_lock() and rcu_read_unlock()"

2019-08-01 Thread Joel Fernandes (Google)
This reverts commit d6b9cd7dc8e041ee83cb1362fce59a3cdb1f2709.
---
 .../RCU/Design/Requirements/Requirements.html | 71 ---
 kernel/rcu/tree_plugin.h  | 11 +++
 2 files changed, 11 insertions(+), 71 deletions(-)

diff --git a/Documentation/RCU/Design/Requirements/Requirements.html 
b/Documentation/RCU/Design/Requirements/Requirements.html
index 467251f7fef6..bdbc84f1b949 100644
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -2129,8 +2129,6 @@ Some of the relevant points of interest are as follows:
Hotplug CPU.
Scheduler and RCU.
Tracing and RCU.
-   
-Accesses to User Memory and RCU.
Energy Efficiency.

Scheduling-Clock Interrupts and RCU.
@@ -2523,75 +2521,6 @@ cannot be used.
 The tracing folks both located the requirement and provided the
 needed fix, so this surprise requirement was relatively painless.
 
-
-Accesses to User Memory and RCU
-
-
-The kernel needs to access user-space memory, for example, to access
-data referenced by system-call parameters.
-The get_user() macro does this job.
-
-
-However, user-space memory might well be paged out, which means
-that get_user() might well page-fault and thus block while
-waiting for the resulting I/O to complete.
-It would be a very bad thing for the compiler to reorder
-a get_user() invocation into an RCU read-side critical
-section.
-For example, suppose that the source code looked like this:
-
-
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
 3 v = p->value;
- 4 rcu_read_unlock();
- 5 get_user(user_v, user_p);
- 6 do_something_with(v, user_v);
-
-
-
-
-The compiler must not be permitted to transform this source code into
-the following:
-
-
-
- 1 rcu_read_lock();
- 2 p = rcu_dereference(gp);
- 3 get_user(user_v, user_p); // BUG: POSSIBLE PAGE FAULT!!!
 4 v = p->value;
- 5 rcu_read_unlock();
- 6 do_something_with(v, user_v);
-
-
-
-
-If the compiler did make this transformation in a
-CONFIG_PREEMPT=n kernel build, and if get_user() did
-page fault, the result would be a quiescent state in the middle
-of an RCU read-side critical section.
-This misplaced quiescent state could result in line 4 being
-a use-after-free access, which could be bad for your kernel's
-actuarial statistics.
-Similar examples can be constructed with the call to get_user()
-preceding the rcu_read_lock().
-
-
-Unfortunately, get_user() doesn't have any particular
-ordering properties, and in some architectures the underlying asm
-isn't even marked volatile.
-And even if it was marked volatile, the above access to
-p->value is not volatile, so the compiler would not have any
-reason to keep those two accesses in order.
-
-
-Therefore, the Linux-kernel definitions of rcu_read_lock()
-and rcu_read_unlock() must act as compiler barriers,
-at least for outermost instances of rcu_read_lock() and
-rcu_read_unlock() within a nested set of RCU read-side critical
-sections.
-
 Energy Efficiency
 
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 379cb7e50a62..e1491d262892 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -288,6 +288,7 @@ void rcu_note_context_switch(bool preempt)
	struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
struct rcu_node *rnp;
 
+   barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0);
@@ -330,6 +331,7 @@ void rcu_note_context_switch(bool preempt)
if (rdp->exp_deferred_qs)
rcu_report_exp_rdp(rdp);
trace_rcu_utilization(TPS("End context switch"));
+   barrier(); /* Avoid RCU read-side critical sections leaking up. */
 }
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
@@ -813,6 +815,11 @@ static void rcu_qs(void)
  * dyntick-idle quiescent state visible to other CPUs, which will in
  * some cases serve for expedited as well as normal grace periods.
  * Either way, register a lightweight quiescent state.
+ *
+ * The barrier() calls are redundant in the common case when this is
+ * called externally, but just in case this is called from within this
+ * file.
+ *
  */
 void rcu_all_qs(void)
 {
@@ -827,12 +834,14 @@ void rcu_all_qs(void)
return;
}
this_cpu_write(rcu_data.rcu_urgent_qs, false);
+   barrier(); /* Avoid RCU read-side critical sections leaking down. */
if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
local_irq_save(flags);
rcu_momentary_dyntick_idle();
local_irq_restore(flags);
}
rcu_qs();
+   barrier(); /* Avoid RCU read-side critical sections leaking up. */
preempt_enable();
 }
 EXPORT_SYMBOL_GPL(rcu_all_qs);
@@ -842,6 +851,7 @@ EXPORT_SYMBOL_GPL(rcu_all_qs);
  */
 void rcu_note_context_switch(bool preempt)
 {
+   

[PATCH] Use term cumul-fence instead of fence in ->prop ordering example

2019-07-29 Thread Joel Fernandes (Google)
To reduce ambiguity in the more exotic ->prop ordering example, let us
use the term cumul-fence instead of fence for the 2 fences, so that the
implicit ->rfe on loads/stores to Y is covered by the description.

Link: https://lore.kernel.org/lkml/20190729121745.ga140...@google.com

Suggested-by: Alan Stern 
Signed-off-by: Joel Fernandes (Google) 
---
 tools/memory-model/Documentation/explanation.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/memory-model/Documentation/explanation.txt 
b/tools/memory-model/Documentation/explanation.txt
index 68caa9a976d0..634dc6db26c4 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1302,7 +1302,7 @@ followed by an arbitrary number of cumul-fence links, 
ending with an
 rfe link.  You can concoct more exotic examples, containing more than
 one fence, although this quickly leads to diminishing returns in terms
 of complexity.  For instance, here's an example containing a coe link
-followed by two fences and an rfe link, utilizing the fact that
+followed by two cumul-fences and an rfe link, utilizing the fact that
 release fences are A-cumulative:
 
int x, y, z;
@@ -1334,10 +1334,10 @@ If x = 2, r0 = 1, and r2 = 1 after this code runs then 
there is a prop
 link from P0's store to its load.  This is because P0's store gets
 overwritten by P1's store since x = 2 at the end (a coe link), the
 smp_wmb() ensures that P1's store to x propagates to P2 before the
-store to y does (the first fence), the store to y propagates to P2
+store to y does (the first cumul-fence), the store to y propagates to P2
 before P2's load and store execute, P2's smp_store_release()
 guarantees that the stores to x and y both propagate to P0 before the
-store to z does (the second fence), and P0's load executes after the
+store to z does (the second cumul-fence), and P0's load executes after the
 store to z has propagated to P0 (an rfe link).
 
 In summary, the fact that the hb relation links memory access events
-- 
2.22.0.709.g102302147b-goog



[PATCH v2] lkmm/docs: Correct ->prop example with additional rfe link

2019-07-27 Thread Joel Fernandes (Google)
The lkmm example about ->prop relation should describe an additional rfe
link between P1's store to y and P2's load of y, which should be
critical to establishing the ordering resulting in the ->prop ordering
on P0. IOW, there are 2 rfe links, not one.

Correct these in the docs to make the ->prop ordering on P0 more clear.
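
For reference, here is a sketch of the three-process example that the
updated text below walks through, reconstructed from the prose of this
change; the process and variable names (P0/P1/P2, x, y, z) are assumed to
match the listing in explanation.txt:

	int x, y, z;

	void P0(void)
	{
		int r0;

		WRITE_ONCE(x, 1);
		r0 = READ_ONCE(z);
	}

	void P1(void)
	{
		WRITE_ONCE(x, 2);
		smp_wmb();
		WRITE_ONCE(y, 1);
	}

	void P2(void)
	{
		int r2;

		r2 = READ_ONCE(y);
		smp_store_release(&z, 1);
	}

The outcome of interest is x == 2 && r0 == 1 && r2 == 1: a coe link (P1's
store to x overwrites P0's), the smp_wmb() fence, an rfe link from P1's
store to y to P2's load of y, the smp_store_release() fence, and a final
rfe link from P2's store to z to P0's load of z.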

Cc: kernel-t...@android.com
Reviewed-by: Boqun Feng 
Signed-off-by: Joel Fernandes (Google) 
---
 .../memory-model/Documentation/explanation.txt  | 17 ++---
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/tools/memory-model/Documentation/explanation.txt 
b/tools/memory-model/Documentation/explanation.txt
index 68caa9a976d0..aa84fce854cc 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1302,8 +1302,8 @@ followed by an arbitrary number of cumul-fence links, 
ending with an
 rfe link.  You can concoct more exotic examples, containing more than
 one fence, although this quickly leads to diminishing returns in terms
 of complexity.  For instance, here's an example containing a coe link
-followed by two fences and an rfe link, utilizing the fact that
-release fences are A-cumulative:
+followed by a fence, an rfe link, another fence and a final rfe link,
+utilizing the fact that release fences are A-cumulative:
 
int x, y, z;
 
@@ -1334,11 +1334,14 @@ If x = 2, r0 = 1, and r2 = 1 after this code runs then 
there is a prop
 link from P0's store to its load.  This is because P0's store gets
 overwritten by P1's store since x = 2 at the end (a coe link), the
 smp_wmb() ensures that P1's store to x propagates to P2 before the
-store to y does (the first fence), the store to y propagates to P2
-before P2's load and store execute, P2's smp_store_release()
-guarantees that the stores to x and y both propagate to P0 before the
-store to z does (the second fence), and P0's load executes after the
-store to z has propagated to P0 (an rfe link).
+store to y does (the first fence), P2's store to y happens before P2's
+load of y (rfe link), P2's smp_store_release() ensures that P2's load
+of y executes before P2's store to z (second fence), which implies that
+stores to x and y propagate to P2 before the smp_store_release(), which
+means that P2's smp_store_release() will propagate stores to x and y to all
+CPUs before the store to z propagates (A-cumulative property of this fence).
+Finally P0's load of z executes after P2's store to z has propagated to
+P0 (rfe link).
 
 In summary, the fact that the hb relation links memory access events
 in the order they execute means that it must not have cycles.  This
-- 
2.22.0.709.g102302147b-goog



[PATCH] docs/lkmm: Correct ->prop example with additional rfe link

2019-07-27 Thread Joel Fernandes (Google)
This lkmm example should describe an additional rfe link between P1's
store to y and P2's load of y, which should be critical to establishing
the ordering resulting in the ->prop ordering on P0. IOW, there are 2 rfe
links, not one.

Correct these in the docs to make the ->prop ordering in P0 more clear.

Signed-off-by: Joel Fernandes (Google) 
---
 tools/memory-model/Documentation/explanation.txt | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/tools/memory-model/Documentation/explanation.txt 
b/tools/memory-model/Documentation/explanation.txt
index 68caa9a976d0..6c0dfaac7f04 100644
--- a/tools/memory-model/Documentation/explanation.txt
+++ b/tools/memory-model/Documentation/explanation.txt
@@ -1302,8 +1302,8 @@ followed by an arbitrary number of cumul-fence links, 
ending with an
 rfe link.  You can concoct more exotic examples, containing more than
 one fence, although this quickly leads to diminishing returns in terms
 of complexity.  For instance, here's an example containing a coe link
-followed by two fences and an rfe link, utilizing the fact that
-release fences are A-cumulative:
+followed by a fence, an rfe link, another fence and a final rfe link,
+utilizing the fact that release fences are A-cumulative:
 
int x, y, z;
 
@@ -1334,11 +1334,13 @@ If x = 2, r0 = 1, and r2 = 1 after this code runs then 
there is a prop
 link from P0's store to its load.  This is because P0's store gets
 overwritten by P1's store since x = 2 at the end (a coe link), the
 smp_wmb() ensures that P1's store to x propagates to P2 before the
-store to y does (the first fence), the store to y propagates to P2
-before P2's load and store execute, P2's smp_store_release()
-guarantees that the stores to x and y both propagate to P0 before the
-store to z does (the second fence), and P0's load executes after the
-store to z has propagated to P0 (an rfe link).
+store to y does (the first fence), P2's store to y happens before P2's
+load of y (rfe link), P2's smp_store_release() ensures that P2's load
+of y executes before P2's store of z (second fence), which also would
+imply that stores to x and y happen before the smp_store_release(), which
+means that P2's smp_store_release() will propagate stores to x and y to all
+CPUs before the store to z does (A-cumulative property of this fence).
+Finally P0's load executes after store to z has propagated to P0 (rfe link).
 
 In summary, the fact that the hb relation links memory access events
 in the order they execute means that it must not have cycles.  This
-- 
2.22.0.709.g102302147b-goog



[PATCH] pidfd: Add warning if exit_state is 0 during notification

2019-07-24 Thread Joel Fernandes (Google)
Previously a condition got missed where the pidfd waiters are awakened
before the exit_state gets set. This can result in a missed notification
[1] and the polling thread waiting forever.

It is fixed now; however, it would be nice to avoid this kind of issue
going unnoticed in the future, so add a warning to catch it.

[1] https://lore.kernel.org/lkml/20190717172100.261204-1-j...@joelfernandes.org/

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/signal.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/signal.c b/kernel/signal.c
index 91b789dd6e72..349f5a67f100 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1885,6 +1885,7 @@ static void do_notify_pidfd(struct task_struct *task)
 {
struct pid *pid;
 
+   WARN_ON(task->exit_state == 0);
pid = task_pid(task);
	wake_up_all(&pid->wait_pidfd);
 }
-- 
2.22.0.657.g960e92d24f-goog



[PATCH v1 1/2] mm/page_idle: Add support for per-pid page_idle using virtual indexing

2019-07-22 Thread Joel Fernandes (Google)
The page_idle tracking feature currently requires looking up the pagemap
for a process followed by interacting with /sys/kernel/mm/page_idle.
This is quite cumbersome and can be error-prone too. If something changes
with the page between accessing the per-PID pagemap and the global
page_idle bitmap, the information is not accurate. Moreover, looking up
the PFN from the pagemap on Android devices is not supported for
unprivileged processes: it requires SYS_ADMIN and otherwise gives 0 for
the PFN.

This patch adds support to directly interact with page_idle tracking at
the PID level by introducing a /proc//page_idle file. This
eliminates the need for userspace to calculate the mapping of the page.
It follows the exact same semantics as the global
/sys/kernel/mm/page_idle; however, it is easier to use for some use cases
where looking up the PFN is not needed and it also does not require
SYS_ADMIN. It ended up simplifying userspace code, solves the security
issue mentioned, and works quite well. SELinux does not need to be turned
off since no pagemap lookup is needed.

In Android, we are using this for the heap profiler (heapprofd) which
profiles and pin points code paths which allocates and leaves memory
idle for long periods of time.

Documentation material:
The idle page tracking API for virtual address indexing using virtual page
frame numbers (VFN) is located at /proc//page_idle. It is a bitmap
that follows the same semantics as /sys/kernel/mm/page_idle/bitmap
except that it uses virtual instead of physical frame numbers.

This idle page tracking API can be simpler to use than physical address
indexing, since the pagemap for a process does not need to be looked up
to mark or read a page's idle bit. It is also more accurate than
physical address indexing since in physical address indexing, address
space changes can occur between reading the pagemap and reading the
bitmap. In virtual address indexing, the process's mmap_sem is held for
the duration of the access.
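
As a usage illustration (not part of this patch), below is a minimal
userspace sketch that checks whether a single virtual page of a target
process is idle. It assumes the layout described above, i.e. one bit per
virtual frame number packed into 8-byte words, mirroring
/sys/kernel/mm/page_idle/bitmap; the program and helper names are made up:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	/* Returns 1 if the page containing vaddr is idle, 0 if not, -1 on error. */
	static int page_is_idle(int fd, unsigned long vaddr)
	{
		unsigned long vfn = vaddr / sysconf(_SC_PAGESIZE);
		uint64_t word;

		/* Each 8-byte word of the bitmap covers 64 consecutive VFNs. */
		if (pread(fd, &word, sizeof(word), (vfn / 64) * 8) != sizeof(word))
			return -1;
		return !!(word & (1ULL << (vfn % 64)));
	}

	int main(int argc, char **argv)
	{
		char path[64];
		int fd;

		if (argc != 3) {
			fprintf(stderr, "usage: %s <pid> <hex-vaddr>\n", argv[0]);
			return 1;
		}
		snprintf(path, sizeof(path), "/proc/%s/page_idle", argv[1]);
		fd = open(path, O_RDONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		printf("idle=%d\n", page_is_idle(fd, strtoul(argv[2], NULL, 16)));
		close(fd);
		return 0;
	}

Marking a page idle would presumably use the same offset calculation with a
pwrite() of a word that has the corresponding bit set.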

Cc: vdavydov@gmail.com
Cc: Brendan Gregg 
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 

---
Internal review -> v1:
Fixes from Suren.
Corrections to change log, docs (Florian, Sandeep)

 fs/proc/base.c|   3 +
 fs/proc/internal.h|   1 +
 fs/proc/task_mmu.c|  57 +++
 include/linux/page_idle.h |   4 +
 mm/page_idle.c| 305 +-
 5 files changed, 330 insertions(+), 40 deletions(-)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 77eb628ecc7f..a58dd74606e9 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3021,6 +3021,9 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("smaps",  S_IRUGO, proc_pid_smaps_operations),
REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
REG("pagemap",S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+   REG("page_idle", S_IRUSR|S_IWUSR, proc_page_idle_operations),
+#endif
 #endif
 #ifdef CONFIG_SECURITY
DIR("attr",   S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, 
proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index cd0c8d5ce9a1..bc9371880c63 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -293,6 +293,7 @@ extern const struct file_operations 
proc_pid_smaps_operations;
 extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_page_idle_operations;
 
 extern unsigned long task_vsize(struct mm_struct *);
 extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4d2b860dbc3f..11ccc53da38e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1642,6 +1642,63 @@ const struct file_operations proc_pagemap_operations = {
.open   = pagemap_open,
.release= pagemap_release,
 };
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+static ssize_t proc_page_idle_read(struct file *file, char __user *buf,
+  size_t count, loff_t *ppos)
+{
+   int ret;
+   struct task_struct *tsk = get_proc_task(file_inode(file));
+
+   if (!tsk)
+   return -EINVAL;
+   ret = page_idle_proc_read(file, buf, count, ppos, tsk);
+   put_task_struct(tsk);
+   return ret;
+}
+
+static ssize_t proc_page_idle_write(struct file *file, const char __user *buf,
+size_t count, loff_t *ppos)
+{
+   int ret;
+   struct task_struct *tsk = get_proc_task(file_inode(file));
+
+   if (!tsk)
+   return -EINVAL;
+   ret = page_idle_proc_write(file, (char __user *)buf, count, ppos, tsk);
+   put_task_struct(tsk);
+   return ret;
+}
+
+static int proc_page_idle_open(struct inode *inode, struct file *file)
+{
+   str

[PATCH RFC v1] pidfd: fix a race in setting exit_state for pidfd polling

2019-07-17 Thread Joel Fernandes (Google)
From: Suren Baghdasaryan 

There is a race between reading task->exit_state in pidfd_poll and writing
it after do_notify_parent calls do_notify_pidfd. Expected sequence of
events is:

CPU 0CPU 1

exit_notify
  do_notify_parent
do_notify_pidfd
  tsk->exit_state = EXIT_DEAD
  pidfd_poll
 if (tsk->exit_state)

However nothing prevents the following sequence:

CPU 0CPU 1

exit_notify
  do_notify_parent
do_notify_pidfd
   pidfd_poll
  if (tsk->exit_state)
  tsk->exit_state = EXIT_DEAD

This causes a polling task to wait forever, since poll blocks because
exit_state is 0 and the waiting task is not notified again. A stress
test continuously doing pidfd poll and process exits uncovered this bug,
and the below patch fixes it.

To fix this, we set tsk->exit_state before calling do_notify_pidfd.

Cc: kernel-t...@android.com
Signed-off-by: Suren Baghdasaryan 
Signed-off-by: Joel Fernandes (Google) 

---
 kernel/exit.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index a75b6a7f458a..740ceacb4b76 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int 
group_dead)
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
 
+   tsk->exit_state = EXIT_ZOMBIE;
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&
@@ -1156,10 +1157,11 @@ static int wait_task_zombie(struct wait_opts *wo, 
struct task_struct *p)
ptrace_unlink(p);
 
/* If parent wants a zombie, don't release it now */
-   state = EXIT_ZOMBIE;
+   p->exit_state = EXIT_ZOMBIE;
if (do_notify_parent(p, p->exit_signal))
-   state = EXIT_DEAD;
-   p->exit_state = state;
+   p->exit_state = EXIT_DEAD;
+
+   state = p->exit_state;
		write_unlock_irq(&tasklist_lock);
}
if (state == EXIT_DEAD)
-- 
2.22.0.657.g960e92d24f-goog



[PATCH 4/9] ipv4: add lockdep condition to fix for_each_entry (v1)

2019-07-15 Thread Joel Fernandes (Google)
Using the support added earlier in this series, add lockdep conditions
to the list usage here.

Signed-off-by: Joel Fernandes (Google) 
---
 net/ipv4/fib_frontend.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 317339cd7f03..26b0fb24e2c2 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -124,7 +124,8 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
 
	head = &net->ipv4.fib_table_hash[h];
-   hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+   hlist_for_each_entry_rcu(tb, head, tb_hlist,
+lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 6/9] workqueue: Convert for_each_wq to use built-in list check (v2)

2019-07-15 Thread Joel Fernandes (Google)
list_for_each_entry_rcu() now has support to check for RCU reader sections
as well as locks. Just use that support instead of explicitly checking in
the caller.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/workqueue.c | 10 ++
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 601d61150b65..e882477ebf6e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -364,11 +364,6 @@ static void workqueue_sysfs_unregister(struct 
workqueue_struct *wq);
		 !lockdep_is_held(&wq_pool_mutex),	\
 "RCU or wq_pool_mutex should be held")
 
-#define assert_rcu_or_wq_mutex(wq) \
-   RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&   \
-		 !lockdep_is_held(&wq->mutex),	\
-"RCU or wq->mutex should be held")
-
 #define assert_rcu_or_wq_mutex_or_pool_mutex(wq)   \
RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&   \
		 !lockdep_is_held(&wq->mutex) &&	\
@@ -425,9 +420,8 @@ static void workqueue_sysfs_unregister(struct 
workqueue_struct *wq);
  * ignored.
  */
 #define for_each_pwq(pwq, wq)  \
-   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)  \
-   if (({ assert_rcu_or_wq_mutex(wq); false; })) { }   \
-   else
+   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,  \
+lock_is_held(&(wq->mutex).dep_map))
 
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 7/9] x86/pci: Pass lockdep condition to pcm_mmcfg_list iterator (v1)

2019-07-15 Thread Joel Fernandes (Google)
The pci_mmcfg_list is traversed with list_for_each_entry_rcu() without a
reader-lock held, because the pci_mmcfg_lock is already held. Make this
known to the list macro so that it fixes new lockdep warnings that
trigger due to lockdep checks added to list_for_each_entry_rcu().

Signed-off-by: Joel Fernandes (Google) 
---
 arch/x86/pci/mmconfig-shared.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 7389db538c30..6fa42e9c4e6f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -29,6 +29,7 @@
 static bool pci_mmcfg_running_state;
 static bool pci_mmcfg_arch_init_failed;
 static DEFINE_MUTEX(pci_mmcfg_lock);
+#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map)
 
 LIST_HEAD(pci_mmcfg_list);
 
@@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new)
struct pci_mmcfg_region *cfg;
 
/* keep list sorted by segment and starting bus number */
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) {
if (cfg->segment > new->segment ||
(cfg->segment == new->segment &&
 cfg->start_bus >= new->start_bus)) {
@@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, 
int bus)
 {
struct pci_mmcfg_region *cfg;
 
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held())
if (cfg->segment == segment &&
cfg->start_bus <= bus && bus <= cfg->end_bus)
return cfg;
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 8/9] acpi: Use built-in RCU list checking for acpi_ioremaps list (v1)

2019-07-15 Thread Joel Fernandes (Google)
list_for_each_entry_rcu has built-in RCU and lock checking. Make use of
it for acpi_ioremaps list traversal.

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/acpi/osl.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 9c0edf2fc0dd..2f9d0d20b836 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -80,6 +81,7 @@ struct acpi_ioremap {
 
 static LIST_HEAD(acpi_ioremaps);
 static DEFINE_MUTEX(acpi_ioremap_lock);
+#define acpi_ioremap_lock_held() lock_is_held(&acpi_ioremap_lock.dep_map)
 
 static void __init acpi_request_region (struct acpi_generic_address *gas,
unsigned int length, char *desc)
@@ -206,7 +208,7 @@ acpi_map_lookup(acpi_physical_address phys, acpi_size size)
 {
struct acpi_ioremap *map;
 
-	list_for_each_entry_rcu(map, &acpi_ioremaps, list)
+	list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held())
if (map->phys <= phys &&
phys + size <= map->phys + map->size)
return map;
@@ -249,7 +251,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 {
struct acpi_ioremap *map;
 
-	list_for_each_entry_rcu(map, &acpi_ioremaps, list)
+	list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held())
if (map->virt <= virt &&
virt + size <= map->virt + map->size)
return map;
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 2/9] rcu: Add support for consolidated-RCU reader checking (v3)

2019-07-15 Thread Joel Fernandes (Google)
This patch adds support for checking RCU reader sections in list
traversal macros. Optionally, if the list macro is called under SRCU or
other lock/mutex protection, then appropriate lockdep expressions can be
passed to make the checks pass.

Existing list_for_each_entry_rcu() invocations don't need to pass the
optional fourth argument (cond) unless they are under some non-RCU
protection and need to make the lockdep check pass; see the sketch below.
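
As an illustration (hypothetical structure, list and mutex names, not taken
from this patch), the intended usage looks roughly like this:

	#include <linux/mutex.h>
	#include <linux/rculist.h>

	struct foo {
		int val;
		struct list_head node;
	};

	static LIST_HEAD(my_list);
	static DEFINE_MUTEX(my_mutex);

	static int sum_under_rcu(void)
	{
		struct foo *f;
		int sum = 0;

		rcu_read_lock();
		/* Inside an RCU reader: no extra argument is needed. */
		list_for_each_entry_rcu(f, &my_list, node)
			sum += f->val;
		rcu_read_unlock();
		return sum;
	}

	static int sum_under_mutex(void)
	{
		struct foo *f;
		int sum = 0;

		mutex_lock(&my_mutex);
		/* Outside a reader but under a lock: pass a lockdep expression. */
		list_for_each_entry_rcu(f, &my_list, node,
					lockdep_is_held(&my_mutex))
			sum += f->val;
		mutex_unlock(&my_mutex);
		return sum;
	}

The second form is the pattern the later patches in this series use for
RTNL, wq->mutex, pci_mmcfg_lock and the other converted callers.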

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rculist.h  | 28 -
 include/linux/rcupdate.h |  7 +++
 kernel/rcu/Kconfig.debug | 11 ++
 kernel/rcu/update.c  | 44 
 4 files changed, 67 insertions(+), 23 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e91ec9ddcd30..1048160625bb 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,6 +40,20 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)(*((struct list_head __rcu **)(&(list)->next)))
 
+/*
+ * Check during list traversal that we are within an RCU reader
+ */
+
+#ifdef CONFIG_PROVE_RCU_LIST
+#define __list_check_rcu(dummy, cond, ...) \
+   ({  \
+   RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),\
+"RCU-list traversed in non-reader section!");  \
+})
+#else
+#define __list_check_rcu(dummy, cond, ...) ({})
+#endif
+
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -343,14 +357,16 @@ static inline void list_splice_tail_init_rcu(struct 
list_head *list,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the list_head within the struct.
+ * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member) \
-   for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
-		&pos->member != (head); \
+#define list_for_each_entry_rcu(pos, head, member, cond...)\
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+pos = list_entry_rcu((head)->next, typeof(*pos), member);  \
+		&pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
@@ -616,13 +632,15 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * @pos:   the type * to use as a loop cursor.
  * @head:  the head for your list.
  * @member:the name of the hlist_node within the struct.
+ * @cond:  optional lockdep expression if called from non-RCU protection.
  *
  * This list-traversal primitive may safely run concurrently with
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(pos, head, member)\
-	for (pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\
+#define hlist_for_each_entry_rcu(pos, head, member, cond...)   \
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+	     pos = hlist_entry_safe (rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 8f7167478c1d..f3c29efdf19a 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -221,6 +221,7 @@ int debug_lockdep_rcu_enabled(void);
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
 int rcu_read_lock_sched_held(void);
+int rcu_read_lock_any_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -241,6 +242,12 @@ static inline int rcu_read_lock_sched_held(void)
 {
return !preemptible();
 }
+
+static inline int rcu_read_lock_any_held(void)
+{
+   return !preemptible();
+}
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 5ec3ea4028e2..7fbd21dbfcd0 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -8,6 +8,17 @@ menu "RCU Debugging"
 config PROVE_RCU
def_bool PROVE_LOCKING
 
+config PROVE_RCU_LIST
+   bool "RCU list lockdep debugging"
+   depends on PROVE_RCU
+   default n
+   help
+ Enable RCU lockdep checking for list usages. By def

[PATCH 5/9] driver/core: Convert to use built-in RCU list checking (v1)

2019-07-15 Thread Joel Fernandes (Google)
list_for_each_entry_rcu has built-in RCU and lock checking. Make use of
it in driver core.

Acked-by: Greg Kroah-Hartman 
Signed-off-by: Joel Fernandes (Google) 
---
 drivers/base/base.h  |  1 +
 drivers/base/core.c  | 10 ++
 drivers/base/power/runtime.c | 15 ++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index b405436ee28e..0d32544b6f91 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -165,6 +165,7 @@ static inline int devtmpfs_init(void) { return 0; }
 /* Device links support */
 extern int device_links_read_lock(void);
 extern void device_links_read_unlock(int idx);
+extern int device_links_read_lock_held(void);
 extern int device_links_check_suppliers(struct device *dev);
 extern void device_links_driver_bound(struct device *dev);
 extern void device_links_driver_cleanup(struct device *dev);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index da84a73f2ba6..85e82f38717f 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -68,6 +68,11 @@ void device_links_read_unlock(int idx)
 {
	srcu_read_unlock(&device_links_srcu, idx);
 }
+
+int device_links_read_lock_held(void)
+{
+	return srcu_read_lock_held(&device_links_srcu);
+}
 #else /* !CONFIG_SRCU */
 static DECLARE_RWSEM(device_links_lock);
 
@@ -91,6 +96,11 @@ void device_links_read_unlock(int not_used)
 {
	up_read(&device_links_lock);
 }
+
+int device_links_read_lock_held(void)
+{
+	return lock_is_held(&device_links_lock);
+}
 #endif /* !CONFIG_SRCU */
 
 /**
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..7a10e8379a70 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -287,7 +287,8 @@ static int rpm_get_suppliers(struct device *dev)
 {
struct device_link *link;
 
-	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) {
+	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held()) {
int retval;
 
if (!(link->flags & DL_FLAG_PM_RUNTIME) ||
@@ -309,7 +310,8 @@ static void rpm_put_suppliers(struct device *dev)
 {
struct device_link *link;
 
-	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) {
+	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held()) {
if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
continue;
 
@@ -1640,7 +1642,8 @@ void pm_runtime_clean_up_links(struct device *dev)
 
idx = device_links_read_lock();
 
-	list_for_each_entry_rcu(link, &dev->links.consumers, s_node) {
+	list_for_each_entry_rcu(link, &dev->links.consumers, s_node,
+   device_links_read_lock_held()) {
if (link->flags & DL_FLAG_STATELESS)
continue;
 
@@ -1662,7 +1665,8 @@ void pm_runtime_get_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->flags & DL_FLAG_PM_RUNTIME) {
link->supplier_preactivated = true;
refcount_inc(>rpm_active);
@@ -1683,7 +1687,8 @@ void pm_runtime_put_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+	list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->supplier_preactivated) {
link->supplier_preactivated = false;
if (refcount_dec_not_one(>rpm_active))
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 9/9] doc: Update documentation about list_for_each_entry_rcu (v1)

2019-07-15 Thread Joel Fernandes (Google)
This patch updates the documentation with information about
usage of lockdep with list_for_each_entry_rcu().

Signed-off-by: Joel Fernandes (Google) 
---
 Documentation/RCU/lockdep.txt   | 15 +++
 Documentation/RCU/whatisRCU.txt |  9 -
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/Documentation/RCU/lockdep.txt b/Documentation/RCU/lockdep.txt
index da51d3068850..3d967df3a801 100644
--- a/Documentation/RCU/lockdep.txt
+++ b/Documentation/RCU/lockdep.txt
@@ -96,7 +96,14 @@ other flavors of rcu_dereference().  On the other hand, it 
is illegal
 to use rcu_dereference_protected() if either the RCU-protected pointer
 or the RCU-protected data that it points to can change concurrently.
 
-There are currently only "universal" versions of the rcu_assign_pointer()
-and RCU list-/tree-traversal primitives, which do not (yet) check for
-being in an RCU read-side critical section.  In the future, separate
-versions of these primitives might be created.
+Similar to rcu_dereference_protected(), the RCU list and hlist traversal
+primitives also check whether they are called from within a reader
+section. However, an optional lockdep expression can be passed to them as
+the last argument in case they are called under other non-RCU protection.
+
+For example, the workqueue for_each_pwq() macro is implemented as follows.
+It is safe to call for_each_pwq() outside a reader section but under protection
+of wq->mutex:
+#define for_each_pwq(pwq, wq)
+   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,
+   lock_is_held(&(wq->mutex).dep_map))
diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt
index 7e1a8721637a..00fe77ede1e2 100644
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -290,7 +290,7 @@ rcu_dereference()
at any time, including immediately after the rcu_dereference().
And, again like rcu_assign_pointer(), rcu_dereference() is
typically used indirectly, via the _rcu list-manipulation
-   primitives, such as list_for_each_entry_rcu().
+   primitives, such as list_for_each_entry_rcu() [2].
 
[1] The variant rcu_dereference_protected() can be used outside
of an RCU read-side critical section as long as the usage is
@@ -305,6 +305,13 @@ rcu_dereference()
a lockdep splat is emitted.  See 
RCU/Design/Requirements/Requirements.html
and the API's code comments for more details and example usage.
 
+   [2] In case the list_for_each_entry_rcu() primitive is intended
+   to be used outside of an RCU reader section such as when
+   protected by a lock, then an additional lockdep expression can be
+   passed as the last argument to it so that RCU lockdep checking code
+   knows that the dereference of the list pointers are safe. If the
+   indicated protection is not provided, a lockdep splat is emitted.
+
 The following diagram shows how each API communicates among the
 reader, updater, and reclaimer.
 
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 3/9] rcu/sync: Remove custom check for reader-section (v2)

2019-07-15 Thread Joel Fernandes (Google)
The rcu/sync code was doing its own check of whether we are in a reader
section. With RCU consolidating flavors and the generic helper added in
this series, this is no longer needed. We can just use the generic helper,
and it results in a nice cleanup.

Cc: Oleg Nesterov 
Acked-by: Oleg Nesterov 
Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rcu_sync.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/include/linux/rcu_sync.h b/include/linux/rcu_sync.h
index 9b83865d24f9..0027d4c8087c 100644
--- a/include/linux/rcu_sync.h
+++ b/include/linux/rcu_sync.h
@@ -31,9 +31,7 @@ struct rcu_sync {
  */
 static inline bool rcu_sync_is_idle(struct rcu_sync *rsp)
 {
-   RCU_LOCKDEP_WARN(!rcu_read_lock_held() &&
-!rcu_read_lock_bh_held() &&
-!rcu_read_lock_sched_held(),
+   RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(),
 "suspicious rcu_sync_is_idle() usage");
return !READ_ONCE(rsp->gp_state); /* GP_IDLE */
 }
-- 
2.22.0.510.g264f2c817a-goog



[PATCH 0/9] Harden list_for_each_entry_rcu() and family

2019-07-15 Thread Joel Fernandes (Google)
Hi,
This series aims to provide lockdep checking to RCU list macros for additional
kernel hardening.

RCU has a number of primitives for "consumption" of an RCU protected pointer.
Most of the time, these consumers make sure that such accesses are under a RCU
reader-section (such as rcu_dereference{,sched,bh} or under a lock, such as
with rcu_dereference_protected()).

However, there are other ways to consume RCU pointers, such as by
list_for_each_entry_rcu or hlist_for_each_entry_rcu. Unlike the rcu_dereference
family, these consumers do no lockdep checking at all. And with the growing
number of RCU list uses (1000+), it is possible for bugs to creep in and go
unnoticed which lockdep checks can catch.

Since RCU consolidation efforts last year, the different traditional RCU
flavors (preempt, bh, sched) are all consolidated. In other words, any of these
flavors can cause a reader section to occur and all of them must cease before
the reader section is considered to be unlocked. Thanks to this, we can
generically check if we are in an RCU reader. This is what patch 1 does. Note
that the list_for_each_entry_rcu and family are different from the
rcu_dereference family in that, there is no _bh or _sched version of this
macro. They are used under many different RCU reader flavors, and also SRCU.
Patch 1 adds a new internal function rcu_read_lock_any_held() which checks
if any reader section is active at all, when these macros are called. If no
reader section exists, then the optional fourth argument to
list_for_each_entry_rcu() can be a lockdep expression which is evaluated
(similar to how rcu_dereference_check() works). If no lockdep expression is
passed, and we are not in a reader, then a splat occurs. Just take off the
lockdep expression after applying the patches, by using the following diff and
see what happens:

+++ b/arch/x86/pci/mmconfig-shared.c
@@ -55,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new)
struct pci_mmcfg_region *cfg;

/* keep list sorted by segment and starting bus number */
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) {
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {


The optional argument trick to list_for_each_entry_rcu() can also be used in
the future to possibly remove rcu_dereference_{,bh,sched}_protected() API and
we can pass an optional lockdep expression to rcu_dereference() itself. Thus
eliminating 3 more RCU APIs.

Note that some list macro wrappers already do their own lockdep checking in the
caller side. These can be eliminated in favor of the built-in lockdep checking
in the list macro that this series adds. For example, workqueue code has an
assert_rcu_or_wq_mutex() function which is called in for_each_wq().  This
series replaces that in favor of the built-in check.

Also in the future, we can extend these checks to list_entry_rcu() and other
list macros as well, if needed.

Please note that I have kept this option default-disabled under a new config:
CONFIG_PROVE_RCU_LIST. This is so that until all users are converted to pass
the optional argument, we should keep the check disabled. There are about
1000 or so users and it is not possible to pass in the optional lockdep
expression in a single series since it is done on a case-by-case basis. I did
convert a few users in this series itself.

v2->v3: Simplified rcu-sync logic after rebase (Paul)
Added check for bh_map (Paul)
Refactored out more of the common code (Joel)
Added Oleg ack to rcu-sync patch.

v1->v2: Have assert_rcu_or_wq_mutex deleted (Daniel Jordan)
Simplify rcu_read_lock_any_held()   (Peter Zijlstra)
Simplified rcu-sync logic   (Oleg Nesterov)
Updated documentation and rculist comments.
Added GregKH ack.

RFC->v1: 
Simplify list checking macro (Rasmus Villemoes)

Joel Fernandes (Google) (9):
rcu/update: Remove useless check for debug_locks (v1)
rcu: Add support for consolidated-RCU reader checking (v3)
rcu/sync: Remove custom check for reader-section (v2)
ipv4: add lockdep condition to fix for_each_entry (v1)
driver/core: Convert to use built-in RCU list checking (v1)
workqueue: Convert for_each_wq to use built-in list check (v2)
x86/pci: Pass lockdep condition to pcm_mmcfg_list iterator (v1)
acpi: Use built-in RCU list checking for acpi_ioremaps list (v1)
doc: Update documentation about list_for_each_entry_rcu (v1)

Documentation/RCU/lockdep.txt   | 15 ---
Documentation/RCU/whatisRCU.txt |  9 ++-
arch/x86/pci/mmconfig-shared.c  |  5 ++--
drivers/acpi/osl.c  |  6 +++--
drivers/base/base.h |  1 +
drivers/base/core.c | 10 +++
drivers/base/power/runtime.c| 15 +++
include/linux/rcu_sync.h|  4 +--
include/linux/rculist.h | 28 +++
include/linux/rcupdate.h|  7 +
kernel/rcu/Kconfig.debug| 11 
kernel/rcu/update

[PATCH v1 4/6] workqueue: Convert for_each_wq to use built-in list check

2019-07-11 Thread Joel Fernandes (Google)
list_for_each_entry_rcu() now has support to check for RCU reader sections
as well as locks. Just use that support instead of explicitly checking in
the caller.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/workqueue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9657315405de..91ed7aca16e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -424,9 +424,8 @@ static void workqueue_sysfs_unregister(struct 
workqueue_struct *wq);
  * ignored.
  */
 #define for_each_pwq(pwq, wq)  \
-   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)  \
-   if (({ assert_rcu_or_wq_mutex(wq); false; })) { }   \
-   else
+   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,  \
+lock_is_held(&(wq->mutex).dep_map))
 
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH v1 6/6] acpi: Use built-in RCU list checking for acpi_ioremaps list

2019-07-11 Thread Joel Fernandes (Google)
list_for_each_entry_rcu has built-in RCU and lock checking. Make use of
it for acpi_ioremaps list traversal.

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/acpi/osl.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index f29e427d0d1d..c8b5d712c7ae 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -94,6 +95,7 @@ struct acpi_ioremap {
 
 static LIST_HEAD(acpi_ioremaps);
 static DEFINE_MUTEX(acpi_ioremap_lock);
+#define acpi_ioremap_lock_held() lock_is_held(&acpi_ioremap_lock.dep_map)
 
 static void __init acpi_request_region (struct acpi_generic_address *gas,
unsigned int length, char *desc)
@@ -220,7 +222,7 @@ acpi_map_lookup(acpi_physical_address phys, acpi_size size)
 {
struct acpi_ioremap *map;
 
-	list_for_each_entry_rcu(map, &acpi_ioremaps, list)
+	list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held())
if (map->phys <= phys &&
phys + size <= map->phys + map->size)
return map;
@@ -263,7 +265,7 @@ acpi_map_lookup_virt(void __iomem *virt, acpi_size size)
 {
struct acpi_ioremap *map;
 
-	list_for_each_entry_rcu(map, &acpi_ioremaps, list)
+	list_for_each_entry_rcu(map, &acpi_ioremaps, list, acpi_ioremap_lock_held())
if (map->virt <= virt &&
virt + size <= map->virt + map->size)
return map;
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH v1 0/6] Harden list_for_each_entry_rcu() and family

2019-07-11 Thread Joel Fernandes (Google)
Hi,
This series aims to provide lockdep checking to RCU list macros.

RCU has a number of primitives for "consumption" of an RCU protected pointer.
Most of the time, these consumers make sure that such accesses are under a RCU
reader-section (such as rcu_dereference{,sched,bh} or under a lock, such as
with rcu_dereference_protected()).

However, there are other ways to consume RCU pointers, such as by
list_for_each_entry_rcu or hlist_for_each_entry_rcu. Unlike the rcu_dereference
family, these consumers do no lockdep checking at all. And with the growing
number of RCU list uses (1000+), it is possible for bugs to creep in and go
unnoticed which lockdep checks can catch.

Since RCU consolidation efforts last year, the different traditional RCU
flavors (preempt, bh, sched) are all consolidated. In other words, any of these
flavors can cause a reader section to occur and all of them must cease before
the reader section is considered to be unlocked. Thanks to this, we can
generically check if we are in an RCU reader. This is what patch 1 does. Note
that the list_for_each_entry_rcu and family are different from the
rcu_dereference family in that, there is no _bh or _sched version of this
macro. They are used under many different RCU reader flavors, and also SRCU.
Patch 1 adds a new internal function rcu_read_lock_any_held() which checks
if any reader section is active at all, when these macros are called. If no
reader section exists, then the optional fourth argument to
list_for_each_entry_rcu() can be a lockdep expression which is evaluated
(similar to how rcu_dereference_check() works). If no lockdep expression is
passed, and we are not in a reader, then a splat occurs. Just take off the
lockdep expression after applying the patches, by using the following diff and
see what happens:

+++ b/arch/x86/pci/mmconfig-shared.c
@@ -55,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new)
struct pci_mmcfg_region *cfg;

/* keep list sorted by segment and starting bus number */
-	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) {
+	list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {


The optional argument trick to list_for_each_entry_rcu() can also be used in
the future to possibly remove rcu_dereference_{,bh,sched}_protected() API and
we can pass an optional lockdep expression to rcu_dereference() itself. Thus
eliminating 3 more RCU APIs.

Note that some list macro wrappers already do their own lockdep checking in the
caller side. These can be eliminated in favor of the built-in lockdep checking
in the list macro that this series adds. For example, workqueue code has an
assert_rcu_or_wq_mutex() function which is called in for_each_wq().  This
series replaces that in favor of the built-in check.

Also in the future, we can extend these checks to list_entry_rcu() and other
list macros as well, if needed.
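
For illustration, here is the intended caller-side usage once this series is
applied. This is a minimal sketch; the struct, list, and mutex names are made
up for the example and are not part of this series (needs <linux/rculist.h>
and <linux/mutex.h>):

	struct mydrv_device {
		int id;
		struct list_head node;
	};

	static LIST_HEAD(mydrv_devices);	/* updated under mydrv_mutex */
	static DEFINE_MUTEX(mydrv_mutex);

	/* Reader path: any RCU reader section satisfies the new check. */
	static struct mydrv_device *mydrv_find(int id)
	{
		struct mydrv_device *d;

		list_for_each_entry_rcu(d, &mydrv_devices, node)
			if (d->id == id)
				return d;
		return NULL;
	}

	/* Update-side path: not in a reader, so pass a lockdep expression. */
	static struct mydrv_device *mydrv_find_locked(int id)
	{
		struct mydrv_device *d;

		list_for_each_entry_rcu(d, &mydrv_devices, node,
					lockdep_is_held(&mydrv_mutex))
			if (d->id == id)
				return d;
		return NULL;
	}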

Joel Fernandes (Google) (6):
rcu: Add support for consolidated-RCU reader checking
ipv4: add lockdep condition to fix for_each_entry
driver/core: Convert to use built-in RCU list checking
workqueue: Convert for_each_wq to use built-in list check
x86/pci: Pass lockdep condition to pcm_mmcfg_list iterator
acpi: Use built-in RCU list checking for acpi_ioremaps list

arch/x86/pci/mmconfig-shared.c |  5 +++--
drivers/acpi/osl.c |  6 --
drivers/base/base.h|  1 +
drivers/base/core.c| 10 ++
drivers/base/power/runtime.c   | 15 ++-
include/linux/rculist.h| 29 -
include/linux/rcupdate.h   |  7 +++
kernel/rcu/Kconfig.debug   | 11 +++
kernel/rcu/update.c| 26 ++
kernel/workqueue.c |  5 ++---
net/ipv4/fib_frontend.c|  3 ++-
11 files changed, 100 insertions(+), 18 deletions(-)

--
2.22.0.410.gd8fdbe21b5-goog



[PATCH v1 3/6] driver/core: Convert to use built-in RCU list checking

2019-07-11 Thread Joel Fernandes (Google)
list_for_each_entry_rcu has built-in RCU and lock checking. Make use of
it in driver core.

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/base/base.h  |  1 +
 drivers/base/core.c  | 10 ++
 drivers/base/power/runtime.c | 15 ++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index b405436ee28e..0d32544b6f91 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -165,6 +165,7 @@ static inline int devtmpfs_init(void) { return 0; }
 /* Device links support */
 extern int device_links_read_lock(void);
 extern void device_links_read_unlock(int idx);
+extern int device_links_read_lock_held(void);
 extern int device_links_check_suppliers(struct device *dev);
 extern void device_links_driver_bound(struct device *dev);
 extern void device_links_driver_cleanup(struct device *dev);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd7511e04e62..6c5ca9685647 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -68,6 +68,11 @@ void device_links_read_unlock(int idx)
 {
srcu_read_unlock(_links_srcu, idx);
 }
+
+int device_links_read_lock_held(void)
+{
+   return srcu_read_lock_held(_links_srcu);
+}
 #else /* !CONFIG_SRCU */
 static DECLARE_RWSEM(device_links_lock);
 
@@ -91,6 +96,11 @@ void device_links_read_unlock(int not_used)
 {
up_read(_links_lock);
 }
+
+int device_links_read_lock_held(void)
+{
+   return lock_is_held(_links_lock);
+}
 #endif /* !CONFIG_SRCU */
 
 /**
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..7a10e8379a70 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -287,7 +287,8 @@ static int rpm_get_suppliers(struct device *dev)
 {
struct device_link *link;
 
-   list_for_each_entry_rcu(link, >links.suppliers, c_node) {
+   list_for_each_entry_rcu(link, >links.suppliers, c_node,
+   device_links_read_lock_held()) {
int retval;
 
if (!(link->flags & DL_FLAG_PM_RUNTIME) ||
@@ -309,7 +310,8 @@ static void rpm_put_suppliers(struct device *dev)
 {
struct device_link *link;
 
-   list_for_each_entry_rcu(link, >links.suppliers, c_node) {
+   list_for_each_entry_rcu(link, >links.suppliers, c_node,
+   device_links_read_lock_held()) {
if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
continue;
 
@@ -1640,7 +1642,8 @@ void pm_runtime_clean_up_links(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, >links.consumers, s_node) {
+   list_for_each_entry_rcu(link, >links.consumers, s_node,
+   device_links_read_lock_held()) {
if (link->flags & DL_FLAG_STATELESS)
continue;
 
@@ -1662,7 +1665,8 @@ void pm_runtime_get_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, >links.suppliers, c_node)
+   list_for_each_entry_rcu(link, >links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->flags & DL_FLAG_PM_RUNTIME) {
link->supplier_preactivated = true;
refcount_inc(>rpm_active);
@@ -1683,7 +1687,8 @@ void pm_runtime_put_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, >links.suppliers, c_node)
+   list_for_each_entry_rcu(link, >links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->supplier_preactivated) {
link->supplier_preactivated = false;
if (refcount_dec_not_one(>rpm_active))
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH v1 1/6] rcu: Add support for consolidated-RCU reader checking

2019-07-11 Thread Joel Fernandes (Google)
This patch adds support for checking RCU reader sections in list
traversal macros. Optionally, if the list macro is called under SRCU or
other lock/mutex protection, then appropriate lockdep expressions can be
passed to make the checks pass.

Existing list_for_each_entry_rcu() invocations don't need to pass the
optional fourth argument (cond) unless they are under some non-RCU
protection and need to make the lockdep check pass.
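
As a rough sketch of the idea (not the exact body added by this patch; the
kernel/rcu/update.c hunk is cut off later in this archive),
rcu_read_lock_any_held() essentially ORs the existing per-flavor lockdep
checks:

	int rcu_read_lock_any_held(void)
	{
		if (!debug_lockdep_rcu_enabled())
			return 1;
		if (!rcu_is_watching())
			return 0;
		if (!rcu_lockdep_current_cpu_online())
			return 0;
		if (lock_is_held(&rcu_lock_map) ||
		    lock_is_held(&rcu_bh_lock_map) ||
		    lock_is_held(&rcu_sched_lock_map))
			return 1;
		/* Non-preemptible contexts also count as readers. */
		return !preemptible();
	}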

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rculist.h  | 29 -
 include/linux/rcupdate.h |  7 +++
 kernel/rcu/Kconfig.debug | 11 +++
 kernel/rcu/update.c  | 26 ++
 4 files changed, 68 insertions(+), 5 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e91ec9ddcd30..78c15ec6b2c9 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,6 +40,23 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)(*((struct list_head __rcu **)(&(list)->next)))
 
+/*
+ * Check during list traversal that we are within an RCU reader
+ */
+
+#define SIXTH_ARG(a1, a2, a3, a4, a5, a6, ...) a6
+#define COUNT_VARGS(...) SIXTH_ARG(dummy, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
+#ifdef CONFIG_PROVE_RCU_LIST
+#define __list_check_rcu(dummy, cond, ...) \
+   ({  \
+   RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),\
+"RCU-list traversed in non-reader section!");  \
+})
+#else
+#define __list_check_rcu(dummy, cond, ...) ({})
+#endif
+
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -348,9 +365,10 @@ static inline void list_splice_tail_init_rcu(struct 
list_head *list,
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member) \
-   for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
-   >member != (head); \
+#define list_for_each_entry_rcu(pos, head, member, cond...)\
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+pos = list_entry_rcu((head)->next, typeof(*pos), member);  \
+   >member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
@@ -621,8 +639,9 @@ static inline void hlist_add_behind_rcu(struct hlist_node 
*n,
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(pos, head, member)\
-   for (pos = hlist_entry_safe 
(rcu_dereference_raw(hlist_first_rcu(head)),\
+#define hlist_for_each_entry_rcu(pos, head, member, cond...)   \
+   for (__list_check_rcu(dummy, ## cond, 0),   \
+pos = hlist_entry_safe 
(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
pos = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 922bb6848813..712b464ab960 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -223,6 +223,7 @@ int debug_lockdep_rcu_enabled(void);
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
 int rcu_read_lock_sched_held(void);
+int rcu_read_lock_any_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -243,6 +244,12 @@ static inline int rcu_read_lock_sched_held(void)
 {
return !preemptible();
 }
+
+static inline int rcu_read_lock_any_held(void)
+{
+   return !preemptible();
+}
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug
index 0ec7d1d33a14..b20d0e2903d1 100644
--- a/kernel/rcu/Kconfig.debug
+++ b/kernel/rcu/Kconfig.debug
@@ -7,6 +7,17 @@ menu "RCU Debugging"
 config PROVE_RCU
def_bool PROVE_LOCKING
 
+config PROVE_RCU_LIST
+   bool "RCU list lockdep debugging"
+   depends on PROVE_RCU
+   default n
+   help
+ Enable RCU lockdep checking for list usages. By default it is
+ turned off since there are several list RCU users that still
+ need to be converted to pass a lockdep expression. To prevent
+ false-positive splats, we keep it default disabled but once all
+ users are converted, we can remove this config option.
+
 config TORTURE_TEST
tristate
default n
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..9cb30006a5e1 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/

[PATCH v1 5/6] x86/pci: Pass lockdep condition to pcm_mmcfg_list iterator

2019-07-11 Thread Joel Fernandes (Google)
The pci_mmcfg_list is traversed with list_for_each_entry_rcu() without an
RCU reader lock held, because the pci_mmcfg_lock is already held. Make this
known to the list macro so that it does not trigger the new lockdep
warnings added by the lockdep checks in list_for_each_entry_rcu().

Signed-off-by: Joel Fernandes (Google) 
---
 arch/x86/pci/mmconfig-shared.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 7389db538c30..6fa42e9c4e6f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -29,6 +29,7 @@
 static bool pci_mmcfg_running_state;
 static bool pci_mmcfg_arch_init_failed;
 static DEFINE_MUTEX(pci_mmcfg_lock);
+#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map)
 
 LIST_HEAD(pci_mmcfg_list);
 
@@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new)
struct pci_mmcfg_region *cfg;
 
/* keep list sorted by segment and starting bus number */
-   list_for_each_entry_rcu(cfg, _mmcfg_list, list) {
+   list_for_each_entry_rcu(cfg, _mmcfg_list, list, 
pci_mmcfg_lock_held()) {
if (cfg->segment > new->segment ||
(cfg->segment == new->segment &&
 cfg->start_bus >= new->start_bus)) {
@@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, 
int bus)
 {
struct pci_mmcfg_region *cfg;
 
-   list_for_each_entry_rcu(cfg, _mmcfg_list, list)
+   list_for_each_entry_rcu(cfg, _mmcfg_list, list, 
pci_mmcfg_lock_held())
if (cfg->segment == segment &&
cfg->start_bus <= bus && bus <= cfg->end_bus)
return cfg;
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH v1 2/6] ipv4: add lockdep condition to fix for_each_entry

2019-07-11 Thread Joel Fernandes (Google)
Signed-off-by: Joel Fernandes (Google) 
---
 net/ipv4/fib_frontend.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index b298255f6fdb..ef7c9f8e8682 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -127,7 +127,8 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
h = id & (FIB_TABLE_HASHSZ - 1);
 
head = >ipv4.fib_table_hash[h];
-   hlist_for_each_entry_rcu(tb, head, tb_hlist) {
+   hlist_for_each_entry_rcu(tb, head, tb_hlist,
+lockdep_rtnl_is_held()) {
if (tb->tb_id == id)
return tb;
}
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH RFC 3/4] lib/bpf: Add support for ftrace event attach and detach

2019-07-10 Thread Joel Fernandes (Google)
Add the needed library support in this commit.

Signed-off-by: Joel Fernandes (Google) 
---
 tools/lib/bpf/bpf.c  | 53 
 tools/lib/bpf/bpf.h  |  4 +++
 tools/lib/bpf/libbpf.map |  2 ++
 3 files changed, 59 insertions(+)

diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c
index c4a48086dc9a..28c5a7d00d14 100644
--- a/tools/lib/bpf/bpf.c
+++ b/tools/lib/bpf/bpf.c
@@ -24,6 +24,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -57,6 +60,8 @@
 #define min(x, y) ((x) < (y) ? (x) : (y))
 #endif
 
+#define TRACEFS "/sys/kernel/debug/tracing"
+
 static inline __u64 ptr_to_u64(const void *ptr)
 {
return (__u64) (unsigned long) ptr;
@@ -658,6 +663,54 @@ int bpf_raw_tracepoint_open(const char *name, int prog_fd)
return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, , sizeof(attr));
 }
 
+int bpf_raw_tracepoint_ftrace_attach(const char *subsys, const char *name,
+int prog_fd)
+{
+   char buf[256];
+   int len, ret, tfd;
+
+   sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+   tfd = open(buf, O_WRONLY);
+   if (tfd < 0)
+   return tfd;
+
+   sprintf(buf, "attach:%d", prog_fd);
+   len = strlen(buf);
+   ret = write(tfd, buf, len);
+
+   if (ret < 0)
+   goto err;
+   if (ret != len)
+   ret = -1;
+err:
+   close(tfd);
+   return ret;
+}
+
+int bpf_raw_tracepoint_ftrace_detach(const char *subsys, const char *name,
+int prog_fd)
+{
+   char buf[256];
+   int len, ret, tfd;
+
+   sprintf(buf, "%s/events/%s/%s/bpf", TRACEFS, subsys, name);
+   tfd = open(buf, O_WRONLY);
+   if (tfd < 0)
+   return tfd;
+
+   sprintf(buf, "detach:%d", prog_fd);
+   len = strlen(buf);
+   ret = write(tfd, buf, len);
+
+   if (ret < 0)
+   goto err;
+   if (ret != len)
+   ret = -1;
+err:
+   close(tfd);
+   return ret;
+}
+
 int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size,
 bool do_log)
 {
diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h
index 9593fec75652..5b9c44658037 100644
--- a/tools/lib/bpf/bpf.h
+++ b/tools/lib/bpf/bpf.h
@@ -163,6 +163,10 @@ LIBBPF_API int bpf_prog_query(int target_fd, enum 
bpf_attach_type type,
  __u32 query_flags, __u32 *attach_flags,
  __u32 *prog_ids, __u32 *prog_cnt);
 LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_attach(const char *subsys,
+   const char *name, int prog_fd);
+LIBBPF_API int bpf_raw_tracepoint_ftrace_detach(const char *subsys,
+   const char *name, int prog_fd);
 LIBBPF_API int bpf_load_btf(void *btf, __u32 btf_size, char *log_buf,
__u32 log_buf_size, bool do_log);
 LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 673001787cba..fca377b688c2 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -163,4 +163,6 @@ LIBBPF_0.0.3 {
bpf_map__is_internal;
bpf_map_freeze;
btf__finalize_data;
+   bpf_raw_tracepoint_ftrace_attach;
+   bpf_raw_tracepoint_ftrace_detach;
 } LIBBPF_0.0.2;
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH RFC 1/4] Move bpf_raw_tracepoint functionality into bpf_trace.c

2019-07-10 Thread Joel Fernandes (Google)
In preparation to use raw tracepoints for BPF directly from ftrace, move
the bpf_raw_tracepoint functionality into bpf_trace.c

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/bpf_trace.h   | 10 ++
 kernel/bpf/syscall.c| 69 ++---
 kernel/trace/bpf_trace.c| 56 ++
 kernel/trace/trace_events.c |  3 ++
 4 files changed, 80 insertions(+), 58 deletions(-)

diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index ddf896abcfb6..4a593827fd87 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -4,4 +4,14 @@
 
 #include 
 
+#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
+
+struct bpf_raw_tracepoint {
+   struct bpf_raw_event_map *btp;
+   struct bpf_prog *prog;
+};
+
+struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
+void bpf_raw_tracepoint_close(struct bpf_raw_tracepoint *tp);
+
 #endif /* __LINUX_BPF_TRACE_H__ */
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 42d17f730780..2001949b33f1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1737,21 +1737,11 @@ static int bpf_obj_get(const union bpf_attr *attr)
attr->file_flags);
 }
 
-struct bpf_raw_tracepoint {
-   struct bpf_raw_event_map *btp;
-   struct bpf_prog *prog;
-};
-
 static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp)
 {
struct bpf_raw_tracepoint *raw_tp = filp->private_data;
 
-   if (raw_tp->prog) {
-   bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
-   bpf_prog_put(raw_tp->prog);
-   }
-   bpf_put_raw_tracepoint(raw_tp->btp);
-   kfree(raw_tp);
+   bpf_raw_tracepoint_close(raw_tp);
return 0;
 }
 
@@ -1761,64 +1751,27 @@ static const struct file_operations bpf_raw_tp_fops = {
.write  = bpf_dummy_write,
 };
 
-#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd
-
-static int bpf_raw_tracepoint_open(const union bpf_attr *attr)
+static int bpf_raw_tracepoint_open_syscall(const union bpf_attr *attr)
 {
-   struct bpf_raw_tracepoint *raw_tp;
-   struct bpf_raw_event_map *btp;
-   struct bpf_prog *prog;
+   int tp_fd;
char tp_name[128];
-   int tp_fd, err;
+   struct bpf_raw_tracepoint *raw_tp;
 
if (strncpy_from_user(tp_name, 
u64_to_user_ptr(attr->raw_tracepoint.name),
  sizeof(tp_name) - 1) < 0)
return -EFAULT;
tp_name[sizeof(tp_name) - 1] = 0;
 
-   btp = bpf_get_raw_tracepoint(tp_name);
-   if (!btp)
-   return -ENOENT;
-
-   raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER);
-   if (!raw_tp) {
-   err = -ENOMEM;
-   goto out_put_btp;
-   }
-   raw_tp->btp = btp;
-
-   prog = bpf_prog_get(attr->raw_tracepoint.prog_fd);
-   if (IS_ERR(prog)) {
-   err = PTR_ERR(prog);
-   goto out_free_tp;
-   }
-   if (prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT &&
-   prog->type != BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE) {
-   err = -EINVAL;
-   goto out_put_prog;
-   }
-
-   err = bpf_probe_register(raw_tp->btp, prog);
-   if (err)
-   goto out_put_prog;
+   raw_tp = bpf_raw_tracepoint_open(tp_name, attr->raw_tracepoint.prog_fd);
+   if (IS_ERR(raw_tp))
+   return PTR_ERR(raw_tp);
 
-   raw_tp->prog = prog;
tp_fd = anon_inode_getfd("bpf-raw-tracepoint", _raw_tp_fops, raw_tp,
 O_CLOEXEC);
-   if (tp_fd < 0) {
-   bpf_probe_unregister(raw_tp->btp, prog);
-   err = tp_fd;
-   goto out_put_prog;
-   }
-   return tp_fd;
+   if (tp_fd < 0)
+   bpf_probe_unregister(raw_tp->btp, raw_tp->prog);
 
-out_put_prog:
-   bpf_prog_put(prog);
-out_free_tp:
-   kfree(raw_tp);
-out_put_btp:
-   bpf_put_raw_tracepoint(btp);
-   return err;
+   return tp_fd;
 }
 
 static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
@@ -2848,7 +2801,7 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, 
uattr, unsigned int, siz
err = bpf_obj_get_info_by_fd(, uattr);
break;
case BPF_RAW_TRACEPOINT_OPEN:
-   err = bpf_raw_tracepoint_open();
+   err = bpf_raw_tracepoint_open_syscall();
break;
case BPF_BTF_LOAD:
err = bpf_btf_load();
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 1c9a4745e596..c4b543bc617f 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1413,3 +1414,58 @@ static int __init bpf_even

[PATCH RFC 4/4] selftests/bpf: Add test for ftrace-based BPF attach/detach

2019-07-10 Thread Joel Fernandes (Google)
Here we add support for testing the attach and detach of a BPF program
to a tracepoint through tracefs.

Signed-off-by: Joel Fernandes (Google) 
---
 .../raw_tp_writable_test_ftrace_run.c | 89 +++
 1 file changed, 89 insertions(+)
 create mode 100644 
tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c

diff --git 
a/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c 
b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
new file mode 100644
index ..7b42e3a69b71
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+void test_raw_tp_writable_test_ftrace_run(void)
+{
+   __u32 duration = 0;
+   char error[4096];
+   int ret;
+
+   const struct bpf_insn trace_program[] = {
+   BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_1, 0),
+   BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6, 0),
+   BPF_MOV64_IMM(BPF_REG_0, 42),
+   BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   };
+
+   struct bpf_load_program_attr load_attr = {
+   .prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+   .license = "GPL v2",
+   .insns = trace_program,
+   .insns_cnt = sizeof(trace_program) / sizeof(struct bpf_insn),
+   .log_level = 2,
+   };
+
+   int bpf_fd = bpf_load_program_xattr(_attr, error, sizeof(error));
+
+   if (CHECK(bpf_fd < 0, "bpf_raw_tracepoint_writable loaded",
+ "failed: %d errno %d\n", bpf_fd, errno))
+   return;
+
+   const struct bpf_insn skb_program[] = {
+   BPF_MOV64_IMM(BPF_REG_0, 0),
+   BPF_EXIT_INSN(),
+   };
+
+   struct bpf_load_program_attr skb_load_attr = {
+   .prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
+   .license = "GPL v2",
+   .insns = skb_program,
+   .insns_cnt = sizeof(skb_program) / sizeof(struct bpf_insn),
+   };
+
+   int filter_fd =
+   bpf_load_program_xattr(_load_attr, error, sizeof(error));
+   if (CHECK(filter_fd < 0, "test_program_loaded", "failed: %d errno %d\n",
+ filter_fd, errno))
+   goto out_bpffd;
+
+   ret = bpf_raw_tracepoint_ftrace_attach("bpf_test_run",
+  "bpf_test_finish",
+  bpf_fd);
+   if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_attach",
+ "failed: %d errno %d\n", ret, errno))
+   goto out_filterfd;
+
+   char test_skb[128] = {
+   0,
+   };
+
+   __u32 prog_ret;
+   int err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0,
+   0, _ret, 0);
+   CHECK(err != 42, "test_run",
+ "tracepoint did not modify return value\n");
+   CHECK(prog_ret != 0, "test_run_ret",
+ "socket_filter did not return 0\n");
+
+   ret = bpf_raw_tracepoint_ftrace_detach("bpf_test_run",
+  "bpf_test_finish",
+  bpf_fd);
+   if (CHECK(ret < 0, "bpf_raw_tracepoint_ftrace_detach",
+ "failed: %d errno %d\n", ret, errno))
+   goto out_filterfd;
+
+   err = bpf_prog_test_run(filter_fd, 1, test_skb, sizeof(test_skb), 0, 0,
+   _ret, 0);
+   CHECK(err != 0, "test_run_notrace",
+ "test_run failed with %d errno %d\n", err, errno);
+   CHECK(prog_ret != 0, "test_run_ret_notrace",
+ "socket_filter did not return 0\n");
+
+out_filterfd:
+   close(filter_fd);
+out_bpffd:
+   close(bpf_fd);
+}
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH RFC 2/4] trace/bpf: Add support for attach/detach of ftrace events to BPF

2019-07-10 Thread Joel Fernandes (Google)
Add a new bpf file to each trace event. The following commands can be
written into it:
attach: Attaches a BPF prog fd to the tracepoint
detach: Detaches a BPF prog fd from the tracepoint

Reading the bpf file will show all the programs attached to the
tracepoint.

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/bpf_trace.h|   6 ++
 include/linux/trace_events.h |   1 +
 kernel/trace/bpf_trace.c | 169 +++
 kernel/trace/trace.h |   1 +
 kernel/trace/trace_events.c  |   9 +-
 5 files changed, 184 insertions(+), 2 deletions(-)

diff --git a/include/linux/bpf_trace.h b/include/linux/bpf_trace.h
index 4a593827fd87..1fe73501809c 100644
--- a/include/linux/bpf_trace.h
+++ b/include/linux/bpf_trace.h
@@ -9,6 +9,12 @@
 struct bpf_raw_tracepoint {
struct bpf_raw_event_map *btp;
struct bpf_prog *prog;
+   /*
+* Multiple programs can be attached to a tracepoint,
+* All of these are linked to each other and can be reached
+* from the event's bpf_attach file in tracefs.
+*/
+   struct list_head event_attached;
 };
 
 struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char *tp_name, int prog_fd);
diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731673f7..525f2ac44aa3 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -371,6 +371,7 @@ struct trace_event_file {
struct trace_array  *tr;
struct trace_subsystem_dir  *system;
struct list_headtriggers;
+   struct list_headbpf_attached;
 
/*
 * 32 bit flags:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index c4b543bc617f..28621ad88c12 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1469,3 +1469,172 @@ struct bpf_raw_tracepoint *bpf_raw_tracepoint_open(char 
*tp_name, int prog_fd)
bpf_put_raw_tracepoint(btp);
return ERR_PTR(err);
 }
+
+enum event_bpf_cmd { BPF_ATTACH, BPF_DETACH };
+#define BPF_CMD_BUF_LEN 32
+
+static ssize_t
+event_bpf_attach_write(struct file *filp, const char __user *ubuf,
+   size_t cnt, loff_t *ppos)
+{
+   int err, prog_fd, cmd_num, len;
+   struct trace_event_call *call;
+   struct trace_event_file *file;
+   struct bpf_raw_tracepoint *raw_tp, *next;
+   char buf[BPF_CMD_BUF_LEN], *end, *tok;
+   enum event_bpf_cmd cmd;
+   struct bpf_prog *prog;
+   bool prog_put = true;
+
+   len = min((int)cnt, BPF_CMD_BUF_LEN - 1);
+
+   err = copy_from_user(buf, ubuf, len);
+   if (err)
+   return err;
+   buf[len] = 0;
+
+   /* Parse 2 arguments of format: : */
+   end = [0];
+   cmd_num = 1;
+   while (cmd_num < 3) {
+   tok = strsep(, ":");
+   if (!tok)
+   return -EINVAL;
+
+   switch (cmd_num) {
+   case 1:
+   if (!strncmp(tok, "attach", 6))
+   cmd = BPF_ATTACH;
+   else if (!strncmp(tok, "detach", 6))
+   cmd = BPF_DETACH;
+   else
+   return -EINVAL;
+   break;
+   case 2:
+   err = kstrtoint(tok, 10, _fd);
+   if (err)
+   return err;
+   break;
+   }
+   cmd_num++;
+   }
+   if (cmd_num != 3)
+   return -EINVAL;
+
+   file = event_file_data(filp);
+   /* Command is to attach fd to tracepoint */
+   if (cmd == BPF_ATTACH) {
+   mutex_lock(_mutex);
+   call = file->event_call;
+
+   raw_tp = bpf_raw_tracepoint_open((char *)call->tp->name,
+prog_fd);
+   if (IS_ERR(raw_tp)) {
+   mutex_unlock(_mutex);
+   return PTR_ERR(raw_tp);
+   }
+
+   list_add(_tp->event_attached, >bpf_attached);
+   mutex_unlock(_mutex);
+   *ppos += cnt;
+   return cnt;
+   }
+
+   /* Command is to detach fd from tracepoint */
+   prog = bpf_prog_get(prog_fd);
+   if (IS_ERR(prog))
+   return PTR_ERR(prog);
+
+   mutex_lock(_mutex);
+   list_for_each_entry_safe(raw_tp, next, >bpf_attached,
+event_attached) {
+   if (raw_tp->prog == prog) {
+   list_del(_tp->event_attached);
+   bpf_raw_tracepoint_close(raw_tp);
+   prog_put = false;
+   break;
+   }
+   }
+   mutex_unlock(_mutex);
+
+   if (prog_put)
+   bpf_prog_put(prog);
+   *ppos += cnt;
+  

[PATCH RFC 0/4] Add support to directly attach BPF program to ftrace

2019-07-10 Thread Joel Fernandes (Google)
Hi,
These patches make it possible to attach BPF programs directly to tracepoints
using ftrace (/sys/kernel/debug/tracing) without needing the process doing the
attach to be alive. This has the following benefits:

1. Simplified Security: In Android, we have finer-grained security controls
over specific ftrace trace events using SELinux labels. We already control
precisely who is allowed to enable an ftrace event. By adding a node to ftrace for
attaching BPF programs, we can use the same mechanism to further control who is
allowed to attach to a trace event.

2. Process lifetime: In Android we are adding use cases where a tracing program
needs to stay attached to a tracepoint for the full lifetime of the system, for
example to gather statistics, with no need to ever detach. With perf or
bpf(2)'s BPF_RAW_TRACEPOINT_OPEN, this means keeping a process alive all the
time.  However, in Android our BPF loader currently (for hardened security)
just starts a process at boot time, loads the BPF programs, and pins them to
/sys/fs/bpf.  We don't keep this process alive all the time. It is more
suitable to do a one-shot attach of the program using ftrace and not need a
long-lived process for this. Such a process also needs elevated privileges,
since tracepoint program loading currently requires CAP_SYS_ADMIN anyway; by
design, Android's bpfloader runs once at init and exits.

This series adds a new bpf file to /sys/kernel/debug/tracing/events/X/Y/bpf.
The following commands can be written into it:
attach: Attaches a BPF prog fd to the tracepoint
detach: Detaches a BPF prog fd from the tracepoint

Reading the bpf file will show all the programs attached to the tracepoint.
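
For illustration, a user-space caller could drive this interface roughly as in
the following sketch (it mirrors the libbpf helper added in patch 3/4; prog_fd
is assumed to be the fd of an already-loaded BPF program, and error handling
is minimal):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	static int event_bpf_attach(const char *subsys, const char *event,
				    int prog_fd)
	{
		char path[256], cmd[32];
		ssize_t len;
		int fd;

		snprintf(path, sizeof(path),
			 "/sys/kernel/debug/tracing/events/%s/%s/bpf",
			 subsys, event);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		/* The same node accepts "detach:<fd>" to detach again. */
		snprintf(cmd, sizeof(cmd), "attach:%d", prog_fd);
		len = write(fd, cmd, strlen(cmd));
		close(fd);
		return len == (ssize_t)strlen(cmd) ? 0 : -1;
	}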

Joel Fernandes (Google) (4):
Move bpf_raw_tracepoint functionality into bpf_trace.c
trace/bpf: Add support for attach/detach of ftrace events to BPF
lib/bpf: Add support for ftrace event attach and detach
selftests/bpf: Add test for ftrace-based BPF attach/detach

include/linux/bpf_trace.h |  16 ++
include/linux/trace_events.h  |   1 +
kernel/bpf/syscall.c  |  69 +-
kernel/trace/bpf_trace.c  | 225 ++
kernel/trace/trace.h  |   1 +
kernel/trace/trace_events.c   |   8 +
tools/lib/bpf/bpf.c   |  53 +
tools/lib/bpf/bpf.h   |   4 +
tools/lib/bpf/libbpf.map  |   2 +
.../raw_tp_writable_test_ftrace_run.c |  89 +++
10 files changed, 410 insertions(+), 58 deletions(-)
create mode 100644 
tools/testing/selftests/bpf/prog_tests/raw_tp_writable_test_ftrace_run.c

--
2.22.0.410.gd8fdbe21b5-goog



[RFC] Fix python feature detection

2019-07-07 Thread Joel Fernandes (Google)
I am having a hard time building BPF samples by doing a make in
samples/bpf. While debugging that, I ran into this Python issue, even
though the system has libpython2.7-dev installed:

If I just do a 'make' inside tools/build/feature/ I get:
Python.h: No such file or directory

This led me to this patch, which fixes Python feature detection for me.
I am not sure it is the right fix for Python, since it is hardcoded for
Python version 2, but I thought it could be useful.

My system is a Debian buster release.

Cc: a...@kernel.org
Cc: jo...@redhat.com
Signed-off-by: Joel Fernandes (Google) 
---
 tools/build/feature/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile
index 4b8244ee65ce..cde44cb38a5e 100644
--- a/tools/build/feature/Makefile
+++ b/tools/build/feature/Makefile
@@ -83,7 +83,7 @@ __BUILDXX = $(CXX) $(CXXFLAGS) -MD -Wall -Werror -o $@ 
$(patsubst %.bin,%.cpp,$(
 ###
 
 $(OUTPUT)test-all.bin:
-   $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf 
-lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs 
--cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) 
-DPACKAGE='"perf"' -lbfd -ldl -lz -llzma
+   $(BUILD) -fstack-protector-all -O2 -D_FORTIFY_SOURCE=2 -ldw -lelf 
-lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs 
--cflags gtk+-2.0 2>/dev/null) $(shell $(PKG_CONFIG) --libs --cflags python2 
2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd 
-ldl -lz -llzma
 
 $(OUTPUT)test-hello.bin:
$(BUILD)
@@ -205,7 +205,7 @@ $(OUTPUT)test-libperl.bin:
$(BUILD) $(FLAGS_PERL_EMBED)
 
 $(OUTPUT)test-libpython.bin:
-   $(BUILD) $(FLAGS_PYTHON_EMBED)
+   $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags python2 2>/dev/null) 
$(FLAGS_PYTHON_EMBED)
 
 $(OUTPUT)test-libpython-version.bin:
$(BUILD)
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH] rcuperf: Make rcuperf kernel test more robust for !expedited mode

2019-07-03 Thread Joel Fernandes (Google)
It is possible that the rcuperf kernel test runs concurrently with init
starting up.  During this time, the system is running all grace periods
as expedited.  However, rcuperf can also be run for normal GP tests.
Right now, it depends on a holdoff time before starting the test to
ensure grace periods start later. This works fine with the default
holdoff time; however, it is not robust in situations where init takes
longer than the holdoff time to finish running. Or, as in my case:

I modified the rcuperf test locally to also run a thread that did
preempt disable/enable in a loop. This had the effect of slowing down
init. The end result was that the "batches:" counter in rcuperf was 0,
causing a division-by-0 error in the results. This counter was 0 because
only expedited GPs seemed to happen, not normal ones, which left the
rcu_state.gp_seq counter constant across grace periods that were
unexpectedly expedited. The system was running expedited RCU all the
time because rcu_unexpedite_gp() had not yet been run from init.  In
other words, the test ran concurrently with init, while the system was
still booting in expedited GP mode.

To fix this properly, let us check whether system_state is
SYSTEM_RUNNING before starting the test. The system_state check
approximately aligns with when rcu_unexpedite_gp() is called and works
well in practice.

I also tried late_initcall(); however, it is still too early to be
meaningful for this case.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/rcuperf.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 4513807cd4c4..5a879d073c1c 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -375,6 +375,14 @@ rcu_perf_writer(void *arg)
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
 
+   /*
+* Wait until rcu_end_inkernel_boot() is called for normal GP tests
+* so that RCU is not always expedited for normal GP tests.
+* The system_state test is approximate, but works well in practice.
+*/
+   while (!gp_exp && system_state != SYSTEM_RUNNING)
+   schedule_timeout_uninterruptible(1);
+
t = ktime_get_mono_fast_ns();
if (atomic_inc_return(_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
-- 
2.22.0.410.gd8fdbe21b5-goog



[RFC] rcuperf: Make rcuperf test more robust for !expedited mode

2019-07-02 Thread Joel Fernandes (Google)
It is possible that rcuperf runs concurrently with init starting up.
During this time, the system is running all grace periods as expedited.
However, rcuperf can also be run in normal mode. The rcuperf test
depends on a holdoff before starting the test to ensure grace periods
start later. This works fine with the default holdoff time; however, it
is not robust in situations where init takes longer than the holdoff
time to finish running. Or, as in my case:

I modified the rcuperf test locally to also run a thread that did
preempt disable/enable in a loop. This had the effect of slowing down
init. The end result was "batches:" counter was 0. This was because only
expedited GPs seem to happen, not normal ones which led to the
rcu_state.gp_seq counter remaining constant across grace periods which
unexpectedly happen to be expedited.

This led me to debug that even though the test could be for normal GP
performance, because init has still not run enough, the
rcu_unexpedited_gp() call would not have run yet. In other words, the
test would concurrently with init booting in expedited GP mode.

To fix this properly, let us just check whether rcu_unexpedite_gp()
has been called before starting the writer test. With this, the holdoff
parameter could also be dropped or reduced to speed up the test.

Signed-off-by: Joel Fernandes (Google) 
---
Please consider this patch as an RFC only! This is the first time I am
running the RCU performance tests, thanks!

Question:
I actually did not know that expedited GPs do not increment
rcu_state.gp_seq. Do expedited GPs not go through the same RCU-tree
machinery as non-expedited ones? If they do, why doesn't rcu_state.gp_seq
increment when we are expedited? If they don't, why not?

 kernel/rcu/rcu.h | 2 ++
 kernel/rcu/rcuperf.c | 5 +
 kernel/rcu/update.c  | 9 +
 3 files changed, 16 insertions(+)

diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 8fd4f82c9b3d..5d30dbc7000b 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -429,12 +429,14 @@ static inline void srcu_init(void) { }
 static inline bool rcu_gp_is_normal(void) { return true; }
 static inline bool rcu_gp_is_expedited(void) { return false; }
 static inline void rcu_expedite_gp(void) { }
+static inline bool rcu_expedite_gp_called(void) { return false; }
 static inline void rcu_unexpedite_gp(void) { }
 static inline void rcu_request_urgent_qs_task(struct task_struct *t) { }
 #else /* #ifdef CONFIG_TINY_RCU */
 bool rcu_gp_is_normal(void); /* Internal RCU use. */
 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
 void rcu_expedite_gp(void);
+bool rcu_expedite_gp_called(void);
 void rcu_unexpedite_gp(void);
 void rcupdate_announce_bootup_oddness(void);
 void rcu_request_urgent_qs_task(struct task_struct *t);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 4513807cd4c4..9902857d3cc6 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -375,6 +375,11 @@ rcu_perf_writer(void *arg)
if (holdoff)
schedule_timeout_uninterruptible(holdoff * HZ);
 
+   // Wait for rcu_unexpedite_gp() to be called from init to avoid
+   // doing expedited GPs if we are not supposed to
+   while (!gp_exp && rcu_expedite_gp_called())
+   schedule_timeout_uninterruptible(1);
+
t = ktime_get_mono_fast_ns();
if (atomic_inc_return(_rcu_perf_writer_started) >= nrealwriters) {
t_rcu_perf_writer_started = t;
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 249517058b13..840f62805d62 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -154,6 +154,15 @@ void rcu_expedite_gp(void)
 }
 EXPORT_SYMBOL_GPL(rcu_expedite_gp);
 
+/**
+ * rcu_expedite_gp_called - Was there a prior call to rcu_expedite_gp()?
+ */
+bool rcu_expedite_gp_called(void)
+{
+   return (atomic_read(_expedited_nesting) != 0);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp_called);
+
 /**
  * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
  *
-- 
2.22.0.410.gd8fdbe21b5-goog


[PATCH v3] Convert struct pid count to refcount_t

2019-07-01 Thread Joel Fernandes (Google)
struct pid's count is an atomic_t field used as a refcount. Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.

For memory ordering, the only change is with the following:
 -  if ((atomic_read(>count) == 1) ||
 -   atomic_dec_and_test(>count)) {
 +  if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);

Here the change is from:
Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
This ACQUIRE should take care of making sure the free happens after the
refcount_dec_and_test().

The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is. The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.
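
As an illustrative sequence (not an actual code path in kernel/pid.c, and
needing <linux/refcount.h>) of the kind of bug the refcount_t conversion
catches:

	refcount_t count = REFCOUNT_INIT(1);

	if (refcount_dec_and_test(&count)) {
		/* Last reference dropped: the object would be freed here. */
	}

	/*
	 * A stale get_pid()-style increment after that point is now caught:
	 * refcount_inc() on a zero count WARNs and saturates instead of
	 * silently resurrecting the freed object the way atomic_inc() would.
	 */
	refcount_inc(&count);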

Cc: mathieu.desnoy...@efficios.com
Cc: wi...@infradead.org
Cc: pet...@infradead.org
Cc: will.dea...@arm.com
Cc: paul...@linux.vnet.ibm.com
Cc: elena.reshet...@intel.com
Cc: keesc...@chromium.org
Cc: Andrea Parri 
Cc: kernel-t...@android.com
Cc: kernel-harden...@lists.openwall.com
Cc: ja...@google.com
Reviewed-by: keesc...@chromium.org
Reviewed-by: Andrea Parri 
Signed-off-by: Joel Fernandes (Google) 

---
v1->v2 is to get rid of the atomic_read().
v2->v3 replaces ATOMIC_INIT with REFCOUNT_INIT

 include/linux/pid.h | 5 +++--
 kernel/pid.c| 9 -
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..8cb86d377ff5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include 
+#include 
 
 enum pid_type
 {
@@ -56,7 +57,7 @@ struct upid {
 
 struct pid
 {
-   atomic_t count;
+   refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
 static inline struct pid *get_pid(struct pid *pid)
 {
if (pid)
-   atomic_inc(>count);
+   refcount_inc(>count);
return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..86b526bd59e1 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,12 +37,12 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 
 struct pid init_struct_pid = {
-   .count  = ATOMIC_INIT(1),
+   .count  = REFCOUNT_INIT(1),
.tasks  = {
{ .first = NULL },
{ .first = NULL },
@@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
return;
 
ns = pid->numbers[pid->level].ns;
-   if ((atomic_read(>count) == 1) ||
-atomic_dec_and_test(>count)) {
+   if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
 
get_pid_ns(ns);
-   atomic_set(>count, 1);
+   refcount_set(>count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(>tasks[type]);
 
-- 
2.22.0.410.gd8fdbe21b5-goog


[RFC 1/3] rcu: Expedite the rcu quiescent state reporting if help needed

2019-06-30 Thread Joel Fernandes (Google)
The t->rcu_read_unlock_special union's need_qs bit can be set by the
scheduler tick (in rcu_flavor_sched_clock_irq) to indicate that help is
needed from the rcu_read_unlock() path. When this help arrives, however,
we can do better at speeding up the quiescent-state reporting, which may
be quite urgent if rcu_read_unlock_special::need_qs is set. Make use of
this information when deciding whether to do heavy-weight softirq
raising, where possible.

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree_plugin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c588ef98efd3..bff6410fac06 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -622,7 +622,8 @@ static void rcu_read_unlock_special(struct task_struct *t)
t->rcu_read_unlock_special.b.exp_hint = false;
exp = (t->rcu_blocked_node && t->rcu_blocked_node->exp_tasks) ||
  (rdp->grpmask & rnp->expmask) ||
- tick_nohz_full_cpu(rdp->cpu);
+ tick_nohz_full_cpu(rdp->cpu)  ||
+ t->rcu_read_unlock_special.b.need_qs;
// Need to defer quiescent state until everything is enabled.
if (irqs_were_disabled && use_softirq &&
(in_interrupt() ||
-- 
2.22.0.410.gd8fdbe21b5-goog



[RFC 2/3] rcu: Simplify rcu_note_context_switch exit from critical section

2019-06-30 Thread Joel Fernandes (Google)
rcu_preempt_note_context_switch() tries to handle cases where
__rcu_read_unlock() was preempted; the context-switch path then reports
the quiescent state and clears any bits in the rcu_read_unlock_special
union.

This can be handled by just calling rcu_preempt_deferred_qs(), which was
added during the RCU consolidation work and already does these checks.

Tested RCU config TREE03 for an hour which succeeds.

Cc: r...@vger.kernel.org
Cc: kernel-t...@android.com
Signed-off-by: Joel Fernandes (Google) 
---
 kernel/rcu/tree_plugin.h | 9 -
 1 file changed, 9 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index bff6410fac06..ebb4d46a6267 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -313,15 +313,6 @@ void rcu_note_context_switch(bool preempt)
   ? rnp->gp_seq
   : rcu_seq_snap(>gp_seq));
rcu_preempt_ctxt_queue(rnp, rdp);
-   } else if (t->rcu_read_lock_nesting < 0 &&
-  t->rcu_read_unlock_special.s) {
-
-   /*
-* Complete exit from RCU read-side critical section on
-* behalf of preempted instance of __rcu_read_unlock().
-*/
-   rcu_read_unlock_special(t);
-   rcu_preempt_deferred_qs(t);
} else {
rcu_preempt_deferred_qs(t);
}
-- 
2.22.0.410.gd8fdbe21b5-goog



[RFC 3/3] Revert "rcutorture: Tweak kvm options"

2019-06-30 Thread Joel Fernandes (Google)
This reverts commit a6fda6dab93c2c06ef4b8cb4b9258df6674d2438 which
causes kvm.sh to not run on my machines. The qemu-system-x86_64 command
runs but does nothing.

Signed-off-by: Joel Fernandes (Google) 
---
I am Ok if we want to drop this patch but it is in my tree because
without it I can't run the tests.

 tools/testing/selftests/rcutorture/bin/functions.sh | 13 +
 .../selftests/rcutorture/configs/rcu/CFcommon   |  3 ---
 2 files changed, 1 insertion(+), 15 deletions(-)

diff --git a/tools/testing/selftests/rcutorture/bin/functions.sh 
b/tools/testing/selftests/rcutorture/bin/functions.sh
index c3a49fb4d6f6..6bcb8b5b2ff2 100644
--- a/tools/testing/selftests/rcutorture/bin/functions.sh
+++ b/tools/testing/selftests/rcutorture/bin/functions.sh
@@ -172,7 +172,7 @@ identify_qemu_append () {
local console=ttyS0
case "$1" in
qemu-system-x86_64|qemu-system-i386)
-   echo selinux=0 initcall_debug debug
+   echo noapic selinux=0 initcall_debug debug
;;
qemu-system-aarch64)
console=ttyAMA0
@@ -191,19 +191,8 @@ identify_qemu_append () {
 # Output arguments for qemu arguments based on the TORTURE_QEMU_MAC
 # and TORTURE_QEMU_INTERACTIVE environment variables.
 identify_qemu_args () {
-   local KVM_CPU=""
-   case "$1" in
-   qemu-system-x86_64)
-   KVM_CPU=kvm64
-   ;;
-   qemu-system-i386)
-   KVM_CPU=kvm32
-   ;;
-   esac
case "$1" in
qemu-system-x86_64|qemu-system-i386)
-   echo -machine q35,accel=kvm
-   echo -cpu ${KVM_CPU}
;;
qemu-system-aarch64)
echo -machine virt,gic-version=host -cpu host
diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon 
b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index e19a444a0684..d2d2a86139db 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,5 +1,2 @@
 CONFIG_RCU_TORTURE_TEST=y
 CONFIG_PRINTK_TIME=y
-CONFIG_HYPERVISOR_GUEST=y
-CONFIG_PARAVIRT=y
-CONFIG_KVM_GUEST=y
-- 
2.22.0.410.gd8fdbe21b5-goog



[PATCH v2] Convert struct pid count to refcount_t

2019-06-28 Thread Joel Fernandes (Google)
struct pid's count is an atomic_t field used as a refcount. Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.

For memory ordering, the only change is with the following:
 -  if ((atomic_read(>count) == 1) ||
 -   atomic_dec_and_test(>count)) {
 +  if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);

Here the change is from:
Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
This ACQUIRE should take care of making sure the free happens after the
refcount_dec_and_test().

The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is. The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.

Cc: mathieu.desnoy...@efficios.com
Cc: wi...@infradead.org
Cc: pet...@infradead.org
Cc: will.dea...@arm.com
Cc: paul...@linux.vnet.ibm.com
Cc: elena.reshet...@intel.com
Cc: keesc...@chromium.org
Cc: kernel-t...@android.com
Cc: kernel-harden...@lists.openwall.com
Signed-off-by: Joel Fernandes (Google) 

---
Only change from v1->v2 is to get rid of the atomic_read().

 include/linux/pid.h | 5 +++--
 kernel/pid.c| 7 +++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..8cb86d377ff5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include 
+#include 
 
 enum pid_type
 {
@@ -56,7 +57,7 @@ struct upid {
 
 struct pid
 {
-   atomic_t count;
+   refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
 static inline struct pid *get_pid(struct pid *pid)
 {
if (pid)
-   atomic_inc(>count);
+   refcount_inc(>count);
return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..89c4849fab5d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 
@@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
return;
 
ns = pid->numbers[pid->level].ns;
-   if ((atomic_read(>count) == 1) ||
-atomic_dec_and_test(>count)) {
+   if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
 
get_pid_ns(ns);
-   atomic_set(>count, 1);
+   refcount_set(>count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(>tasks[type]);
 
-- 
2.22.0.410.gd8fdbe21b5-goog


[PATCH RFC v2] Convert struct pid count to refcount_t

2019-06-24 Thread Joel Fernandes (Google)
struct pid's count is an atomic_t field used as a refcount. Use
refcount_t for it which is basically atomic_t but does additional
checking to prevent use-after-free bugs.

For memory ordering, the only change is with the following:
 -  if ((atomic_read(>count) == 1) ||
 -   atomic_dec_and_test(>count)) {
 +  if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);

Here the change is from:
Fully ordered --> RELEASE + ACQUIRE (as per refcount-vs-atomic.rst)
This ACQUIRE should take care of making sure the free happens after the
refcount_dec_and_test().

The above hunk also removes atomic_read() since it is not needed for the
code to work and it is unclear how beneficial it is. The removal lets
refcount_dec_and_test() check for cases where get_pid() happened before
the object was freed.

Cc: ja...@google.com
Cc: o...@redhat.com
Cc: mathieu.desnoy...@efficios.com
Cc: wi...@infradead.org
Cc: pet...@infradead.org
Cc: will.dea...@arm.com
Cc: paul...@linux.vnet.ibm.com
Cc: elena.reshet...@intel.com
Cc: keesc...@chromium.org
Cc: kernel-t...@android.com
Cc: kernel-harden...@lists.openwall.com
Signed-off-by: Joel Fernandes (Google) 

---
Changed to RFC to get any feedback on the memory ordering.


 include/linux/pid.h | 5 +++--
 kernel/pid.c| 7 +++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/linux/pid.h b/include/linux/pid.h
index 14a9a39da9c7..8cb86d377ff5 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -3,6 +3,7 @@
 #define _LINUX_PID_H
 
 #include 
+#include 
 
 enum pid_type
 {
@@ -56,7 +57,7 @@ struct upid {
 
 struct pid
 {
-   atomic_t count;
+   refcount_t count;
unsigned int level;
/* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX];
@@ -69,7 +70,7 @@ extern struct pid init_struct_pid;
 static inline struct pid *get_pid(struct pid *pid)
 {
if (pid)
-   atomic_inc(>count);
+   refcount_inc(>count);
return pid;
 }
 
diff --git a/kernel/pid.c b/kernel/pid.c
index 20881598bdfa..89c4849fab5d 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -37,7 +37,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 #include 
 
@@ -106,8 +106,7 @@ void put_pid(struct pid *pid)
return;
 
ns = pid->numbers[pid->level].ns;
-   if ((atomic_read(>count) == 1) ||
-atomic_dec_and_test(>count)) {
+   if (refcount_dec_and_test(>count)) {
kmem_cache_free(ns->pid_cachep, pid);
put_pid_ns(ns);
}
@@ -210,7 +209,7 @@ struct pid *alloc_pid(struct pid_namespace *ns)
}
 
get_pid_ns(ns);
-   atomic_set(>count, 1);
+   refcount_set(>count, 1);
for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(>tasks[type]);
 
-- 
2.22.0.410.gd8fdbe21b5-goog


[PATCH BACKPORT Android 4.9]: mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()

2019-06-12 Thread Joel Fernandes (Google)
From: Joel Fernandes 

Johannes, all, could you take a look at the below backport of this fix,
which I am applying to our Android 4.9 kernel? Since lruvec stats are not
present in that kernel and I did not want to backport them, I added my
own mem_cgroup_update_stat() functions, which should be sufficient for
this fix. Does this patch look good to you? Thanks for the help.

(Joel: Fixed conflicts and added new memcg stats functions)
(Cherry-picked from 739f79fc9db1)

Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:

BUG: unable to handle kernel NULL pointer dereference at 03b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
 
 end_page_writeback+0x47/0x70
 f2fs_write_end_io+0x76/0x180 [f2fs]
 bio_endio+0x9f/0x120
 blk_update_request+0xa8/0x2f0
 scsi_end_request+0x39/0x1d0
 scsi_io_completion+0x211/0x690
 scsi_finish_command+0xd9/0x120
 scsi_softirq_done+0x127/0x150
 __blk_mq_complete_request_remote+0x13/0x20
 flush_smp_call_function_queue+0x56/0x110
 generic_smp_call_function_single_interrupt+0x13/0x30
 smp_call_function_single_interrupt+0x27/0x40
 call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10

(gdb) l *(test_clear_page_writeback+0x12e)
0x811bae3e is in test_clear_page_writeback 
(./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,

The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback().  The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles.  But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.

It appears this race window has been around for a while, but less likely
to trigger when the memcg stats were updated first thing after
PG_writeback is cleared.  Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.

Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, then not use that pointer afterward.  It
is a partial revert of 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
but leaves the pageref-holding callsites that aren't affected alone.

Change-Id: I692226d6f183c11c27ed096967e6a5face3b9741
Link: http://lkml.kernel.org/r/20170809183825.ga26...@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner 
Reported-by: Jaegeuk Kim 
Tested-by: Jaegeuk Kim 
Reported-by: Bradley Bolen 
Tested-by: Brad Bolen 
Cc: Vladimir Davydov 
Cc: Michal Hocko 
Cc: [4.6+]
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 
Signed-off-by: Joel Fernandes 

---
 include/linux/memcontrol.h | 31 +--
 mm/memcontrol.c| 43 +++---
 mm/page-writeback.c| 14 ++---
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8b35bdbdc214c..f9e02fd7e86b7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -490,7 +490,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 /**
@@ -529,6 +530,27 @@ static inline void mem_cgroup_dec_page_stat(struct page 
*page,
mem_cgroup_update_page_stat(page, idx, -1);
 }
 
+static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg,
+enum mem_cgroup_stat_index idx, int val)
+{
+   VM_BUG_ON(!(rcu_read_lock_held()));
+
+   if (memcg)
+   this_cpu_add(memcg->stat->count[idx], val);
+}
+
+static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg,
+   enum mem_cgroup_stat_index idx)
+{
+   mem_cgroup_update_stat(memcg, idx, 1);
+}
+
+static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg,
+   enum mem_cgroup_stat_index idx)
+{
+   

[PATCH BACKPORT Android 4.9]: mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()

2019-06-12 Thread Joel Fernandes (Google)
From: Joel Fernandes 

Johannes, all, could you take a look at the below backport of this fix
which I am apply for our Android 4.9 kernel? Since lruvec stats are not
present in the kernel and I did not want to backport that, I added my
own mem_cgroup_update_stat functions which should be sufficient for this
fix. Does this patch look good to you? Thanks for the help.

(Joel: Fixed conflicts and added new memcg stats functions)
(Cherry-picked from 739f79fc9db1)

Jaegeuk and Brad report a NULL pointer crash when writeback ending tries
to update the memcg stats:

BUG: unable to handle kernel NULL pointer dereference at 03b0
IP: test_clear_page_writeback+0x12e/0x2c0
[...]
RIP: 0010:test_clear_page_writeback+0x12e/0x2c0
Call Trace:
 
 end_page_writeback+0x47/0x70
 f2fs_write_end_io+0x76/0x180 [f2fs]
 bio_endio+0x9f/0x120
 blk_update_request+0xa8/0x2f0
 scsi_end_request+0x39/0x1d0
 scsi_io_completion+0x211/0x690
 scsi_finish_command+0xd9/0x120
 scsi_softirq_done+0x127/0x150
 __blk_mq_complete_request_remote+0x13/0x20
 flush_smp_call_function_queue+0x56/0x110
 generic_smp_call_function_single_interrupt+0x13/0x30
 smp_call_function_single_interrupt+0x27/0x40
 call_function_single_interrupt+0x89/0x90
RIP: 0010:native_safe_halt+0x6/0x10

(gdb) l *(test_clear_page_writeback+0x12e)
0x811bae3e is in test_clear_page_writeback 
(./include/linux/memcontrol.h:619).
614 mod_node_page_state(page_pgdat(page), idx, val);
615 if (mem_cgroup_disabled() || !page->mem_cgroup)
616 return;
617 mod_memcg_state(page->mem_cgroup, idx, val);
618 pn = page->mem_cgroup->nodeinfo[page_to_nid(page)];
619 this_cpu_add(pn->lruvec_stat->count[idx], val);
620 }
621
622 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
623 gfp_t gfp_mask,

The issue is that writeback doesn't hold a page reference and the page
might get freed after PG_writeback is cleared (and the mapping is
unlocked) in test_clear_page_writeback().  The stat functions looking up
the page's node or zone are safe, as those attributes are static across
allocation and free cycles.  But page->mem_cgroup is not, and it will
get cleared if we race with truncation or migration.

It appears this race window has been around for a while, but it was less
likely to trigger when the memcg stats were updated first thing after
PG_writeback is cleared.  Recent changes reshuffled this code to update
the global node stats before the memcg ones, though, stretching the race
window out to an extent where people can reproduce the problem.

Update test_clear_page_writeback() to look up and pin page->mem_cgroup
before clearing PG_writeback, and to use that pinned pointer (rather than
page->mem_cgroup) for the stat update afterward.  It is a partial revert
of 62cccb8c8e7a ("mm: simplify lock_page_memcg()") but leaves the
pageref-holding callsites that aren't affected alone.
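
For reference, a rough sketch of the resulting flow is below. This is only an
illustration under a few assumptions (the mm/page-writeback.c hunk is cut off
by the archive; the real function also clears the radix-tree writeback tag and
does bdi accounting, which are omitted here; MEM_CGROUP_STAT_WRITEBACK is
assumed to be the 4.9-era stat index):

        int test_clear_page_writeback(struct page *page)
        {
                struct mem_cgroup *memcg;
                int ret;

                /* Pins page->mem_cgroup (if any) until __unlock_page_memcg(). */
                memcg = lock_page_memcg(page);

                ret = TestClearPageWriteback(page);
                if (ret) {
                        dec_node_page_state(page, NR_WRITEBACK);
                        /*
                         * Use the pinned memcg, not page->mem_cgroup, which
                         * can be cleared by truncation or migration once
                         * PG_writeback is off and the page can be freed.
                         */
                        mem_cgroup_dec_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
                }

                __unlock_page_memcg(memcg);
                return ret;
        }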

Change-Id: I692226d6f183c11c27ed096967e6a5face3b9741
Link: http://lkml.kernel.org/r/20170809183825.ga26...@cmpxchg.org
Fixes: 62cccb8c8e7a ("mm: simplify lock_page_memcg()")
Signed-off-by: Johannes Weiner 
Reported-by: Jaegeuk Kim 
Tested-by: Jaegeuk Kim 
Reported-by: Bradley Bolen 
Tested-by: Brad Bolen 
Cc: Vladimir Davydov 
Cc: Michal Hocko 
Cc: [4.6+]
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 
Signed-off-by: Joel Fernandes 

---
 include/linux/memcontrol.h | 31 +--
 mm/memcontrol.c| 43 +++---
 mm/page-writeback.c| 14 ++---
 3 files changed, 71 insertions(+), 17 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 8b35bdbdc214c..f9e02fd7e86b7 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -490,7 +490,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 /**
@@ -529,6 +530,27 @@ static inline void mem_cgroup_dec_page_stat(struct page *page,
mem_cgroup_update_page_stat(page, idx, -1);
 }
 
+static inline void mem_cgroup_update_stat(struct mem_cgroup *memcg,
+enum mem_cgroup_stat_index idx, int val)
+{
+   VM_BUG_ON(!(rcu_read_lock_held()));
+
+   if (memcg)
+   this_cpu_add(memcg->stat->count[idx], val);
+}
+
+static inline void mem_cgroup_inc_stat(struct mem_cgroup *memcg,
+   enum mem_cgroup_stat_index idx)
+{
+   mem_cgroup_update_stat(memcg, idx, 1);
+}
+
+static inline void mem_cgroup_dec_stat(struct mem_cgroup *memcg,
+   enum mem_cgroup_stat_index idx)
+{
+   

[RFC 4/6] workqueue: Convert for_each_wq to use built-in list check

2019-06-01 Thread Joel Fernandes (Google)
list_for_each_entry_rcu() now has support to check for RCU reader sections
as well as locks. Just use that built-in support instead of explicitly
checking in the caller.
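
As a usage sketch, either of the two callers below now satisfies the macro's
check without any open-coded assertion (inspect_pwq() is a made-up helper for
illustration):

        struct pool_workqueue *pwq;

        /* Holding wq->mutex satisfies the lockdep expression... */
        mutex_lock(&wq->mutex);
        for_each_pwq(pwq, wq)
                inspect_pwq(pwq);
        mutex_unlock(&wq->mutex);

        /* ...and so does an RCU read-side critical section. */
        rcu_read_lock();
        for_each_pwq(pwq, wq)
                inspect_pwq(pwq);
        rcu_read_unlock();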

Signed-off-by: Joel Fernandes (Google) 
---
 kernel/workqueue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9657315405de..91ed7aca16e5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -424,9 +424,8 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq);
  * ignored.
  */
 #define for_each_pwq(pwq, wq)  \
-   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node)  \
-   if (({ assert_rcu_or_wq_mutex(wq); false; })) { }   \
-   else
+   list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node,  \
+lock_is_held(&(wq->mutex).dep_map))
 
 #ifdef CONFIG_DEBUG_OBJECTS_WORK
 
-- 
2.22.0.rc1.311.g5d7573a151-goog



[RFC 3/6] driver/core: Convert to use built-in RCU list checking

2019-06-01 Thread Joel Fernandes (Google)
list_for_each_entry_rcu has built-in RCU and lock checking. Make use of
it in driver core.
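
For example, a walk of the supplier links now looks roughly like the sketch
below (illustrative only; the real callers are in the runtime PM hunks that
follow). The helper added here tells the list macro that the device-links
"lock" may be SRCU or a rwsem depending on CONFIG_SRCU, which a plain
rcu_read_lock_held() check cannot express:

        struct device_link *link;
        int idx;

        idx = device_links_read_lock();
        list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
                                device_links_read_lock_held()) {
                /* inspect link->supplier here */
        }
        device_links_read_unlock(idx);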

Signed-off-by: Joel Fernandes (Google) 
---
 drivers/base/base.h  |  1 +
 drivers/base/core.c  | 10 ++
 drivers/base/power/runtime.c | 15 ++-
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/drivers/base/base.h b/drivers/base/base.h
index b405436ee28e..0d32544b6f91 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -165,6 +165,7 @@ static inline int devtmpfs_init(void) { return 0; }
 /* Device links support */
 extern int device_links_read_lock(void);
 extern void device_links_read_unlock(int idx);
+extern int device_links_read_lock_held(void);
 extern int device_links_check_suppliers(struct device *dev);
 extern void device_links_driver_bound(struct device *dev);
 extern void device_links_driver_cleanup(struct device *dev);
diff --git a/drivers/base/core.c b/drivers/base/core.c
index fd7511e04e62..6c5ca9685647 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -68,6 +68,11 @@ void device_links_read_unlock(int idx)
 {
srcu_read_unlock(&device_links_srcu, idx);
 }
+
+int device_links_read_lock_held(void)
+{
+   return srcu_read_lock_held(&device_links_srcu);
+}
 #else /* !CONFIG_SRCU */
 static DECLARE_RWSEM(device_links_lock);
 
@@ -91,6 +96,11 @@ void device_links_read_unlock(int not_used)
 {
up_read(&device_links_lock);
 }
+
+int device_links_read_lock_held(void)
+{
+   return lock_is_held(&device_links_lock);
+}
 #endif /* !CONFIG_SRCU */
 
 /**
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 952a1e7057c7..7a10e8379a70 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -287,7 +287,8 @@ static int rpm_get_suppliers(struct device *dev)
 {
struct device_link *link;
 
-   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) {
+   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held()) {
int retval;
 
if (!(link->flags & DL_FLAG_PM_RUNTIME) ||
@@ -309,7 +310,8 @@ static void rpm_put_suppliers(struct device *dev)
 {
struct device_link *link;
 
-   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node) {
+   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held()) {
if (READ_ONCE(link->status) == DL_STATE_SUPPLIER_UNBIND)
continue;
 
@@ -1640,7 +1642,8 @@ void pm_runtime_clean_up_links(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, &dev->links.consumers, s_node) {
+   list_for_each_entry_rcu(link, &dev->links.consumers, s_node,
+   device_links_read_lock_held()) {
if (link->flags & DL_FLAG_STATELESS)
continue;
 
@@ -1662,7 +1665,8 @@ void pm_runtime_get_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->flags & DL_FLAG_PM_RUNTIME) {
link->supplier_preactivated = true;
refcount_inc(&link->rpm_active);
@@ -1683,7 +1687,8 @@ void pm_runtime_put_suppliers(struct device *dev)
 
idx = device_links_read_lock();
 
-   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node)
+   list_for_each_entry_rcu(link, &dev->links.suppliers, c_node,
+   device_links_read_lock_held())
if (link->supplier_preactivated) {
link->supplier_preactivated = false;
if (refcount_dec_not_one(&link->rpm_active))
-- 
2.22.0.rc1.311.g5d7573a151-goog



[RFC 0/6] Harden list_for_each_entry_rcu() and family

2019-06-01 Thread Joel Fernandes (Google)
Hi,
Please consider this as an RFC / proof-of-concept to gather some feedback. This
series aims to provide lockdep checking to RCU list macros.

RCU has a number of primitives for "consumption" of an RCU protected pointer.
Most of the time, these consumers make sure that such accesses are either
within an RCU reader section (using rcu_dereference{,_bh,_sched}()) or under a
lock (using rcu_dereference_protected()).

However, there are other ways to consume RCU pointers, such as
list_for_each_entry_rcu() or hlist_for_each_entry_rcu(). Unlike the
rcu_dereference family, these consumers do no lockdep checking at all. And
with the growing number of RCU list uses, it is possible for bugs that lockdep
checks could catch to creep in and go unnoticed.

Since the RCU flavor consolidation effort last year, the traditional RCU
flavors (preempt, bh, sched) have all been consolidated. In other words, any
of these flavors can begin a reader section, and all of them must cease before
the reader section is considered to be unlocked.

Now, list_for_each_entry_rcu() and family differ from the rcu_dereference
family in that there are no _bh or _sched variants of these macros. They are
used under many different RCU reader flavors, and also under SRCU. This series
adds a new internal function, rcu_read_lock_any_held(), which checks whether
any reader section is active at all when these macros are called. If no RCU
reader section exists, an optional fourth argument to
list_for_each_entry_rcu() can supply a lockdep expression that is evaluated
instead (similar to how rcu_dereference_check() works).
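
Concretely, intended usage looks like the sketch below (my_lock, my_list,
struct foo and use() are made-up names for illustration):

        struct foo *p;

        /* Plain RCU reader: no extra argument needed. */
        rcu_read_lock();
        list_for_each_entry_rcu(p, &my_list, node)
                use(p);
        rcu_read_unlock();

        /* Lock-protected caller: pass a lockdep expression instead. */
        mutex_lock(&my_lock);
        list_for_each_entry_rcu(p, &my_list, node,
                                lockdep_is_held(&my_lock))
                use(p);
        mutex_unlock(&my_lock);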

The optional-argument trick to list_for_each_entry_rcu() could also be used in
the future to possibly remove the rcu_dereference_{,bh,sched}_protected() APIs
by passing an optional lockdep expression to rcu_dereference() itself, thus
eliminating three more RCU APIs.

Note that some list macro wrappers already do their own lockdep checking on
the caller side. These can be eliminated in favor of the built-in lockdep
checking that this series adds to the list macros. For example, the workqueue
code has an assert_rcu_or_wq_mutex() function which is called from
for_each_pwq(); this series replaces that in favor of the built-in check.

Also in the future, we can extend these checks to list_entry_rcu() and other
list macros as well.

Joel Fernandes (Google) (6):
rcu: Add support for consolidated-RCU reader checking
ipv4: add lockdep condition to fix for_each_entry
driver/core: Convert to use built-in RCU list checking
workqueue: Convert for_each_wq to use built-in list check
x86/pci: Pass lockdep condition to pci_mmcfg_list iterator
acpi: Use built-in RCU list checking for acpi_ioremaps list

arch/x86/pci/mmconfig-shared.c |  5 +++--
drivers/acpi/osl.c |  6 +++--
drivers/base/base.h|  1 +
drivers/base/core.c| 10 +
drivers/base/power/runtime.c   | 15 -
include/linux/rculist.h| 40 ++
include/linux/rcupdate.h   |  7 ++
kernel/rcu/update.c| 26 ++
kernel/workqueue.c |  5 ++---
net/ipv4/fib_frontend.c|  3 ++-
10 files changed, 101 insertions(+), 17 deletions(-)

--
2.22.0.rc1.311.g5d7573a151-goog



[RFC 5/6] x86/pci: Pass lockdep condition to pci_mmcfg_list iterator

2019-06-01 Thread Joel Fernandes (Google)
The pci_mmcfg_list is traversed with list_for_each_entry_rcu() without an
RCU read lock held, because the pci_mmcfg_lock is already held. Make this
known to the list macro so that it fixes the new lockdep warnings that
trigger due to the lockdep checks added to list_for_each_entry_rcu().

Signed-off-by: Joel Fernandes (Google) 
---
 arch/x86/pci/mmconfig-shared.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 7389db538c30..6fa42e9c4e6f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -29,6 +29,7 @@
 static bool pci_mmcfg_running_state;
 static bool pci_mmcfg_arch_init_failed;
 static DEFINE_MUTEX(pci_mmcfg_lock);
+#define pci_mmcfg_lock_held() lock_is_held(&(pci_mmcfg_lock).dep_map)
 
 LIST_HEAD(pci_mmcfg_list);
 
@@ -54,7 +55,7 @@ static void list_add_sorted(struct pci_mmcfg_region *new)
struct pci_mmcfg_region *cfg;
 
/* keep list sorted by segment and starting bus number */
-   list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list) {
+   list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held()) {
if (cfg->segment > new->segment ||
(cfg->segment == new->segment &&
 cfg->start_bus >= new->start_bus)) {
@@ -118,7 +119,7 @@ struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
 {
struct pci_mmcfg_region *cfg;
 
-   list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list)
+   list_for_each_entry_rcu(cfg, &pci_mmcfg_list, list, pci_mmcfg_lock_held())
if (cfg->segment == segment &&
cfg->start_bus <= bus && bus <= cfg->end_bus)
return cfg;
-- 
2.22.0.rc1.311.g5d7573a151-goog



[RFC 1/6] rcu: Add support for consolidated-RCU reader checking

2019-06-01 Thread Joel Fernandes (Google)
This patch adds support for checking RCU reader sections in list
traversal macros. Optionally, if the list macro is called under SRCU or
other lock/mutex protection, then appropriate lockdep expressions can be
passed to make the checks pass.

Existing list_for_each_entry_rcu() invocations don't need to pass the
optional fourth argument (cond) unless they are under some non-RCU
protection and need to make the lockdep check pass.
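
Since the kernel/rcu/update.c hunk is cut off at the end of this page, here is
a rough sketch of the shape such a helper can take when lockdep is enabled (a
simplification for illustration, not necessarily the exact code in the patch):

        int rcu_read_lock_any_held(void)
        {
                /*
                 * With the consolidated flavors, any of these implies an
                 * active reader section.  Lock-protected callers are handled
                 * by the optional lockdep expression passed to the list
                 * macro instead.
                 */
                return rcu_read_lock_held() ||
                       rcu_read_lock_bh_held() ||
                       rcu_read_lock_sched_held();
        }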

Signed-off-by: Joel Fernandes (Google) 
---
 include/linux/rculist.h  | 40 
 include/linux/rcupdate.h |  7 +++
 kernel/rcu/update.c  | 26 ++
 3 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e91ec9ddcd30..b641fdd9f1a2 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -40,6 +40,25 @@ static inline void INIT_LIST_HEAD_RCU(struct list_head *list)
  */
 #define list_next_rcu(list)(*((struct list_head __rcu **)(&(list)->next)))
 
+/*
+ * Check during list traversal that we are within an RCU reader
+ */
+#define __list_check_rcu() \
+   RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), \
+"RCU-list traversed in non-reader section!")
+
+static inline void __list_check_rcu_cond(int dummy, ...)
+{
+   va_list ap;
+   int cond;
+
+   va_start(ap, dummy);
+   cond = va_arg(ap, int);
+   va_end(ap);
+
+   RCU_LOCKDEP_WARN(!cond && !rcu_read_lock_any_held(),
+"RCU-list traversed in non-reader section!");
+}
 /*
  * Insert a new entry between two known consecutive entries.
  *
@@ -338,6 +357,9 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  member) : NULL; \
 })
 
+#define SIXTH_ARG(a1, a2, a3, a4, a5, a6, ...) a6
+#define COUNT_VARGS(...) SIXTH_ARG(dummy, ## __VA_ARGS__, 4, 3, 2, 1, 0)
+
 /**
  * list_for_each_entry_rcu -   iterate over rcu list of given type
  * @pos:   the type * to use as a loop cursor.
@@ -348,9 +370,14 @@ static inline void list_splice_tail_init_rcu(struct list_head *list,
  * the _rcu list-mutation primitives such as list_add_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define list_for_each_entry_rcu(pos, head, member) \
-   for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
-   &pos->member != (head); \
+#define list_for_each_entry_rcu(pos, head, member, cond...)\
+   if (COUNT_VARGS(cond) != 0) {   \
+   __list_check_rcu_cond(0, ## cond);  \
+   } else {\
+   __list_check_rcu(); \
+   }   \
+   for (pos = list_entry_rcu((head)->next, typeof(*pos), member);  \
+   &pos->member != (head); \
pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 /**
@@ -621,7 +648,12 @@ static inline void hlist_add_behind_rcu(struct hlist_node *n,
  * the _rcu list-mutation primitives such as hlist_add_head_rcu()
  * as long as the traversal is guarded by rcu_read_lock().
  */
-#define hlist_for_each_entry_rcu(pos, head, member)\
+#define hlist_for_each_entry_rcu(pos, head, member, cond...)   \
+   if (COUNT_VARGS(cond) != 0) {   \
+   __list_check_rcu_cond(0, ## cond);  \
+   } else {\
+   __list_check_rcu(); \
+   }   \
for (pos = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),\
typeof(*(pos)), member);\
pos;\
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 922bb6848813..712b464ab960 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -223,6 +223,7 @@ int debug_lockdep_rcu_enabled(void);
 int rcu_read_lock_held(void);
 int rcu_read_lock_bh_held(void);
 int rcu_read_lock_sched_held(void);
+int rcu_read_lock_any_held(void);
 
 #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
@@ -243,6 +244,12 @@ static inline int rcu_read_lock_sched_held(void)
 {
return !preemptible();
 }
+
+static inline int rcu_read_lock_any_held(void)
+{
+   return !preemptible();
+}
+
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 #ifdef CONFIG_PROVE_RCU
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index c3bf44ba42e5..9cb30006a5e1 100644
--- a/kernel/rcu/up
