Re: [PATCH RESEND][next] sctp: Fix fall-through warnings for Clang

2021-04-20 Thread Marcelo Ricardo Leitner
On Tue, Apr 20, 2021 at 03:09:24PM -0500, Gustavo A. R. Silva wrote:
> Hi all,
> 
> Friendly ping: who can take this, please?

It would go via net/net-next tree, but I can't find this one on
patchwork. Just the previous version.

http://patchwork.ozlabs.org/project/netdev/list/?series===*=sctp%3A+Fix+fall-through+warnings+for+Clang=both=

I can, however, find it in the archives.

https://lore.kernel.org/netdev/20210305090717.GA139387%40embeddedor/T/

Dave and Jakub will know better.

  Marcelo


[PATCH v6] hrtimer: avoid retrigger_next_event IPI

2021-04-19 Thread Marcelo Tosatti


Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME and CLOCK_TAI bases. If
that's not the case, update the realtime and TAI base offsets remotely and
skip the IPI. This ensures that any subsequently armed timers on
CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti 

---

v6:
  - Do not take softirq_raised into account (Peter Xu).
  - Include BOOTTIME as base that requires IPI (Thomas).
  - Unconditional reprogram on resume path, since there is
nothing to gain in such path anyway.

v5:
  - Add missing hrtimer_update_base (Peter Xu).

v4:
   - Drop unused code (Thomas).

v3:
   - Nicer changelog  (Thomas).
   - Code style fixes (Thomas).
   - Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
   - Shrink preemption disabled section (Thomas).

v2:
   - Only REALTIME and TAI bases are affected by offset-to-monotonic changes 
(Thomas).
   - Don't special case nohz_full CPUs (Thomas).


diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bb5e7b0a4274..14a6e449b221 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -318,7 +318,7 @@ struct clock_event_device;
 
 extern void hrtimer_interrupt(struct clock_event_device *dev);
 
-extern void clock_was_set_delayed(void);
+extern void clock_was_set_delayed(bool force_reprogram);
 
 extern unsigned int hrtimer_resolution;
 
@@ -326,7 +326,7 @@ extern unsigned int hrtimer_resolution;
 
 #define hrtimer_resolution (unsigned int)LOW_RES_NSEC
 
-static inline void clock_was_set_delayed(void) { }
+static inline void clock_was_set_delayed(bool force_reprogram) { }
 
 #endif
 
@@ -351,7 +351,7 @@ hrtimer_expires_remaining_adjusted(const struct hrtimer 
*timer)
timer->base->get_time());
 }
 
-extern void clock_was_set(void);
+extern void clock_was_set(bool);
 #ifdef CONFIG_TIMERFD
 extern void timerfd_clock_was_set(void);
 #else
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..2258782fd714 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -758,9 +758,17 @@ static void hrtimer_switch_to_hres(void)
retrigger_next_event(NULL);
 }
 
+static void clock_was_set_force_reprogram_work(struct work_struct *work)
+{
+   clock_was_set(true);
+}
+
+static DECLARE_WORK(hrtimer_force_reprogram_work, clock_was_set_force_reprogram_work);
+
+
 static void clock_was_set_work(struct work_struct *work)
 {
-   clock_was_set();
+   clock_was_set(false);
 }
 
 static DECLARE_WORK(hrtimer_work, clock_was_set_work);
@@ -769,9 +777,12 @@ static DECLARE_WORK(hrtimer_work, clock_was_set_work);
  * Called from timekeeping and resume code to reprogram the hrtimer
  * interrupt device on all cpus.
  */
-void clock_was_set_delayed(void)
+void clock_was_set_delayed(bool force_reprogram)
 {
-   schedule_work(&hrtimer_work);
+   if (force_reprogram)
+   schedule_work(&hrtimer_force_reprogram_work);
+   else
+   schedule_work(&hrtimer_work);
 }
 
 #else
@@ -871,6 +882,18 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) |   \
+(1U << HRTIMER_BASE_REALTIME_SOFT) |   \
+(1U << HRTIMER_BASE_TAI) | \
+(1U << HRTIMER_BASE_TAI_SOFT) |\
+(1U << HRTIMER_BASE_BOOTTIME) |\
+(1U << HRTIMER_BASE_BOOTTIME_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
 /*
  * Clock realtime was set
  *
@@ -882,11 +905,42 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
  * resolution timer interrupts. On UP we just disable interrupts and
  * call the high resolution interrupt code.
  */
-void clock_was_set(void)
+void clock_was_set(bool force_reprogram)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (force_reprogram == true) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting CPUs if possible */
+   cpus_read_lock();
+   for_each_online_cpu(cpu) {
+   unsigned long flags;
+  

Re: [PATCH v5] hrtimer: avoid retrigger_next_event IPI

2021-04-19 Thread Marcelo Tosatti
On Sat, Apr 17, 2021 at 06:51:08PM +0200, Thomas Gleixner wrote:
> On Sat, Apr 17 2021 at 18:24, Thomas Gleixner wrote:
> > On Fri, Apr 16 2021 at 13:13, Peter Xu wrote:
> >> On Fri, Apr 16, 2021 at 01:00:23PM -0300, Marcelo Tosatti wrote:
> >>>  
> >>> +#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) | \
> >>> +  (1U << HRTIMER_BASE_REALTIME_SOFT) |   \
> >>> +  (1U << HRTIMER_BASE_TAI) | \
> >>> +  (1U << HRTIMER_BASE_TAI_SOFT))
> >>> +
> >>> +static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
> >>> +{
> >>> + if (cpu_base->softirq_activated)
> >>> + return true;
> >>
> >> A pure question on whether this check is needed...
> >>
> >> Here even if softirq_activated==1 (as softirq is going to happen), as long 
> >> as
> >> (cpu_base->active_bases & CLOCK_SET_BASES)==0, shouldn't it already mean 
> >> that
> >> "yes indeed clock was set, but no need to kick this cpu as no relevant 
> >> timer"?
> >> As that question seems to be orthogonal to whether a softirq is going to
> >> trigger on that cpu.
> >
> > That's correct and it's not any different from firing the IPI because in
> > both cases the update happens with the base lock of the CPU in question
> > held. And if there are no active timers in any of the affected bases,
> > then there is no need to reevaluate the next expiry because the offset
> > update does not affect any armed timers. It just makes sure that the
> > next enqueue of a timer on such a base will see the correct offset.
> >
> > I'll just zap it.
> 
> But the whole thing is still wrong in two aspects:
> 
> 1) BOOTTIME can be one of the affected clocks when sleep time
>(suspended time) is injected because that uses the same mechanism.
> 
>Sorry for missing that earlier when I asked to remove it, but
>that's trivial to fix by adding the BOOTTIME base back.
> 
> 2) What's worse is that on resume this might break because that
>mechanism is also used to enforce the reprogramming of the clock
>event devices and there we cannot be selective on clock bases.
> 
>I need to dig deeper into that because suspend/resume has changed
>a lot over time, so this might be just a historical leftover. But
>without proper analysis we might end up with subtle and hard to
>debug wreckage.
> 
> Thanks,
> 
> tglx

Thomas,

There is no gain in avoiding the IPIs for the suspend/resume case 
(since suspending is a large interruption anyway). To avoid 
the potential complexity (and associated bugs), one option would 
be to NOT skip IPIs for the resume case.

Sending -v6 with that (and other suggestions/fixes).



Re: [PATCH][next] sctp: Fix out-of-bounds warning in sctp_process_asconf_param()

2021-04-16 Thread Marcelo Ricardo Leitner
On Fri, Apr 16, 2021 at 02:12:36PM -0500, Gustavo A. R. Silva wrote:
> Fix the following out-of-bounds warning:
> 
> net/sctp/sm_make_chunk.c:3150:4: warning: 'memcpy' offset [17, 28] from the 
> object at 'addr' is out of the bounds of referenced subobject 'v4' with type 
> 'struct sockaddr_in' at offset 0 [-Warray-bounds]
> 
> This helps with the ongoing efforts to globally enable -Warray-bounds
> and get us closer to being able to tighten the FORTIFY_SOURCE routines
> on memcpy().
> 
> Link: https://github.com/KSPP/linux/issues/109
> Reported-by: kernel test robot 
> Signed-off-by: Gustavo A. R. Silva 

Acked-by: Marcelo Ricardo Leitner 
Thanks.
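
For readers hitting this class of warning for the first time, here is a minimal standalone illustration (plain C with made-up types; this is not the sctp code) of what -Warray-bounds objects to, and how copying through the enclosing object avoids it:

#include <string.h>

struct v4_addr { char bytes[16]; };     /* stand-in for struct sockaddr_in (16 bytes) */

union any_addr {                        /* stand-in for the larger enclosing address union */
        struct v4_addr v4;
        char raw[28];
};

void copy_addr(union any_addr *dst, const union any_addr *src)
{
        /*
         * memcpy(&dst->v4, &src->v4, sizeof(src->raw)) would copy 28 bytes
         * "from" a 16-byte subobject, which -Warray-bounds flags as an
         * out-of-bounds access (the offsets 17..28 in the warning above).
         *
         * Copying through the union itself stays within bounds:
         */
        memcpy(dst, src, sizeof(src->raw));
}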


[PATCH v5] hrtimer: avoid retrigger_next_event IPI

2021-04-16 Thread Marcelo Tosatti


Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME and CLOCK_TAI bases. If
that's not the case, update the realtime and TAI base offsets remotely and
skip the IPI. This ensures that any subsequently armed timers on
CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti 

---

v5:
  - Add missing hrtimer_update_base (Peter Xu).

v4:
   - Drop unused code (Thomas).

v3:
   - Nicer changelog  (Thomas).
   - Code style fixes (Thomas).
   - Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
   - Shrink preemption disabled section (Thomas).

v2:
   - Only REALTIME and TAI bases are affected by offset-to-monotonic changes 
(Thomas).
   - Don't special case nohz_full CPUs (Thomas).


diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..06fcc272e28d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,6 +871,19 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) |   \
+(1U << HRTIMER_BASE_REALTIME_SOFT) |   \
+(1U << HRTIMER_BASE_TAI) | \
+(1U << HRTIMER_BASE_TAI_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   if (cpu_base->softirq_activated)
+   return true;
+
+   return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
 /*
  * Clock realtime was set
  *
@@ -885,8 +898,34 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
 void clock_was_set(void)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting CPUs if possible */
+   cpus_read_lock();
+   for_each_online_cpu(cpu) {
+   unsigned long flags;
+   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+   if (need_reprogram_timer(cpu_base))
+   cpumask_set_cpu(cpu, mask);
+   else
+   hrtimer_update_base(cpu_base);
+   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+   }
+
+   preempt_disable();
+   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+   preempt_enable();
+   cpus_read_unlock();
+   free_cpumask_var(mask);
+set_timerfd:
 #endif
timerfd_clock_was_set();
 }



[PATCH v4] hrtimer: avoid retrigger_next_event IPI

2021-04-15 Thread Marcelo Tosatti
Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME and CLOCK_TAI bases. If
that's not the case, update the realtime and TAI base offsets remotely and
skip the IPI. This ensures that any subsequently armed timers on
CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti 

---

v4:
   - Drop unused code (Thomas).

v3:
   - Nicer changelog  (Thomas).
   - Code style fixes (Thomas).
   - Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
   - Shrink preemption disabled section (Thomas).

v2:
   - Only REALTIME and TAI bases are affected by offset-to-monotonic changes 
(Thomas).
   - Don't special case nohz_full CPUs (Thomas).

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..e228c0a0c98f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,6 +871,19 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) |   \
+(1U << HRTIMER_BASE_REALTIME_SOFT) |   \
+(1U << HRTIMER_BASE_TAI) | \
+(1U << HRTIMER_BASE_TAI_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   if (cpu_base->softirq_activated)
+   return true;
+
+   return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
 /*
  * Clock realtime was set
  *
@@ -885,8 +898,32 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
 void clock_was_set(void)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting CPUs if possible */
+   cpus_read_lock();
+   for_each_online_cpu(cpu) {
+   unsigned long flags;
+   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+   if (need_reprogram_timer(cpu_base))
+   cpumask_set_cpu(cpu, mask);
+   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+   }
+
+   preempt_disable();
+   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+   preempt_enable();
+   cpus_read_unlock();
+   free_cpumask_var(mask);
+set_timerfd:
 #endif
timerfd_clock_was_set();
 }



[PATCH v3] hrtimer: avoid retrigger_next_event IPI

2021-04-15 Thread Marcelo Tosatti


Setting the realtime clock triggers an IPI to all CPUs to reprogram
the clock event device.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Instead of sending an IPI unconditionally, check each per CPU hrtimer base
whether it has active timers in the CLOCK_REALTIME and CLOCK_TAI bases. If
that's not the case, update the realtime and TAI base offsets remotely and
skip the IPI. This ensures that any subsequently armed timers on
CLOCK_REALTIME and CLOCK_TAI are evaluated with the correct offsets.

Signed-off-by: Marcelo Tosatti 

---

v3:
   - Nicer changelog  (Thomas).
   - Code style fixes (Thomas).
   - Compilation warning with CONFIG_HIGH_RES_TIMERS=n (Thomas).
   - Shrink preemption disabled section (Thomas).

v2:
   - Only REALTIME and TAI bases are affected by offset-to-monotonic changes 
(Thomas).
   - Don't special case nohz_full CPUs (Thomas).

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..dd9c0d2f469f 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,6 +871,24 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME) |   \
+(1U << HRTIMER_BASE_REALTIME_SOFT) |   \
+(1U << HRTIMER_BASE_TAI) | \
+(1U << HRTIMER_BASE_TAI_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   unsigned int active = 0;
+
+   if (cpu_base->softirq_activated)
+   return true;
+
+   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+   active = active | (cpu_base->active_bases & HRTIMER_ACTIVE_HARD);
+
+   return (cpu_base->active_bases & CLOCK_SET_BASES) != 0;
+}
+
 /*
  * Clock realtime was set
  *
@@ -885,8 +903,32 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
 void clock_was_set(void)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting CPUs if possible */
+   cpus_read_lock();
+   for_each_online_cpu(cpu) {
+   unsigned long flags;
+   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+   if (need_reprogram_timer(cpu_base))
+   cpumask_set_cpu(cpu, mask);
+   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+   }
+
+   preempt_disable();
+   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+   preempt_enable();
+   cpus_read_unlock();
+   free_cpumask_var(mask);
+set_timerfd:
 #endif
timerfd_clock_was_set();
 }






Re: [PATCH v2] net/sctp: fix race condition in sctp_destroy_sock

2021-04-13 Thread Marcelo Ricardo Leitner
On Tue, Apr 13, 2021 at 09:10:31PM +0300, Or Cohen wrote:
> If sctp_destroy_sock is called without sock_net(sk)->sctp.addr_wq_lock
> held and sp->do_auto_asconf is true, then an element is removed
> from the auto_asconf_splist without any proper locking.
> 
> This can happen in the following functions:
> 1. In sctp_accept, if sctp_sock_migrate fails.
> 2. In inet_create or inet6_create, if there is a bpf program
>attached to BPF_CGROUP_INET_SOCK_CREATE which denies
>creation of the sctp socket.
> 
> The bug is fixed by acquiring addr_wq_lock in sctp_destroy_sock
> instead of sctp_close.
> 
> This addresses CVE-2021-23133.
> 
> Reported-by: Or Cohen 
> Reviewed-by: Xin Long 
> Fixes: 610236587600 ("bpf: Add new cgroup attach type to enable sock 
> modifications")
> Signed-off-by: Or Cohen 

Thanks folks.
Acked-by: Marcelo Ricardo Leitner 


[PATCH v2] hrtimer: avoid retrigger_next_event IPI

2021-04-13 Thread Marcelo Tosatti



Setting the realtime clock triggers an IPI to all CPUs to reprogram
hrtimers.

However, only realtime and TAI clocks have their offsets updated
(and therefore potentially require a reprogram).

Check whether each per-CPU hrtimer base only has monotonic active timers, and in that case
update the realtime and TAI base offsets remotely, skipping the IPI.

This reduces interruptions to latency sensitive applications.

Signed-off-by: Marcelo Tosatti 

---

v2:
   - Only REALTIME and TAI bases are affected by offset-to-monotonic changes 
(Thomas).
   - Don't special case nohz_full CPUs (Thomas).
   

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 5c9d968187ae..be21b85c679d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -871,6 +871,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME)|\
+(1U << HRTIMER_BASE_REALTIME_SOFT)|\
+(1U << HRTIMER_BASE_TAI)|  \
+(1U << HRTIMER_BASE_TAI_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   unsigned int active = 0;
+
+   if (cpu_base->softirq_activated)
+   return true;
+
+   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+
+   active = active | (cpu_base->active_bases & HRTIMER_ACTIVE_HARD);
+
+   if ((active & CLOCK_SET_BASES) == 0)
+   return false;
+
+   return true;
+}
+
 /*
  * Clock realtime was set
  *
@@ -885,9 +907,31 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
 void clock_was_set(void)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting CPUs if possible */
+   preempt_disable();
+   for_each_online_cpu(cpu) {
+   unsigned long flags;
+   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+   if (need_reprogram_timer(cpu_base))
+   cpumask_set_cpu(cpu, mask);
+   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+   }
+
+   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+   preempt_enable();
+   free_cpumask_var(mask);
 #endif
+set_timerfd:
timerfd_clock_was_set();
 }



Re: [PATCH] net/sctp: fix race condition in sctp_destroy_sock

2021-04-13 Thread Marcelo Ricardo Leitner
On Tue, Apr 13, 2021 at 12:31:53PM +0300, Or Cohen wrote:
> +++ b/net/sctp/socket.c
> @@ -1520,11 +1520,9 @@ static void sctp_close(struct sock *sk, long timeout)
>  
>   /* Supposedly, no process has access to the socket, but
>* the net layers still may.
> -  * Also, sctp_destroy_sock() needs to be called with addr_wq_lock
> -  * held and that should be grabbed before socket lock.
>*/

Please also update the following comment in sctp_init_sock():
  /* Nothing can fail after this block, otherwise
   * sctp_destroy_sock() will be called without addr_wq_lock held
   */

Other than this, LGTM. Thanks.

  Marcelo


Re: [PATCH] hrtimer: avoid retrigger_next_event IPI

2021-04-09 Thread Marcelo Tosatti


+CC Anna-Maria.

On Fri, Apr 09, 2021 at 04:15:13PM +0200, Thomas Gleixner wrote:
> On Wed, Apr 07 2021 at 10:53, Marcelo Tosatti wrote:
> > Setting the realtime clock triggers an IPI to all CPUs to reprogram
> > hrtimers.
> >
> > However, only base, boottime and tai clocks have their offsets updated
> 
> base clock? 

Heh...

> And why boottime? Boottime is not affected by a clock
> realtime set. It's clock REALTIME and TAI, nothing else.

OK!

> > +#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME)|\
> > +(1U << HRTIMER_BASE_REALTIME_SOFT)|\
> > +(1U << HRTIMER_BASE_BOOTTIME)| \
> > +(1U << HRTIMER_BASE_BOOTTIME_SOFT)|\
> > +(1U << HRTIMER_BASE_TAI)|  \
> > +(1U << HRTIMER_BASE_TAI_SOFT))
> > +
> > +static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
> > +{
> > +   unsigned int active = 0;
> > +
> > +   if (!cpu_base->softirq_activated)
> > +   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;

Again, if (cpu_base->softirq_activated), need to IPI (will resend).

> > +   active = active | (cpu_base->active_bases & HRTIMER_ACTIVE_HARD);
> > +
> > +   if ((active & CLOCK_SET_BASES) == 0)
> > +   return false;
> > +
> > +   return true;
> > +}
> 
> Errm. 

What?

> > +   /* Avoid interrupting nohz_full CPUs if possible */
> > +   preempt_disable();
> > +   for_each_online_cpu(cpu) {
> > +   if (tick_nohz_full_cpu(cpu)) {
> > +   unsigned long flags;
> > +   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
> > +
> > +   raw_spin_lock_irqsave(&cpu_base->lock, flags);
> > +   if (need_reprogram_timer(cpu_base))
> > +   cpumask_set_cpu(cpu, mask);
> > +   else
> > +   hrtimer_update_base(cpu_base);
> > +   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
> > +   }
> > +   }
> 
> How is that supposed to be correct?
> 
> CPU0                                  CPU1
>
> clock_was_set()                       hrtimer_start(CLOCK_REALTIME)
>
>   if (!active_mask[CPU1] & XXX)
>      continue;
>                                       active_mask |= REALTIME;
> 
> ---> fail because that newly started timer is on the old offset.

CPU0                                    CPU1

clock_was_set()
                                        Case-1: CPU-1 grabs base->lock before CPU-0:
                                        CPU-0 sees active_mask[CPU1] and IPIs.

                                        base = lock_hrtimer_base(timer, &flags);
                                        if (__hrtimer_start_range_ns(timer, tim, ...
                                                hrtimer_reprogram(timer, true);
                                        unlock_hrtimer_base(timer, &flags);

raw_spin_lock_irqsave(&cpu_base->lock, flags);
if (need_reprogram_timer(cpu_base))
        cpumask_set_cpu(cpu, mask);
else
        hrtimer_update_base(cpu_base);
raw_spin_unlock_irqrestore(&cpu_base->lock, flags);

                                        Case-2: CPU-1 grabs base->lock after CPU-0:
                                        CPU-0 will have updated the offsets remotely.

                                        base = lock_hrtimer_base(timer, &flags);
                                        if (__hrtimer_start_range_ns(timer, tim, ...
                                                hrtimer_reprogram(timer, true);
                                        unlock_hrtimer_base(timer, &flags);


No?



Re: [PATCH] hrtimer: avoid retrigger_next_event IPI

2021-04-08 Thread Marcelo Tosatti
On Thu, Apr 08, 2021 at 12:14:57AM +0200, Frederic Weisbecker wrote:
> On Wed, Apr 07, 2021 at 10:53:01AM -0300, Marcelo Tosatti wrote:
> > 
> > Setting the realtime clock triggers an IPI to all CPUs to reprogram
> > hrtimers.
> > 
> > However, only base, boottime and tai clocks have their offsets updated
> > (and therefore potentially require a reprogram).
> > 
> > If the CPU is a nohz_full one, check if it only has 
> > monotonic active timers, and in that case update the 
> > realtime base offsets, skipping the IPI.
> > 
> > This reduces interruptions to nohz_full CPUs.
> > 
> > Signed-off-by: Marcelo Tosatti 
> > 
> > diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
> > index 743c852e10f2..b42b1a434b22 100644
> > --- a/kernel/time/hrtimer.c
> > +++ b/kernel/time/hrtimer.c
> > @@ -853,6 +853,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, 
> > bool reprogram)
> > tick_program_event(expires, 1);
> >  }
> >  
> > +#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME)|\
> > +(1U << HRTIMER_BASE_REALTIME_SOFT)|\
> > +(1U << HRTIMER_BASE_BOOTTIME)| \
> > +(1U << HRTIMER_BASE_BOOTTIME_SOFT)|\
> > +(1U << HRTIMER_BASE_TAI)|  \
> > +(1U << HRTIMER_BASE_TAI_SOFT))
> > +
> > +static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
> > +{
> > +   unsigned int active = 0;
> > +
> > +   if (!cpu_base->softirq_activated)
> > +   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;

If cpu_base->softirq_activated == 1, should IPI as well.

> > +   active = active | (cpu_base->active_bases & HRTIMER_ACTIVE_HARD);
> > +
> > +   if ((active & CLOCK_SET_BASES) == 0)
> > +   return false;
> > +
> > +   return true;
> > +}
> > +
> >  /*
> >   * Clock realtime was set
> >   *
> > @@ -867,9 +889,41 @@ static void hrtimer_reprogram(struct hrtimer *timer, 
> > bool reprogram)
> >  void clock_was_set(void)
> >  {
> >  #ifdef CONFIG_HIGH_RES_TIMERS
> > -   /* Retrigger the CPU local events everywhere */
> > -   on_each_cpu(retrigger_next_event, NULL, 1);
> > +   cpumask_var_t mask;
> > +   int cpu;
> > +
> > +   if (!tick_nohz_full_enabled()) {
> > +   /* Retrigger the CPU local events everywhere */
> > +   on_each_cpu(retrigger_next_event, NULL, 1);
> > +   goto set_timerfd;
> > +   }
> > +
> > +   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
> > +   on_each_cpu(retrigger_next_event, NULL, 1);
> > +   goto set_timerfd;
> > +   }
> > +
> > +   /* Avoid interrupting nohz_full CPUs if possible */
> > +   preempt_disable();
> > +   for_each_online_cpu(cpu) {
> > +   if (tick_nohz_full_cpu(cpu)) {
> > +   unsigned long flags;
> > +   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
> > +
> > +   raw_spin_lock_irqsave(&cpu_base->lock, flags);
> > +   if (need_reprogram_timer(cpu_base))
> > +   cpumask_set_cpu(cpu, mask);
> > +   else
> > +   hrtimer_update_base(cpu_base);
> > +   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
> > +   }
> 
> You forgot to add the housekeeping CPUs to the mask.

So people are using:

console=tty0 console=ttyS0,115200n8 skew_tick=1 nohz=on rcu_nocbs=8-31 
tuned.non_isolcpus=00ff intel_pstate=disable nosoftlockup tsc=nowatchdog 
intel_iommu=on iommu=pt isolcpus=managed_irq,8-31 
systemd.cpu_affinity=0,1,2,3,4,5,6,7 default_hugepagesz=1G hugepagesz=2M 
hugepages=128 nohz_full=8-31

And using the nohz_full= CPUs (or subsets of nohz_full= CPUs) in two modes:

Either "generic non-isolated applications" 
(with load-balancing enabled for those CPUs), or for 
latency sensitive applications. And switching between the modes.

In this case, it would only be possible to check for
housekeeping CPUs of type MANAGED_IRQ, which would be strange.

> As for the need_reprogram_timer() trick, I'll rather defer to Thomas review...
> 
> Thanks.

Thanks!

> 
> > +   }
> > +
> > +   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
> > +   preempt_enable();
> > +   free_cpumask_var(mask);
> >  #endif
> > +set_timerfd:
> > timerfd_clock_was_set();
> >  }
> >  
> > 



Re: [PATCH 1/2] KVM: x86: reduce pvclock_gtod_sync_lock critical sections

2021-04-08 Thread Marcelo Tosatti
Hi Paolo,

On Thu, Apr 08, 2021 at 10:15:16AM +0200, Paolo Bonzini wrote:
> On 07/04/21 19:40, Marcelo Tosatti wrote:
> > > diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> > > index fe806e894212..0a83eff40b43 100644
> > > --- a/arch/x86/kvm/x86.c
> > > +++ b/arch/x86/kvm/x86.c
> > > @@ -2562,10 +2562,12 @@ static void kvm_gen_update_masterclock(struct kvm 
> > > *kvm)
> > >   kvm_hv_invalidate_tsc_page(kvm);
> > > - spin_lock(&kvm->pvclock_gtod_sync_lock);
> > >   kvm_make_mclock_inprogress_request(kvm);
> > > +
> > Might be good to serialize against two kvm_gen_update_masterclock
> > callers? Otherwise one caller could clear KVM_REQ_MCLOCK_INPROGRESS,
> > while the other is still at pvclock_update_vm_gtod_copy().
> 
> Makes sense, but this stuff has always seemed unnecessarily complicated to
> me.
>
> KVM_REQ_MCLOCK_INPROGRESS is only needed to kick running vCPUs out of the
> execution loop; 

We do not want vcpus with different system_timestamp/tsc_timestamp
pair:

 * To avoid that problem, do not allow visibility of distinct
 * system_timestamp/tsc_timestamp values simultaneously: use a master
 * copy of host monotonic time values. Update that master copy
 * in lockstep.

So KVM_REQ_MCLOCK_INPROGRESS also ensures that no vcpu enters 
guest mode (via vcpu->requests check before VM-entry) with a 
different system_timestamp/tsc_timestamp pair.
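
As a rough standalone model of that (C11 atomics, simplified names; this is not the actual KVM code), the request bit acts as a gate that every vcpu checks before (re)entering guest mode:

#include <stdatomic.h>
#include <stdbool.h>

#define REQ_MCLOCK_INPROGRESS  (1u << 0)

struct vcpu {
        atomic_uint requests;   /* models vcpu->requests */
};

/* Models the vcpu->requests check done before VM-entry. */
static bool vcpu_can_enter_guest(struct vcpu *v)
{
        return !(atomic_load(&v->requests) & REQ_MCLOCK_INPROGRESS);
}

/* Models the masterclock update: kick every vcpu out, update the master
 * copy, then let them back in so they all see the same
 * system_timestamp/tsc_timestamp pair. */
static void update_masterclock(struct vcpu *vcpus, int nr_vcpus)
{
        for (int i = 0; i < nr_vcpus; i++)
                atomic_fetch_or(&vcpus[i].requests, REQ_MCLOCK_INPROGRESS);

        /* ... update the master copy of host monotonic time here ... */

        for (int i = 0; i < nr_vcpus; i++)
                atomic_fetch_and(&vcpus[i].requests, ~REQ_MCLOCK_INPROGRESS);
}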

> clearing it in kvm_gen_update_masterclock is unnecessary,
> because KVM_REQ_CLOCK_UPDATE takes pvclock_gtod_sync_lock too and thus will
> already wait for pvclock_update_vm_gtod_copy to end.
> 
> I think it's possible to use a seqcount in KVM_REQ_CLOCK_UPDATE instead of
> KVM_REQ_MCLOCK_INPROGRESS.  Both cause the vCPUs to spin. I'll take a look.
> 
> Paolo



Re: [PATCH 1/2] KVM: x86: reduce pvclock_gtod_sync_lock critical sections

2021-04-07 Thread Marcelo Tosatti
On Tue, Mar 30, 2021 at 12:59:57PM -0400, Paolo Bonzini wrote:
> There is no need to include changes to vcpu->requests into
> the pvclock_gtod_sync_lock critical section.  The changes to
> the shared data structures (in pvclock_update_vm_gtod_copy)
> already occur under the lock.
> 
> Cc: David Woodhouse 
> Cc: Marcelo Tosatti 
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/kvm/x86.c | 10 --
>  1 file changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index fe806e894212..0a83eff40b43 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2562,10 +2562,12 @@ static void kvm_gen_update_masterclock(struct kvm 
> *kvm)
>  
>   kvm_hv_invalidate_tsc_page(kvm);
>  
> - spin_lock(&kvm->pvclock_gtod_sync_lock);
>   kvm_make_mclock_inprogress_request(kvm);
> +

Might be good to serialize against two kvm_gen_update_masterclock
callers? Otherwise one caller could clear KVM_REQ_MCLOCK_INPROGRESS,
while the other is still at pvclock_update_vm_gtod_copy().

Otherwise, looks good.



[PATCH] hrtimer: avoid retrigger_next_event IPI

2021-04-07 Thread Marcelo Tosatti


Setting the realtime clock triggers an IPI to all CPUs to reprogram
hrtimers.

However, only base, boottime and tai clocks have their offsets updated
(and therefore potentially require a reprogram).

If the CPU is a nohz_full one, check if it only has 
monotonic active timers, and in that case update the 
realtime base offsets, skipping the IPI.

This reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti 

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 743c852e10f2..b42b1a434b22 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -853,6 +853,28 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
tick_program_event(expires, 1);
 }
 
+#define CLOCK_SET_BASES ((1U << HRTIMER_BASE_REALTIME)|\
+(1U << HRTIMER_BASE_REALTIME_SOFT)|\
+(1U << HRTIMER_BASE_BOOTTIME)| \
+(1U << HRTIMER_BASE_BOOTTIME_SOFT)|\
+(1U << HRTIMER_BASE_TAI)|  \
+(1U << HRTIMER_BASE_TAI_SOFT))
+
+static bool need_reprogram_timer(struct hrtimer_cpu_base *cpu_base)
+{
+   unsigned int active = 0;
+
+   if (!cpu_base->softirq_activated)
+   active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT;
+
+   active = active | (cpu_base->active_bases & HRTIMER_ACTIVE_HARD);
+
+   if ((active & CLOCK_SET_BASES) == 0)
+   return false;
+
+   return true;
+}
+
 /*
  * Clock realtime was set
  *
@@ -867,9 +889,41 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool 
reprogram)
 void clock_was_set(void)
 {
 #ifdef CONFIG_HIGH_RES_TIMERS
-   /* Retrigger the CPU local events everywhere */
-   on_each_cpu(retrigger_next_event, NULL, 1);
+   cpumask_var_t mask;
+   int cpu;
+
+   if (!tick_nohz_full_enabled()) {
+   /* Retrigger the CPU local events everywhere */
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
+   on_each_cpu(retrigger_next_event, NULL, 1);
+   goto set_timerfd;
+   }
+
+   /* Avoid interrupting nohz_full CPUs if possible */
+   preempt_disable();
+   for_each_online_cpu(cpu) {
+   if (tick_nohz_full_cpu(cpu)) {
+   unsigned long flags;
+   struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
+
+   raw_spin_lock_irqsave(&cpu_base->lock, flags);
+   if (need_reprogram_timer(cpu_base))
+   cpumask_set_cpu(cpu, mask);
+   else
+   hrtimer_update_base(cpu_base);
+   raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
+   }
+   }
+
+   smp_call_function_many(mask, retrigger_next_event, NULL, 1);
+   preempt_enable();
+   free_cpumask_var(mask);
 #endif
+set_timerfd:
timerfd_clock_was_set();
 }
 



Re: [PATCH AUTOSEL 5.10 09/33] net: correct sk_acceptq_is_full()

2021-04-05 Thread Marcelo Ricardo Leitner
On Mon, Mar 29, 2021 at 06:21:57PM -0400, Sasha Levin wrote:
> From: liuyacan 
> 
> [ Upstream commit f211ac154577ec9ccf07c15f18a6abf0d9bdb4ab ]
> 
> The "backlog" argument in listen() specifies
> the maximum length of pending connections,
> so the accept queue should be considered full
> if there are exactly "backlog" elements.

Hi Sasha. Can you please confirm that this one was dropped as well?
Thanks.
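
For reference, the off-by-one the quoted commit message describes presumably boils down to using ">=" rather than ">" in the full-queue check; a standalone toy version (not the kernel code):

#include <stdbool.h>
#include <stdio.h>

/* With listen(fd, backlog), the accept queue must be treated as full
 * once it already holds 'backlog' entries, hence ">=" and not ">". */
static bool acceptq_is_full(unsigned int queued, unsigned int backlog)
{
        return queued >= backlog;
}

int main(void)
{
        const unsigned int backlog = 3;

        for (unsigned int queued = 0; queued <= 4; queued++)
                printf("queued=%u full=%d\n", queued,
                       acceptq_is_full(queued, backlog));
        return 0;
}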


Re: [PATCH] iio: adc: ad7292: Modify the bool initialization assignment

2021-03-20 Thread Marcelo Schmitt
Okay, looks good to me.

Reviewed-by: Marcelo Schmitt 

On 03/19, Guoqing chi wrote:
> From: Guoqing Chi 
> 
> A bool initializer is best assigned to false rather than 0.
> 
> Signed-off-by: Guoqing Chi 
> ---
>  drivers/iio/adc/ad7292.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/iio/adc/ad7292.c b/drivers/iio/adc/ad7292.c
> index 70e33dd1c9f7..3271a31afde1 100644
> --- a/drivers/iio/adc/ad7292.c
> +++ b/drivers/iio/adc/ad7292.c
> @@ -260,7 +260,7 @@ static int ad7292_probe(struct spi_device *spi)
>   struct ad7292_state *st;
>   struct iio_dev *indio_dev;
>   struct device_node *child;
> - bool diff_channels = 0;
> + bool diff_channels = false;
>   int ret;
>  
>   indio_dev = devm_iio_device_alloc(&spi->dev, sizeof(*st));
> -- 
> 2.17.1
> 
> 


Re: [patch 2/3] nohz: change signal tick dependency to wakeup CPUs of member tasks

2021-02-12 Thread Marcelo Tosatti
On Fri, Feb 12, 2021 at 01:25:21PM +0100, Frederic Weisbecker wrote:
> On Thu, Jan 28, 2021 at 05:21:36PM -0300, Marcelo Tosatti wrote:
> > Rather than waking up all nohz_full CPUs on the system, only wakeup 
> > the target CPUs of member threads of the signal.
> > 
> > Reduces interruptions to nohz_full CPUs.
> > 
> > Signed-off-by: Marcelo Tosatti 
> > 
> > Index: linux-2.6/kernel/time/tick-sched.c
> > ===
> > --- linux-2.6.orig/kernel/time/tick-sched.c
> > +++ linux-2.6/kernel/time/tick-sched.c
> > @@ -444,9 +444,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
> >   * Set a per-taskgroup tick dependency. Posix CPU timers need this in 
> > order to elapse
> >   * per process timers.
> >   */
> > -void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits
> > bit)
> 
> Why not keeping the signal struct as a parameter?
> 
> Thanks.

All callers use "struct signal_struct *sig = tsk->signal" as
signal parameter anyway...

Can change parameters to (task, signal, bit) if you prefer.



Re: [PATCH 3/3] net: fix iteration for sctp transport seq_files

2021-02-05 Thread Marcelo Ricardo Leitner
On Fri, Feb 05, 2021 at 11:36:30AM +1100, NeilBrown wrote:
> The sctp transport seq_file iterators take a reference to the transport
> in the ->start and ->next functions and releases the reference in the
> ->show function.  The preferred handling for such resources is to
> release them in the subsequent ->next or ->stop function call.
> 
> Since Commit 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration
> code and interface") there is no guarantee that ->show will be called
> after ->next, so this function can now leak references.
> 
> So move the sctp_transport_put() call to ->next and ->stop.
> 
> Fixes: 1f4aace60b0e ("fs/seq_file.c: simplify seq_file iteration code and 
> interface")
> Reported-by: Xin Long 
> Signed-off-by: NeilBrown 

Acked-by: Marcelo Ricardo Leitner 


Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-02-04 Thread Marcelo Tosatti
On Thu, Feb 04, 2021 at 01:47:38PM -0500, Nitesh Narayan Lal wrote:
> 
> On 2/4/21 1:15 PM, Marcelo Tosatti wrote:
> > On Thu, Jan 28, 2021 at 09:01:37PM +0100, Thomas Gleixner wrote:
> >> On Thu, Jan 28 2021 at 13:59, Marcelo Tosatti wrote:
> >>>> The whole pile wants to be reverted. It's simply broken in several ways.
> >>> I was asking for your comments on interaction with CPU hotplug :-)
> >> Which I answered in a separate mail :)
> >>
> >>> So housekeeping_cpumask has multiple meanings. In this case:
> >> ...
> >>
> >>> So as long as the meaning of the flags are respected, seems
> >>> alright.
> >> Yes. Stuff like the managed interrupts preference for housekeeping CPUs
> >> when an affinity mask spans housekeeping and isolated is perfectly
> >> fine. It's well thought out and has no limitations.
> >>
> >>> Nitesh, is there anything preventing this from being fixed
> >>> in userspace ? (as Thomas suggested previously).
> >> Everything with is not managed can be steered by user space.
> > Yes, but it seems to be racy (that is, there is a window where the 
> > interrupt can be delivered to an isolated CPU).
> >
> > ethtool ->
> > xgbe_set_channels ->
> > xgbe_full_restart_dev ->
> > xgbe_alloc_memory ->
> > xgbe_alloc_channels ->
> > cpumask_local_spread
> >
> > Also ifconfig eth0 down / ifconfig eth0 up leads
> > to cpumask_local_spread.
> 
> There's always that possibility.

Then there is a window where isolation can be broken.

> We have to ensure that we move the IRQs by a tuned daemon or some other
> userspace script every time there is a net-dev change (eg. device comes up,
> creates VFs, etc).

Again, while that race window is open, an interrupt can be delivered to an isolated CPU.

> > How about adding a new flag for isolcpus instead?
> >
> 
> Do you mean a flag based on which we can switch the affinity mask to
> housekeeping for all the devices at the time of IRQ distribution?

Yes a new flag for isolcpus. HK_FLAG_IRQ_SPREAD or some better name.
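
To make the idea concrete, a standalone toy model of what cpumask_local_spread() would do with such a flag (HK_FLAG_IRQ_SPREAD is the hypothetical flag being proposed above and does not exist upstream; plain arrays stand in for cpumasks):

#include <stdio.h>

#define NR_CPUS 32

/* Stand-in for housekeeping_cpumask(HK_FLAG_IRQ_SPREAD):
 * CPUs allowed to receive spread-out IRQs/queues. */
static int irq_spread_allowed[NR_CPUS];

/* Toy version of cpumask_local_spread() for the NUMA_NO_NODE case:
 * return the i-th allowed CPU, wrapping around. */
static int local_spread(unsigned int i)
{
        int allowed = 0;

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                allowed += irq_spread_allowed[cpu];
        if (!allowed)
                return -1;

        i %= allowed;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                if (irq_spread_allowed[cpu] && i-- == 0)
                        return cpu;
        return -1;
}

int main(void)
{
        /* Example layout (arbitrary): CPUs 0-7 housekeeping, 8-31 isolated. */
        for (int cpu = 0; cpu < 8; cpu++)
                irq_spread_allowed[cpu] = 1;

        for (unsigned int queue = 0; queue < 12; queue++)
                printf("queue %u -> cpu %d\n", queue, local_spread(queue));
        return 0;
}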




Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-02-04 Thread Marcelo Tosatti
On Thu, Jan 28, 2021 at 09:01:37PM +0100, Thomas Gleixner wrote:
> On Thu, Jan 28 2021 at 13:59, Marcelo Tosatti wrote:
> >> The whole pile wants to be reverted. It's simply broken in several ways.
> >
> > I was asking for your comments on interaction with CPU hotplug :-)
> 
> Which I answered in a separate mail :)
> 
> > So housekeeping_cpumask has multiple meanings. In this case:
> 
> ...
> 
> > So as long as the meaning of the flags are respected, seems
> > alright.
> 
> Yes. Stuff like the managed interrupts preference for housekeeping CPUs
> when an affinity mask spans housekeeping and isolated is perfectly
> fine. It's well thought out and has no limitations.
> 
> > Nitesh, is there anything preventing this from being fixed
> > in userspace ? (as Thomas suggested previously).
> 
> Everything with is not managed can be steered by user space.

Yes, but it seems to be racy (that is, there is a window where the 
interrupt can be delivered to an isolated CPU).

ethtool ->
xgbe_set_channels ->
xgbe_full_restart_dev ->
xgbe_alloc_memory ->
xgbe_alloc_channels ->
cpumask_local_spread

Also ifconfig eth0 down / ifconfig eth0 up leads
to cpumask_local_spread.

How about adding a new flag for isolcpus instead?



Re: [EXT] Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-02-01 Thread Marcelo Tosatti
On Fri, Jan 29, 2021 at 07:41:27AM -0800, Alex Belits wrote:
> On 1/28/21 07:56, Thomas Gleixner wrote:
> > External Email
> > 
> > --
> > On Wed, Jan 27 2021 at 10:09, Marcelo Tosatti wrote:
> > > On Wed, Jan 27, 2021 at 12:36:30PM +, Robin Murphy wrote:
> > > > > > > /**
> > > > > > >  * cpumask_next - get the next cpu in a cpumask
> > > > > > > @@ -205,22 +206,27 @@ void __init 
> > > > > > > free_bootmem_cpumask_var(cpumask_var_t mask)
> > > > > > >  */
> > > > > > > unsigned int cpumask_local_spread(unsigned int i, int node)
> > > > > > > {
> > > > > > > - int cpu;
> > > > > > > + int cpu, hk_flags;
> > > > > > > + const struct cpumask *mask;
> > > > > > > + hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
> > > > > > > + mask = housekeeping_cpumask(hk_flags);
> > > > > > 
> > > > > > AFAICS, this generally resolves to something based on 
> > > > > > cpu_possible_mask
> > > > > > rather than cpu_online_mask as before, so could now potentially 
> > > > > > return an
> > > > > > offline CPU. Was that an intentional change?
> > > > > 
> > > > > Robin,
> > > > > 
> > > > > AFAICS online CPUs should be filtered.
> > > > 
> > > > Apologies if I'm being thick, but can you explain how? In the case of
> > > > isolation being disabled or compiled out, housekeeping_cpumask() is
> > > > literally just "return cpu_possible_mask;". If we then iterate over that
> > > > with for_each_cpu() and just return the i'th possible CPU (e.g. in the
> > > > NUMA_NO_NODE case), what guarantees that CPU is actually online?
> > > > 
> > > > Robin.
> > > 
> > > Nothing, but that was the situation before 
> > > 1abdfe706a579a702799fce465bceb9fb01d407c
> > > as well.
> > > 
> > > cpumask_local_spread() should probably be disabling CPU hotplug.
> > 
> > It can't unless all callers are from preemptible code.
> > 
> > Aside of that this whole frenzy to sprinkle housekeeping_cpumask() all
> > over the kernel is just wrong, really.
> > 
> > As I explained several times before there are very valid reasons for
> > having queues and interrupts on isolated CPUs. Just optimizing for the
> > usecases some people care about is not making anything better.
> 
> However making it mandatory for isolated CPUs to allow interrupts is not a
> good idea, either. Providing an environment free of disturbances is a valid
> goal, so we can't do something that will make it impossible to achieve. We
> know that both there is a great amount of demand for this feature and
> implementing it is doable, so cutting off the possibility of development in
> this direction would be bad.
> 
> Before there was housekeeping mask, I had to implement another, more
> cumbersome model that ended up being more intrusive than I wanted. That was
> one of the reasons why I have spent some time working on it in, please
> forgive me the pun, isolation.
> 
> I was relieved when housekeeping mask appeared, and I was able to remove a
> large chunk of code that distinguished between CPUs that "are there" and
> CPUs "available to run work". Housekeeping is supposed to define the set of
> CPUs that are intended to run work that is not specifically triggered by
> anything running on those CPUs. "CPUs that are there" are CPUs that are
> being maintained as a part of the system, so they are usable for running
> things on them.
> 
> My idea at the time was that we can separate this into two directions of
> development:
> 
> 1. Make sure that housekeeping mask applies to all kinds of work that
> appears on CPUs, so nothing random will end up running there. Because this
> is very much in line of what it does.

It's easier to specify "all members of a set" rather than having to specify each
individual member. Think of the set as a description of the types of
activities that should not have a given CPU as a target.

> 2. Rely on housekeeping mask to exclude anything not specifically intended
> to run on isolated CPUs, and concentrate efforts on making sure that things
> that are intended to [eventually] happen on those CPUs are handled properly
> -- in case of my recent proposals, delayed until synchronization even

[patch 3/3] nohz: tick_nohz_kick_task: only IPI if remote task is running

2021-01-28 Thread Marcelo Tosatti
If the task is not running, run_posix_cpu_timers has nothing
to elapse, so spare the IPI in that case.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/sched/core.c
===
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -9182,3 +9182,9 @@ void call_trace_sched_update_nr_running(
 {
 trace_sched_update_nr_running_tp(rq, count);
 }
+
+bool task_on_rq(struct task_struct *p)
+{
+   return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -232,6 +232,8 @@ extern void io_schedule_finish(int token
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+extern bool task_on_rq(struct task_struct *p);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -324,8 +324,6 @@ void tick_nohz_full_kick_cpu(int cpu)
 
 static void tick_nohz_kick_task(struct task_struct *tsk)
 {
-   int cpu = task_cpu(tsk);
-
/*
 * If the task concurrently migrates to another cpu,
 * we guarantee it sees the new tick dependency upon
@@ -340,6 +338,23 @@ static void tick_nohz_kick_task(struct t
 *   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
 *  LOAD p->tick_dep_mask   LOAD p->cpu
 */
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task is not running, run_posix_cpu_timers
+* has nothing to elapse, so the IPI can be spared in
+* that case.
+*
+* activate_task()  STORE p->tick_dep_mask
+* STORE p->task_on_rq
+* __schedule() (switch to task 'p')smp_mb() (atomic_fetch_or())
+* LOCK rq->lockLOAD p->task_on_rq
+* smp_mb__after_spin_lock()
+* tick_nohz_task_switch()
+*  LOAD p->tick_dep_mask
+*/
+   if (!task_on_rq(tsk))
+   return;
 
preempt_disable();
if (cpu_online(cpu))




[patch 2/3] nohz: change signal tick dependency to wakeup CPUs of member tasks

2021-01-28 Thread Marcelo Tosatti
Rather than waking up all nohz_full CPUs on the system, only wakeup 
the target CPUs of member threads of the signal.

Reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -444,9 +444,20 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
  * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to 
elapse
  * per process timers.
  */
-void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)
+void tick_nohz_dep_set_signal(struct task_struct *tsk,
+ enum tick_dep_bits bit)
 {
-   tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+   int prev;
+   struct signal_struct *sig = tsk->signal;
+
+   prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+   if (!prev) {
+   struct task_struct *t;
+
+   lockdep_assert_held(&tsk->sighand->siglock);
+   __for_each_thread(sig, t)
+   tick_nohz_kick_task(t);
+   }
 }
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)
Index: linux-2.6/include/linux/tick.h
===
--- linux-2.6.orig/include/linux/tick.h
+++ linux-2.6/include/linux/tick.h
@@ -207,7 +207,7 @@ extern void tick_nohz_dep_set_task(struc
   enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
 enum tick_dep_bits bit);
-extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
 enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
   enum tick_dep_bits bit);
@@ -252,11 +252,11 @@ static inline void tick_dep_clear_task(s
if (tick_nohz_full_enabled())
tick_nohz_dep_clear_task(tsk, bit);
 }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
   enum tick_dep_bits bit)
 {
if (tick_nohz_full_enabled())
-   tick_nohz_dep_set_signal(signal, bit);
+   tick_nohz_dep_set_signal(tsk, bit);
 }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit)
@@ -284,7 +284,7 @@ static inline void tick_dep_set_task(str
 enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_task(struct task_struct *tsk,
   enum tick_dep_bits bit) { }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
   enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit) { }
Index: linux-2.6/kernel/time/posix-cpu-timers.c
===
--- linux-2.6.orig/kernel/time/posix-cpu-timers.c
+++ linux-2.6/kernel/time/posix-cpu-timers.c
@@ -523,7 +523,7 @@ static void arm_timer(struct k_itimer *t
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
-   tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 /*
@@ -1358,7 +1358,7 @@ void set_process_cpu_timer(struct task_s
if (*newval < *nextevt)
*nextevt = *newval;
 
-   tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,




[patch 1/3] nohz: only wakeup a single target cpu when kicking a task

2021-01-28 Thread Marcelo Tosatti
When adding a tick dependency to a task, it's necessary to
wakeup the CPU where the task resides to reevaluate tick
dependencies on that CPU.

However the current code wakes up all nohz_full CPUs, which 
is unnecessary.

Switch to waking up a single CPU, by using ordering of writes
to task->cpu and task->tick_dep_mask.

From: Frederic Weisbecker 
Suggested-by: Peter Zijlstra 
Signed-off-by: Frederic Weisbecker 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -322,6 +322,31 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task concurrently migrates to another cpu,
+* we guarantee it sees the new tick dependency upon
+* schedule.
+*
+*
+* set_task_cpu(p, cpu);
+*   STORE p->cpu = @cpu
+* __schedule() (switch to task 'p')
+*   LOCK rq->lock
+*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
+*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
+*  LOAD p->tick_dep_mask   LOAD p->cpu
+*/
+
+   preempt_disable();
+   if (cpu_online(cpu))
+   tick_nohz_full_kick_cpu(cpu);
+   preempt_enable();
+}
+
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
@@ -404,19 +429,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cp
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
-   if (tsk == current) {
-   preempt_disable();
-   tick_nohz_full_kick();
-   preempt_enable();
-   } else {
-   /*
-* Some future tick_nohz_full_kick_task()
-* should optimize this.
-*/
-   tick_nohz_full_kick_all();
-   }
-   }
+   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+   tick_nohz_kick_task(tsk);
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 




[patch 0/3] nohz_full: only wakeup target CPUs when notifying new tick dependency (v5)

2021-01-28 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs
to not use POSIX timers (eg DPDK). This patch changes the notification
to only IPI the target CPUs where the task(s) whose tick dependencies
are being updated are executing.

This reduces interruptions to nohz_full= CPUs.

v5: actually replace superfluous rcu_read_lock with lockdep_assert
v4: only IPI if the remote task is on the remote runqueue (PeterZ/Frederic)
v3: replace superfluous rcu_read_lock with lockdep_assert (PeterZ)






[patch 0/3] nohz_full: only wakeup target CPUs when notifying new tick dependency (v4)

2021-01-28 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs
to not use POSIX timers (eg DPDK). This patch changes the notification
to only IPI the target CPUs where the task(s) whose tick dependencies
are being updated are executing.

This reduces interruptions to nohz_full= CPUs.

v4: only IPI if the remote task is on the remote runqueue (PeterZ/Frederic)
v3: replace superfluous rcu_read_lock with lockdep_assert (PeterZ)




[patch 1/3] nohz: only wakeup a single target cpu when kicking a task

2021-01-28 Thread Marcelo Tosatti
When adding a tick dependency to a task, it's necessary to
wakeup the CPU where the task resides to reevaluate tick
dependencies on that CPU.

However the current code wakes up all nohz_full CPUs, which 
is unnecessary.

Switch to waking up a single CPU, by using ordering of writes
to task->cpu and task->tick_dep_mask.

From: Frederic Weisbecker 
Suggested-by: Peter Zijlstra 
Signed-off-by: Frederic Weisbecker 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -322,6 +322,31 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task concurrently migrates to another cpu,
+* we guarantee it sees the new tick dependency upon
+* schedule.
+*
+*
+* set_task_cpu(p, cpu);
+*   STORE p->cpu = @cpu
+* __schedule() (switch to task 'p')
+*   LOCK rq->lock
+*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
+*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
+*  LOAD p->tick_dep_mask   LOAD p->cpu
+*/
+
+   preempt_disable();
+   if (cpu_online(cpu))
+   tick_nohz_full_kick_cpu(cpu);
+   preempt_enable();
+}
+
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
@@ -404,19 +429,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cp
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
-   if (tsk == current) {
-   preempt_disable();
-   tick_nohz_full_kick();
-   preempt_enable();
-   } else {
-   /*
-* Some future tick_nohz_full_kick_task()
-* should optimize this.
-*/
-   tick_nohz_full_kick_all();
-   }
-   }
+   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+   tick_nohz_kick_task(tsk);
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 




[patch 2/3] nohz: change signal tick dependency to wakeup CPUs of member tasks

2021-01-28 Thread Marcelo Tosatti
Rather than waking up all nohz_full CPUs on the system, only wakeup 
the target CPUs of member threads of the signal.

Reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -446,7 +446,17 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
  */
 void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)
 {
-   tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+   int prev;
+
+   prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+   if (!prev) {
+   struct task_struct *t;
+
+   rcu_read_lock();
+   __for_each_thread(sig, t)
+   tick_nohz_kick_task(t);
+   rcu_read_unlock();
+   }
 }
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)




[patch 3/3] nohz: tick_nohz_kick_task: only IPI if remote task is running

2021-01-28 Thread Marcelo Tosatti
If the task is not running, run_posix_cpu_timers has nothing
to elapse, so spare the IPI in that case.

Suggested-by: Peter Zijlstra 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/sched/core.c
===
--- linux-2.6.orig/kernel/sched/core.c
+++ linux-2.6/kernel/sched/core.c
@@ -9182,3 +9182,9 @@ void call_trace_sched_update_nr_running(
 {
 trace_sched_update_nr_running_tp(rq, count);
 }
+
+bool task_on_rq(struct task_struct *p)
+{
+   return p->on_rq == TASK_ON_RQ_QUEUED;
+}
+
Index: linux-2.6/include/linux/sched.h
===
--- linux-2.6.orig/include/linux/sched.h
+++ linux-2.6/include/linux/sched.h
@@ -232,6 +232,8 @@ extern void io_schedule_finish(int token
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+extern bool task_on_rq(struct task_struct *p);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -324,8 +324,6 @@ void tick_nohz_full_kick_cpu(int cpu)
 
 static void tick_nohz_kick_task(struct task_struct *tsk)
 {
-   int cpu = task_cpu(tsk);
-
/*
 * If the task concurrently migrates to another cpu,
 * we guarantee it sees the new tick dependency upon
@@ -340,6 +338,23 @@ static void tick_nohz_kick_task(struct t
 *   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
 *  LOAD p->tick_dep_mask   LOAD p->cpu
 */
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task is not running, run_posix_cpu_timers
+* has nothing to elapse, so we can spare the IPI
+* in that case.
+*
+* activate_task()                      STORE p->tick_dep_mask
+* STORE p->task_on_rq
+* __schedule() (switch to task 'p')    smp_mb() (atomic_fetch_or())
+* LOCK rq->lock                        LOAD p->task_on_rq
+* smp_mb__after_spin_lock()
+* tick_nohz_task_switch()
+*  LOAD p->tick_dep_mask
+*/
+   if (!task_on_rq(tsk))
+   return;
 
preempt_disable();
if (cpu_online(cpu))




Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-01-28 Thread Marcelo Tosatti
On Thu, Jan 28, 2021 at 04:56:07PM +0100, Thomas Gleixner wrote:
> On Wed, Jan 27 2021 at 10:09, Marcelo Tosatti wrote:
> > On Wed, Jan 27, 2021 at 12:36:30PM +, Robin Murphy wrote:
> >> > > >/**
> >> > > > * cpumask_next - get the next cpu in a cpumask
> >> > > > @@ -205,22 +206,27 @@ void __init 
> >> > > > free_bootmem_cpumask_var(cpumask_var_t mask)
> >> > > > */
> >> > > >unsigned int cpumask_local_spread(unsigned int i, int node)
> >> > > >{
> >> > > > -int cpu;
> >> > > > +int cpu, hk_flags;
> >> > > > +const struct cpumask *mask;
> >> > > > +hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
> >> > > > +mask = housekeeping_cpumask(hk_flags);
> >> > > 
> >> > > AFAICS, this generally resolves to something based on cpu_possible_mask
> >> > > rather than cpu_online_mask as before, so could now potentially return 
> >> > > an
> >> > > offline CPU. Was that an intentional change?
> >> > 
> >> > Robin,
> >> > 
> >> > AFAICS online CPUs should be filtered.
> >> 
> >> Apologies if I'm being thick, but can you explain how? In the case of
> >> isolation being disabled or compiled out, housekeeping_cpumask() is
> >> literally just "return cpu_possible_mask;". If we then iterate over that
> >> with for_each_cpu() and just return the i'th possible CPU (e.g. in the
> >> NUMA_NO_NODE case), what guarantees that CPU is actually online?
> >> 
> >> Robin.
> >
> > Nothing, but that was the situation before 
> > 1abdfe706a579a702799fce465bceb9fb01d407c
> > as well.
> >
> > cpumask_local_spread() should probably be disabling CPU hotplug.
> 
> It can't unless all callers are from preemtible code.
> 
> Aside of that this whole frenzy to sprinkle housekeeping_cpumask() all
> over the kernel is just wrong, really.
> 
> As I explained several times before there are very valid reasons for
> having queues and interrupts on isolated CPUs. Just optimizing for the
> usecases some people care about is not making anything better.

And that is right.



Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-01-28 Thread Marcelo Tosatti
On Thu, Jan 28, 2021 at 05:02:41PM +0100, Thomas Gleixner wrote:
> On Wed, Jan 27 2021 at 09:19, Marcelo Tosatti wrote:
> > On Wed, Jan 27, 2021 at 11:57:16AM +, Robin Murphy wrote:
> >> > +hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
> >> > +mask = housekeeping_cpumask(hk_flags);
> >> 
> >> AFAICS, this generally resolves to something based on cpu_possible_mask
> >> rather than cpu_online_mask as before, so could now potentially return an
> >> offline CPU. Was that an intentional change?
> >
> > Robin,
> >
> > AFAICS online CPUs should be filtered.
> 
> The whole pile wants to be reverted. It's simply broken in several ways.

I was asking for your comments on interaction with CPU hotplug :-)
Anyway...

So housekeeping_cpumask has multiple meanings. In this case:

HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ

 domain
   Isolate from the general SMP balancing and scheduling
   algorithms. Note that performing domain isolation this way
   is irreversible: it's not possible to bring back a CPU to
   the domains once isolated through isolcpus. It's strongly
   advised to use cpusets instead to disable scheduler load
   balancing through the "cpuset.sched_load_balance" file.
   It offers a much more flexible interface where CPUs can
   move in and out of an isolated set anytime.

   You can move a process onto or off an "isolated" CPU via
   the CPU affinity syscalls or cpuset.
   <cpu number> begins at 0 and the maximum value is
   "number of CPUs in system - 1".

 managed_irq

   Isolate from being targeted by managed interrupts
   which have an interrupt mask containing isolated
   CPUs. The affinity of managed interrupts is
   handled by the kernel and cannot be changed via
   the /proc/irq/* interfaces.

   This isolation is best effort and only effective
   if the automatically assigned interrupt mask of a
   device queue contains isolated and housekeeping
   CPUs. If housekeeping CPUs are online then such
   interrupts are directed to the housekeeping CPU
   so that IO submitted on the housekeeping CPU
   cannot disturb the isolated CPU.

   If a queue's affinity mask contains only isolated
   CPUs then this parameter has no effect on the
   interrupt routing decision, though interrupts are
   only delivered when tasks running on those
   isolated CPUs submit IO. IO submitted on
   housekeeping CPUs has no influence on those
   queues.

So as long as the meaning of the flags is respected, this seems
alright.

Nitesh, is there anything preventing this from being fixed
in userspace (as Thomas suggested previously)?





Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-01-27 Thread Marcelo Tosatti
On Wed, Jan 27, 2021 at 12:36:30PM +, Robin Murphy wrote:
> On 2021-01-27 12:19, Marcelo Tosatti wrote:
> > On Wed, Jan 27, 2021 at 11:57:16AM +, Robin Murphy wrote:
> > > Hi,
> > > 
> > > On 2020-06-25 23:34, Nitesh Narayan Lal wrote:
> > > > From: Alex Belits 
> > > > 
> > > > The current implementation of cpumask_local_spread() does not respect 
> > > > the
> > > > isolated CPUs, i.e., even if a CPU has been isolated for Real-Time task,
> > > > it will return it to the caller for pinning of its IRQ threads. Having
> > > > these unwanted IRQ threads on an isolated CPU adds up to a latency
> > > > overhead.
> > > > 
> > > > Restrict the CPUs that are returned for spreading IRQs only to the
> > > > available housekeeping CPUs.
> > > > 
> > > > Signed-off-by: Alex Belits 
> > > > Signed-off-by: Nitesh Narayan Lal 
> > > > ---
> > > >lib/cpumask.c | 16 +++-
> > > >1 file changed, 11 insertions(+), 5 deletions(-)
> > > > 
> > > > diff --git a/lib/cpumask.c b/lib/cpumask.c
> > > > index fb22fb266f93..85da6ab4fbb5 100644
> > > > --- a/lib/cpumask.c
> > > > +++ b/lib/cpumask.c
> > > > @@ -6,6 +6,7 @@
> > > >#include 
> > > >#include 
> > > >#include 
> > > > +#include 
> > > >/**
> > > > * cpumask_next - get the next cpu in a cpumask
> > > > @@ -205,22 +206,27 @@ void __init 
> > > > free_bootmem_cpumask_var(cpumask_var_t mask)
> > > > */
> > > >unsigned int cpumask_local_spread(unsigned int i, int node)
> > > >{
> > > > -   int cpu;
> > > > +   int cpu, hk_flags;
> > > > +   const struct cpumask *mask;
> > > > +   hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
> > > > +   mask = housekeeping_cpumask(hk_flags);
> > > 
> > > AFAICS, this generally resolves to something based on cpu_possible_mask
> > > rather than cpu_online_mask as before, so could now potentially return an
> > > offline CPU. Was that an intentional change?
> > 
> > Robin,
> > 
> > AFAICS online CPUs should be filtered.
> 
> Apologies if I'm being thick, but can you explain how? In the case of
> isolation being disabled or compiled out, housekeeping_cpumask() is
> literally just "return cpu_possible_mask;". If we then iterate over that
> with for_each_cpu() and just return the i'th possible CPU (e.g. in the
> NUMA_NO_NODE case), what guarantees that CPU is actually online?
> 
> Robin.

Nothing, but that was the situation before 
1abdfe706a579a702799fce465bceb9fb01d407c
as well.

cpumask_local_spread() should probably be disabling CPU hotplug.

Thomas?
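
For illustration only (not a proposed patch), the suggestion above could
look roughly like the sketch below: restrict the spread to housekeeping
CPUs that are also online, and hold off CPU hotplug while walking the
mask. All names here are made up, it ignores the NUMA-local preference
of the real cpumask_local_spread(), and it assumes all callers may
sleep, which is exactly what Thomas questions elsewhere in this thread.

/* Sketch: spread only over housekeeping CPUs that are online. */
static unsigned int local_spread_sketch(unsigned int i)
{
	const struct cpumask *hk = housekeeping_cpumask(HK_FLAG_DOMAIN |
							HK_FLAG_MANAGED_IRQ);
	cpumask_var_t mask;
	unsigned int cpu = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return cpumask_first(cpu_online_mask);

	get_online_cpus();
	cpumask_and(mask, hk, cpu_online_mask);
	if (cpumask_empty(mask))
		cpumask_copy(mask, cpu_online_mask);	/* always return something */

	i %= cpumask_weight(mask);
	for_each_cpu(cpu, mask)
		if (i-- == 0)
			break;
	put_online_cpus();

	free_cpumask_var(mask);
	return cpu;
}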

> 
> > > I was just looking at the current code since I had the rare presence of 
> > > mind
> > > to check if something suitable already existed before I start open-coding
> > > "any online CPU, but local node preferred" logic for handling IRQ affinity
> > > in a driver - cpumask_local_spread() appears to be almost what I want (if 
> > > a
> > > bit more heavyweight), if only it would actually guarantee an online CPU 
> > > as
> > > the kerneldoc claims :(
> > > 
> > > Robin.
> > > 
> > > > /* Wrap: we always want a cpu. */
> > > > -   i %= num_online_cpus();
> > > > +   i %= cpumask_weight(mask);
> > > > if (node == NUMA_NO_NODE) {
> > > > -   for_each_cpu(cpu, cpu_online_mask)
> > > > +   for_each_cpu(cpu, mask) {
> > > > if (i-- == 0)
> > > > return cpu;
> > > > +   }
> > > > } else {
> > > > /* NUMA first. */
> > > > -   for_each_cpu_and(cpu, cpumask_of_node(node), 
> > > > cpu_online_mask)
> > > > +   for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
> > > > if (i-- == 0)
> > > > return cpu;
> > > > +   }
> > > > -   for_each_cpu(cpu, cpu_online_mask) {
> > > > +   for_each_cpu(cpu, mask) {
> > > > /* Skip NUMA nodes, done above. */
> > > > if (cpumask_test_cpu(cpu, 
> > > > cpumask_of_node(node)))
> > > > continue;
> > > > 
> > 



Re: [Patch v4 1/3] lib: Restrict cpumask_local_spread to houskeeping CPUs

2021-01-27 Thread Marcelo Tosatti
On Wed, Jan 27, 2021 at 11:57:16AM +, Robin Murphy wrote:
> Hi,
> 
> On 2020-06-25 23:34, Nitesh Narayan Lal wrote:
> > From: Alex Belits 
> > 
> > The current implementation of cpumask_local_spread() does not respect the
> > isolated CPUs, i.e., even if a CPU has been isolated for Real-Time task,
> > it will return it to the caller for pinning of its IRQ threads. Having
> > these unwanted IRQ threads on an isolated CPU adds up to a latency
> > overhead.
> > 
> > Restrict the CPUs that are returned for spreading IRQs only to the
> > available housekeeping CPUs.
> > 
> > Signed-off-by: Alex Belits 
> > Signed-off-by: Nitesh Narayan Lal 
> > ---
> >   lib/cpumask.c | 16 +++-
> >   1 file changed, 11 insertions(+), 5 deletions(-)
> > 
> > diff --git a/lib/cpumask.c b/lib/cpumask.c
> > index fb22fb266f93..85da6ab4fbb5 100644
> > --- a/lib/cpumask.c
> > +++ b/lib/cpumask.c
> > @@ -6,6 +6,7 @@
> >   #include 
> >   #include 
> >   #include 
> > +#include 
> >   /**
> >* cpumask_next - get the next cpu in a cpumask
> > @@ -205,22 +206,27 @@ void __init free_bootmem_cpumask_var(cpumask_var_t 
> > mask)
> >*/
> >   unsigned int cpumask_local_spread(unsigned int i, int node)
> >   {
> > -   int cpu;
> > +   int cpu, hk_flags;
> > +   const struct cpumask *mask;
> > +   hk_flags = HK_FLAG_DOMAIN | HK_FLAG_MANAGED_IRQ;
> > +   mask = housekeeping_cpumask(hk_flags);
> 
> AFAICS, this generally resolves to something based on cpu_possible_mask
> rather than cpu_online_mask as before, so could now potentially return an
> offline CPU. Was that an intentional change?

Robin,

AFAICS online CPUs should be filtered.

> I was just looking at the current code since I had the rare presence of mind
> to check if something suitable already existed before I start open-coding
> "any online CPU, but local node preferred" logic for handling IRQ affinity
> in a driver - cpumask_local_spread() appears to be almost what I want (if a
> bit more heavyweight), if only it would actually guarantee an online CPU as
> the kerneldoc claims :(
> 
> Robin.
> 
> > /* Wrap: we always want a cpu. */
> > -   i %= num_online_cpus();
> > +   i %= cpumask_weight(mask);
> > if (node == NUMA_NO_NODE) {
> > -   for_each_cpu(cpu, cpu_online_mask)
> > +   for_each_cpu(cpu, mask) {
> > if (i-- == 0)
> > return cpu;
> > +   }
> > } else {
> > /* NUMA first. */
> > -   for_each_cpu_and(cpu, cpumask_of_node(node), cpu_online_mask)
> > +   for_each_cpu_and(cpu, cpumask_of_node(node), mask) {
> > if (i-- == 0)
> > return cpu;
> > +   }
> > -   for_each_cpu(cpu, cpu_online_mask) {
> > +   for_each_cpu(cpu, mask) {
> > /* Skip NUMA nodes, done above. */
> > if (cpumask_test_cpu(cpu, cpumask_of_node(node)))
> > continue;
> > 



Re: [PATCH 5.4 29/33] net, sctp, filter: remap copy_from_user failure error

2021-01-22 Thread Marcelo Ricardo Leitner
On Fri, Jan 22, 2021 at 03:12:45PM +0100, Greg Kroah-Hartman wrote:
> From: Daniel Borkmann 
> 
> [ no upstream commit ]
> 
> Fix a potential kernel address leakage for the prerequisite where there is
> a BPF program attached to the cgroup/setsockopt hook. The latter can only
> be attached under root, however, if the attached program returns 1 to then
> run the related kernel handler, an unprivileged program could probe for
> kernel addresses that way. The reason this is possible is that we're under
> set_fs(KERNEL_DS) when running the kernel setsockopt handler. Aside from
> old cBPF there is also SCTP's struct sctp_getaddrs_old which contains
> pointers in the uapi struct that further need copy_from_user() inside the
> handler. In the normal case this would just return -EFAULT, but under a
> temporary KERNEL_DS setting the memory would be copied and we'd end up at
> a different error code, that is, -EINVAL, for both cases given subsequent
> validations fail, which then allows the app to distinguish and make use of
> this fact for probing the address space. In case of later kernel versions
> this issue won't work anymore thanks to Christoph Hellwig's work that got
> rid of the various temporary set_fs() address space overrides altogether.
> One potential option for 5.4 as the only affected stable kernel with the
> least complexity would be to remap those affected -EFAULT copy_from_user()
> error codes with -EINVAL such that they cannot be probed anymore. Risk of
> breakage should be rather low for this particular error case.
> 
> Fixes: 0d01da6afc54 ("bpf: implement getsockopt and setsockopt hooks")
> Reported-by: Ryota Shiga (Flatt Security)
> Signed-off-by: Daniel Borkmann 
> Cc: Stanislav Fomichev 
> Cc: Eric Dumazet 
> Cc: Marcelo Ricardo Leitner 
> Signed-off-by: Greg Kroah-Hartman 

For sctp bits,
Acked-by: Marcelo Ricardo Leitner 

...
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -1319,7 +1319,7 @@ static int __sctp_setsockopt_connectx(st
>  
>   kaddrs = memdup_user(addrs, addrs_size);
>   if (IS_ERR(kaddrs))
> - return PTR_ERR(kaddrs);
> + return PTR_ERR(kaddrs) == -EFAULT ? -EINVAL : PTR_ERR(kaddrs);
>  
>   /* Allow security module to validate connectx addresses. */
>   err = security_sctp_bind_connect(sk, SCTP_SOCKOPT_CONNECTX,
> 
> 


Re: [EXT] Re: [PATCH v5 9/9] task_isolation: kick_all_cpus_sync: don't kick isolated cpus

2021-01-22 Thread Marcelo Tosatti
On Tue, Nov 24, 2020 at 12:21:06AM +0100, Frederic Weisbecker wrote:
> On Mon, Nov 23, 2020 at 10:39:34PM +, Alex Belits wrote:
> > 
> > On Mon, 2020-11-23 at 23:29 +0100, Frederic Weisbecker wrote:
> > > External Email
> > > 
> > > ---
> > > ---
> > > On Mon, Nov 23, 2020 at 05:58:42PM +, Alex Belits wrote:
> > > > From: Yuri Norov 
> > > > 
> > > > Make sure that kick_all_cpus_sync() does not call CPUs that are
> > > > running
> > > > isolated tasks.
> > > > 
> > > > Signed-off-by: Yuri Norov 
> > > > [abel...@marvell.com: use safe task_isolation_cpumask()
> > > > implementation]
> > > > Signed-off-by: Alex Belits 
> > > > ---
> > > >  kernel/smp.c | 14 +-
> > > >  1 file changed, 13 insertions(+), 1 deletion(-)
> > > > 
> > > > diff --git a/kernel/smp.c b/kernel/smp.c
> > > > index 4d17501433be..b2faecf58ed0 100644
> > > > --- a/kernel/smp.c
> > > > +++ b/kernel/smp.c
> > > > @@ -932,9 +932,21 @@ static void do_nothing(void *unused)
> > > >   */
> > > >  void kick_all_cpus_sync(void)
> > > >  {
> > > > +   struct cpumask mask;
> > > > +
> > > > /* Make sure the change is visible before we kick the cpus */
> > > > smp_mb();
> > > > -   smp_call_function(do_nothing, NULL, 1);
> > > > +
> > > > +   preempt_disable();
> > > > +#ifdef CONFIG_TASK_ISOLATION
> > > > +   cpumask_clear(&mask);
> > > > +   task_isolation_cpumask(&mask);
> > > > +   cpumask_complement(&mask, &mask);
> > > > +#else
> > > > +   cpumask_setall(&mask);
> > > > +#endif
> > > > +   smp_call_function_many(&mask, do_nothing, NULL, 1);
> > > > +   preempt_enable();
> > > 
> > > Same comment about IPIs here.
> > 
> > This is different from timers. The original design was based on the
> > idea that every CPU should be able to enter kernel at any time and run
> > kernel code with no additional preparation. Then the only solution is
> > to always do full broadcast and require all CPUs to process it.
> > 
> > What I am trying to introduce is the idea of CPU that is not likely to
> > run kernel code any soon, and can afford to go through an additional
> > synchronization procedure on the next entry into kernel. The
> > synchronization is not skipped, it simply happens later, early in
> > kernel entry code.

Perhaps a bitmask of pending flushes makes more sense?
static_key_enable IPIs are one of the users, but for that case it would
be necessary to differentiate between in-kernel mode and out-of-kernel
mode atomically (since the i-cache flush must be performed immediately
if the isolated CPU is in kernel mode).
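
A minimal sketch of what I mean (all names made up, not an existing
kernel API): a per-CPU atomic of pending-flush bits that housekeeping
CPUs set instead of sending an IPI, and that the isolated CPU drains
early on its next kernel entry.

enum isol_pending_bits {
	ISOL_PENDING_ICACHE_FLUSH	= BIT(0),
	ISOL_PENDING_DCACHE_FLUSH	= BIT(1),
};

static DEFINE_PER_CPU(atomic_t, ll_isol_pending);

/* Housekeeping side: record the work instead of IPIing the isolated CPU. */
static inline void isolation_defer_flush(int cpu, int bits)
{
	atomic_or(bits, per_cpu_ptr(&ll_isol_pending, cpu));
}

/* Isolated CPU, early in the kernel entry path. */
static inline void isolation_process_pending(void)
{
	int bits = atomic_xchg(this_cpu_ptr(&ll_isol_pending), 0);

	if (bits & ISOL_PENDING_ICACHE_FLUSH)
		flush_icache_all();	/* arch-specific placeholder */
}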

> Ah I see, this is ordered that way:
> 
> ll_isol_flags = ISOLATED
> 
>        CPU 0                                  CPU 1
>        -----                                  -----
>                                               // kernel entry
>        data_to_sync = 1                       ll_isol_flags = ISOLATED_BROKEN
>        smp_mb()                               smp_mb()
>        if ll_isol_flags(CPU 1) == ISOLATED    READ data_to_sync
>             smp_call(CPU 1)

Since isolated mode with syscalls is a desired feature, a separate
atomic with in_kernel_mode = 0/1 (set/cleared on kernel entry/exit
while TIF_TASK_ISOLATION is set) would be necessary, along with
race-free logic similar to the above.
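
Continuing the sketch above (again, made-up names): the in_kernel_mode
atomic lets the remote side choose between deferring and IPIing, using
the same store/smp_mb()/load pattern as in the diagram quoted above.
The kernel-exit side is deliberately omitted; documenting why the window
after that mode switch is safe is the hard part Frederic points at below.

enum { LL_ISOL_USERSPACE, LL_ISOL_KERNEL };

static DEFINE_PER_CPU(atomic_t, ll_isol_mode);

static void isolation_pending_ipi(void *unused)
{
	isolation_process_pending();		/* from the previous sketch */
}

/* Remote CPU: defer when the target runs userspace, IPI when it is in
 * the kernel (e.g. an i-cache flush cannot wait in that case). */
static void isolation_kick(int cpu, int bits)
{
	isolation_defer_flush(cpu, bits);	/* from the previous sketch */
	smp_mb();				/* pending bits before mode read */
	if (atomic_read(per_cpu_ptr(&ll_isol_mode, cpu)) == LL_ISOL_KERNEL)
		smp_call_function_single(cpu, isolation_pending_ipi, NULL, 1);
}

/* Isolated CPU, early kernel entry. */
static void isolation_kernel_enter(void)
{
	atomic_set(this_cpu_ptr(&ll_isol_mode), LL_ISOL_KERNEL);
	smp_mb();				/* mode store before pending read */
	isolation_process_pending();
}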

> You should document that, ie: explain why what you're doing is safe.
> 
> Also Beware though that the data to sync in question doesn't need to be 
> visible
> in the entry code before task_isolation_kernel_enter(). You need to audit all
> the callers of kick_all_cpus_sync().

Cscope tag: flush_icache_range
   #   line  filename / context / line
   1 96  arch/arc/kernel/jump_label.c <>
 flush_icache_range(entry->code, entry->code + JUMP_LABEL_NOP_SIZE);

This case would be OK for delayed processing before kernel entry, as long as
no code before task_isolation_kernel_enter can be modified (which i am
not sure about).

But:

  36 28  arch/ia64/include/asm/cacheflush.h <>
 flush_icache_range(_addr, _addr + (len)); \

Is less certain.

Alex do you recall if arch_jump_label_transform was the only offender or 
there were others as well? (suppose handling only the ones which matter
in production at the moment, and later fixing individual ones makes most
sense).





Re: [PATCH v4 11/13] task_isolation: net: don't flush backlog on CPUs running isolated tasks

2021-01-22 Thread Marcelo Tosatti
On Thu, Oct 01, 2020 at 04:47:31PM +0200, Frederic Weisbecker wrote:
> On Wed, Jul 22, 2020 at 02:58:24PM +, Alex Belits wrote:
> > From: Yuri Norov 
> > 

> > so we don't need to flush it.
> 
> What guarantees that we have no backlog on it?

From Paolo's work to use lockless reading of
per-CPU skb lists

https://www.spinics.net/lists/netdev/msg682693.html

It also exposed skb queue length to userspace

https://www.spinics.net/lists/netdev/msg684939.html

But if I remember correctly, waiting for an RCU grace
period was also necessary to ensure there is no backlog!?

Paolo, would you please remind us what the sequence of steps was?
(And also, for the userspace isolation interface, where
the application informs the kernel that it's entering isolated
mode, is just confirming that the queues have zero length
sufficient?)

TIA!

> 
> > Currently flush_all_backlogs()
> > enqueues corresponding work on all CPUs including ones that run
> > isolated tasks. It leads to breaking task isolation for nothing.
> > 
> > In this patch, backlog flushing is enqueued only on non-isolated CPUs.
> > 
> > Signed-off-by: Yuri Norov 
> > [abel...@marvell.com: use safe task_isolation_on_cpu() implementation]
> > Signed-off-by: Alex Belits 
> > ---
> >  net/core/dev.c | 7 ++-
> >  1 file changed, 6 insertions(+), 1 deletion(-)
> > 
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 90b59fc50dc9..83a282f7453d 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -74,6 +74,7 @@
> >  #include 
> >  #include 
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -5624,9 +5625,13 @@ static void flush_all_backlogs(void)
> >  
> > get_online_cpus();
> >  
> > -   for_each_online_cpu(cpu)
> > +   smp_rmb();
> 
> What is it ordering?
> 
> > +   for_each_online_cpu(cpu) {
> > +   if (task_isolation_on_cpu(cpu))
> > +   continue;
> > queue_work_on(cpu, system_highpri_wq,
> >   per_cpu_ptr(&flush_works, cpu));
> > +   }
> >  
> > for_each_online_cpu(cpu)
> > flush_work(per_cpu_ptr(&flush_works, cpu));
> 
> Thanks.



Re: "general protection fault in sctp_ulpevent_notify_peer_addr_change" and "general protection fault in sctp_ulpevent_nofity_peer_addr_change" should share the same root cause

2021-01-11 Thread Marcelo Ricardo Leitner
On Tue, Jan 12, 2021 at 10:18:00AM +0800, 慕冬亮 wrote:
> Dear developers,
> 
> I find that "general protection fault in l2cap_sock_getsockopt" and
> "general protection fault in sco_sock_getsockopt" may be duplicated
> bugs from the same root cause.
> 
> First, by comparing the PoC similarity after own minimization, we find
> they share the same PoC. Second, the stack traces for both bug reports
> are the same except for the last function. And the different last
> functions are due to a function name change (typo fix) from
> "sctp_ulpevent_nofity_peer_addr_change" to
> "sctp_ulpevent_notify_peer_addr_change"

Not sure where you saw stack traces with this sctp function in it, but
the syzkaller reports from 17 Feb 2020 are not related to SCTP.

The one on sco_sock_getsockopt() seems to be a lack of parameter
validation: it doesn't check whether optval is big enough when handling
BT_PHY (which has the same value as SCTP_STATUS). It also seems to miss
a check for level != SOL_BLUETOOTH, but I may be wrong here.

l2cap_sock_getsockopt() also lacks a check on optlen.
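
For illustration, the kind of validation that is missing usually looks
like the sketch below. This is a generic example with made-up names
(SOL_EXAMPLE, struct example_info), not the actual bluetooth handlers:

static int example_getsockopt(struct socket *sock, int level, int optname,
			      char __user *optval, int __user *optlen)
{
	struct example_info info;
	int len;

	if (level != SOL_EXAMPLE)
		return -EINVAL;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&info, 0, sizeof(info));
	/* ... fill in info ... */

	/* Never copy more than the caller's buffer can hold. */
	len = min_t(int, len, sizeof(info));
	if (copy_to_user(optval, &info, len))
		return -EFAULT;

	return put_user(len, optlen);
}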

  Marcelo


Re: drivers/char/random.c needs a (new) maintainer

2020-12-18 Thread Marcelo Henrique Cerri
Hi, Ted and Jason.

Any updates on that?

I don't believe Torsten's concerns are simply about *applying* patches
but more about these long periods of radio silence. That kills
collaboration and disengages people. More than simply reviewing patches,
I would expect a maintainer to give direction and drive the
community. Asking Jason to review Nicolai's patches was a step towards
that, but I believe we could still benefit from better communication.

Besides Nicolai's RFC, are you also planning to take another look at
Stephan's patches?

Thank you for your attention.

On Tue, Dec 01, 2020 at 12:42:36PM +0100, Jason A. Donenfeld wrote:
> On Mon, Nov 30, 2020 at 5:56 PM Theodore Y. Ts'o  wrote:
> > patches this cycle.  One thing that would help me is if folks
> > (especially Jason, if you would) could start with a detailed review of
> > Nicolai's patches.
> 
> Sure, I'll take a look.

-- 
Regards,
Marcelo





Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-15 Thread Marcelo Tosatti
On Fri, Dec 11, 2020 at 10:59:59PM +0100, Paolo Bonzini wrote:
> On 11/12/20 22:04, Thomas Gleixner wrote:
> > > Its 100ms off with migration, and can be reduced further (customers
> > > complained about 5 seconds but seem happy with 0.1ms).
> > What is 100ms? Guaranteed maximum migration time?
> 
> I suppose it's the length between the time from KVM_GET_CLOCK and
> KVM_GET_MSR(IA32_TSC) to KVM_SET_CLOCK and KVM_SET_MSR(IA32_TSC).  But the
> VM is paused for much longer, the sequence for the non-live part of the
> migration (aka brownout) is as follows:
> 
> pause
> finish sending RAM          receive RAM                ~1 sec
> send paused-VM state        finish receiving RAM      \
>                             receive paused-VM state    ) 0.1 sec
>                             restart                   /
> 
> The nanosecond and TSC times are sent as part of the paused-VM state at the
> very end of the live migration process.
> 
> So it's still true that the time advances during live migration brownout;
> 0.1 seconds is just the final part of the live migration process.  But for
> _live_ migration there is no need to design things according to "people are
> happy if their clock is off by 0.1 seconds only".  

Agreed. What would be a good way to fix this?

It seems to me that using CLOCK_REALTIME, as in the interface Maxim is
proposing, is prone to differences in CLOCK_REALTIME itself between the
two hosts.

Perhaps there is another way to measure that 0.1 sec which is
independent of the clock values of the source and destination hosts
(say, by sending a packet once the clock stops counting).

Then, on the destination, measure delta = clock_restart_time -
packet_receival_time and increase the clock by that amount.
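
A rough userspace sketch of that idea (made-up names, and it ignores the
network latency of the marker packet itself): the destination VMM
timestamps the marker with its own CLOCK_MONOTONIC and advances the
guest clock by the locally measured pause just before restarting the
vCPUs, so no comparison between the two hosts' CLOCK_REALTIME values is
needed.

#include <stdint.h>
#include <time.h>

static int64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static int64_t marker_rx_ns;	/* set when the source's "clock stopped" marker arrives */

void on_marker_packet(void)
{
	marker_rx_ns = now_ns();
}

/* Called just before restarting the vCPUs: how much to advance the guest clock. */
int64_t paused_delta_ns(void)
{
	return now_ns() - marker_rx_ns;
}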



> Again, save-to-disk,
> reverse debugging and the like are a different story, which is why KVM
> should delegate policy to userspace (while documenting how to do it right).
> 
> Paolo
> 
> > CLOCK_REALTIME and CLOCK_TAI are off by the time the VM is paused and
> > this state persists up to the point where NTP corrects it with a time
> > jump.
> > 
> > So if migration takes 5 seconds then CLOCK_REALTIME is not off by 100ms
> > it's off by 5 seconds.
> > 
> > CLOCK_MONOTONIC/BOOTTIME might be off by 100ms between pause and resume.
> > 



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-11 Thread Marcelo Tosatti
On Fri, Dec 11, 2020 at 02:30:34PM +0100, Thomas Gleixner wrote:
> On Thu, Dec 10 2020 at 21:27, Marcelo Tosatti wrote:
> > On Thu, Dec 10, 2020 at 10:48:10PM +0100, Thomas Gleixner wrote:
> >> You really all live in a seperate universe creating your own rules how
> >> things which other people work hard on to get it correct can be screwed
> >> over.
> >
> > 1. T = read timestamp.
> > 2. migrate (VM stops for a certain period).
> > 3. use timestamp T.
> 
> This is exactly the problem. Time stops at pause and continues where it
> stopped on resume.
> 
> But CLOCK_REALTIME and CLOCK_TAI advanced in reality. So up to the point
> where NTP fixes this - if there is NTP at all - the guest CLOCK_REALTIME
> and CLOCK_TAI are off by tpause.
> 
> Now the application gets a packet from the outside world with a
> CLOCK_REALTIME timestamp which is suddenly ahead of the value it reads
> from clock_gettime(CLOCK_REALTIME) by tpause. So what is it supposed to
> do with that? Make stupid assumptions that the other end screwed up
> timekeeping, throw an error that the system it is running on screwed up
> timekeeping? And a second later when NTP catched up it gets the next
> surprise because the systems CLOCK_REALTIME jumped forward unexpectedly
> or if there is no NTP it's confused forever.

This can happen even with a "perfect" solution that syncs time
instantly on the migration destination. See steps 1, 2 and 3 above.

Unless you notify applications to invalidate their time reads,
I can't see a way to fix this.

Therefore, if you use VM migration in the first place, a certain amount
of timestamp accuracy error must be tolerated.

> How can you even assume that this is correct?

As noted above, even without a window of unsynchronized time (due to
delay for NTP to sync time), time reads can be stale.

> It is exactly the same problem as we had many years ago with hardware
> clocks suddenly stopping to tick which caused quite some stuff to go
> belly up.

Customers complained when it was 5 seconds off; now it's 0.1ms (and
people seem happy).

> In a proper suspend/resume scenario CLOCK_REALTIME/TAI are advanced
> (with a certain degree of accuracy) to compensate for the sleep time, so
> the other end of a communication is at least in the same ballpark, but
> not 50 seconds off.

It's 100ms off with migration, and that can be reduced further (customers
complained about 5 seconds but seem happy with 0.1ms).

> >> This features first, correctness later frenzy is insane and it better
> >> stops now before you pile even more crap on the existing steaming pile
> >> of insanities.
> >
> > Sure.
> 
> I wish that would be true. OS people - you should know that - are
> fighting forever with hardware people over feature madness and the
> attitude of 'we can fix that in software' which turns often enough out
> to be wrong.
> 
> Now sadly enough people who suffered from that madness work on
> virtualization and instead of trying to avoid the same problem they go
> off and make it even worse.

So you think it's important to reduce the 100ms offset?

> It's the same problem again as with hardware people. Not talking to the
> other people _before_ making uninformed assumptions and decisions.
> 
> We did it that way because big customer asked for it is not a
> justification for inflicting this on everybody else and thereby
> violating correctness. Works for me and my big customer is not a proof
> of correctness either.
> 
> It's another proof that this industry just "works" by chance.
> 
> Thanks,
> 
> tglx

OK, that makes sense; then reducing the 0.1ms window even further
is a useful thing to do. What would be an acceptable
CLOCK_REALTIME accuracy error on migration?





Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-10 Thread Marcelo Tosatti
On Thu, Dec 10, 2020 at 10:48:10PM +0100, Thomas Gleixner wrote:
> On Thu, Dec 10 2020 at 12:26, Marcelo Tosatti wrote:
> > On Wed, Dec 09, 2020 at 09:58:23PM +0100, Thomas Gleixner wrote:
> >> Marcelo,
> >> 
> >> On Wed, Dec 09 2020 at 13:34, Marcelo Tosatti wrote:
> >> > On Tue, Dec 08, 2020 at 10:33:15PM +0100, Thomas Gleixner wrote:
> >> >> On Tue, Dec 08 2020 at 15:11, Marcelo Tosatti wrote:
> >> >> > max_cycles overflow. Sent a message to Maxim describing it.
> >> >> 
> >> >> Truly helpful. Why the hell did you not talk to me when you ran into
> >> >> that the first time?
> >> >
> >> > Because 
> >> >
> >> > 1) Users wanted CLOCK_BOOTTIME to stop counting while the VM 
> >> > is paused (so we wanted to stop guest clock when VM is paused anyway).
> >> 
> >> How is that supposed to work w/o the guest kernels help if you have to
> >> keep clock realtime up to date? 
> >
> > Upon VM resume, we notify NTP daemon in the guest to sync realtime
> > clock.
> 
> Brilliant. What happens if there is no NTP daemon? What happens if the
> NTP daemon is not part of the virt orchestration magic and cannot be
> notified, then it will notice the time jump after the next update
> interval.
> 
> What about correctness?
> 
> ALL CLOCK_* stop and resume when the VM is resumed at the point where
> they stopped.
> 
> So up to the point where NTP catches up and corrects clock realtime and
> TAI other processes can observe that time jumped in the outside world,
> e.g. via a network packet or whatever, but there is no reason why time
> should have jumped outside vs. the local one.
> 
> You really all live in a seperate universe creating your own rules how
> things which other people work hard on to get it correct can be screwed
> over.

1. T = read timestamp.
2. migrate (VM stops for a certain period).
3. use timestamp T.

> Of course this all is nowhere documented in detail. At least a quick
> search with about 10 different keyword combinations revealed absolutely
> nothing.
> 
> This features first, correctness later frenzy is insane and it better
> stops now before you pile even more crap on the existing steaming pile
> of insanities.

Sure.



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-10 Thread Marcelo Tosatti
On Wed, Dec 09, 2020 at 09:58:23PM +0100, Thomas Gleixner wrote:
> Marcelo,
> 
> On Wed, Dec 09 2020 at 13:34, Marcelo Tosatti wrote:
> > On Tue, Dec 08, 2020 at 10:33:15PM +0100, Thomas Gleixner wrote:
> >> On Tue, Dec 08 2020 at 15:11, Marcelo Tosatti wrote:
> >> > max_cycles overflow. Sent a message to Maxim describing it.
> >> 
> >> Truly helpful. Why the hell did you not talk to me when you ran into
> >> that the first time?
> >
> > Because 
> >
> > 1) Users wanted CLOCK_BOOTTIME to stop counting while the VM 
> > is paused (so we wanted to stop guest clock when VM is paused anyway).
> 
> How is that supposed to work w/o the guest kernels help if you have to
> keep clock realtime up to date? 

Upon VM resume, we notify the NTP daemon in the guest to sync the
realtime clock.
> 
> > 2) The solution to inject NMIs to the guest seemed overly
> > complicated.
> 
> Why do you need NMIs?
> 
> All you need is a way to communicate to the guest that it should prepare
> for clock madness to happen. Whether that's an IPI or a bit in a
> hyperpage which gets checked during the update of the guest timekeeping
> does not matter at all.
> 
> But you certainly do not need an NMI because there is nothing useful you
> can do within an NMI.
> 
> Thanks,
> 
> tglx



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-09 Thread Marcelo Tosatti
On Tue, Dec 08, 2020 at 10:33:15PM +0100, Thomas Gleixner wrote:
> On Tue, Dec 08 2020 at 15:11, Marcelo Tosatti wrote:
> > On Tue, Dec 08, 2020 at 05:02:07PM +0100, Thomas Gleixner wrote:
> >> On Tue, Dec 08 2020 at 16:50, Maxim Levitsky wrote:
> >> > On Mon, 2020-12-07 at 20:29 -0300, Marcelo Tosatti wrote:
> >> >> > +This ioctl allows to reconstruct the guest's IA32_TSC and TSC_ADJUST 
> >> >> > value
> >> >> > +from the state obtained in the past by KVM_GET_TSC_STATE on the same 
> >> >> > vCPU.
> >> >> > +
> >> >> > +If 'KVM_TSC_STATE_TIMESTAMP_VALID' is set in flags,
> >> >> > +KVM will adjust the guest TSC value by the time that passed since 
> >> >> > the moment
> >> >> > +CLOCK_REALTIME timestamp was saved in the struct and current value of
> >> >> > +CLOCK_REALTIME, and set the guest's TSC to the new value.
> >> >> 
> >> >> This introduces the wraparound bug in Linux timekeeping, doesnt it?
> >> 
> >> Which bug?
> >
> > max_cycles overflow. Sent a message to Maxim describing it.
> 
> Truly helpful. Why the hell did you not talk to me when you ran into
> that the first time?

Because:

1) Users wanted CLOCK_BOOTTIME to stop counting while the VM
is paused (so we wanted to stop the guest clock when the VM is paused
anyway).

2) The solution of injecting NMIs into the guest seemed overly
complicated.

> >> For one I have no idea which bug you are talking about and if the bug is
> >> caused by the VMM then why would you "fix" it in the guest kernel.
> >
> > 1) Stop guest, save TSC value of cpu-0 = V.
> > 2) Wait for some amount of time = W.
> > 3) Start guest, load TSC value with V+W.
> >
> > Can cause an overflow on Linux timekeeping.
> 
> Yes, because you violate the basic assumption which Linux timekeeping
> makes. See the other mail in this thread.
> 
> Thanks,
> 
> tglx



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Tue, Dec 08, 2020 at 06:25:13PM +0200, Maxim Levitsky wrote:
> On Tue, 2020-12-08 at 17:02 +0100, Thomas Gleixner wrote:
> > On Tue, Dec 08 2020 at 16:50, Maxim Levitsky wrote:
> > > On Mon, 2020-12-07 at 20:29 -0300, Marcelo Tosatti wrote:
> > > > > +This ioctl allows to reconstruct the guest's IA32_TSC and TSC_ADJUST 
> > > > > value
> > > > > +from the state obtained in the past by KVM_GET_TSC_STATE on the same 
> > > > > vCPU.
> > > > > +
> > > > > +If 'KVM_TSC_STATE_TIMESTAMP_VALID' is set in flags,
> > > > > +KVM will adjust the guest TSC value by the time that passed since 
> > > > > the moment
> > > > > +CLOCK_REALTIME timestamp was saved in the struct and current value of
> > > > > +CLOCK_REALTIME, and set the guest's TSC to the new value.
> > > > 
> > > > This introduces the wraparound bug in Linux timekeeping, doesnt it?
> > 
> > Which bug?
> > 
> > > It does.
> > > Could you prepare a reproducer for this bug so I get a better idea about
> > > what are you talking about?
> > > 
> > > I assume you need very long (like days worth) jump to trigger this bug
> > > and for such case we can either work around it in qemu / kernel 
> > > or fix it in the guest kernel and I strongly prefer the latter.
> > > 
> > > Thomas, what do you think about it?
> > 
> > For one I have no idea which bug you are talking about and if the bug is
> > caused by the VMM then why would you "fix" it in the guest kernel.
> 
> The "bug" is that if VMM moves a hardware time counter (tsc or anything else) 
> forward by large enough value in one go, 
> then the guest kernel will supposingly have an overflow in the time code.
> I don't consider this to be a buggy VMM behavior, but rather a kernel
> bug that should be fixed (if this bug actually exists)

It exists.

> Purely in theory this can even happen on real hardware if for example SMM 
> handler
> blocks a CPU from running for a long duration, or hardware debugging
> interface does, or some other hardware transparent sleep mechanism kicks in
> and blocks a CPU from running.
> (We do handle this gracefully for S3/S4)
> 
> > 
> > Aside of that I think I made it pretty clear what the right thing to do
> > is.
> 
> This is orthogonal to this issue of the 'bug'. 
> Here we are not talking about per-vcpu TSC offsets, something that I said 
> that I do agree with you that it would be very nice to get rid of.
>  
> We are talking about the fact that TSC can jump forward by arbitrary large
> value if the migration took arbitrary amount of time, which 
> (assuming that the bug is real) can crash the guest kernel.

QE reproduced it.

> This will happen even if we use per VM global tsc offset.
> 
> So what do you think?
> 
> Best regards,
>   Maxim Levitsky
> 
> > 
> > Thanks,
> > 
> > tglx
> > 
> 



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Tue, Dec 08, 2020 at 05:02:07PM +0100, Thomas Gleixner wrote:
> On Tue, Dec 08 2020 at 16:50, Maxim Levitsky wrote:
> > On Mon, 2020-12-07 at 20:29 -0300, Marcelo Tosatti wrote:
> >> > +This ioctl allows to reconstruct the guest's IA32_TSC and TSC_ADJUST 
> >> > value
> >> > +from the state obtained in the past by KVM_GET_TSC_STATE on the same 
> >> > vCPU.
> >> > +
> >> > +If 'KVM_TSC_STATE_TIMESTAMP_VALID' is set in flags,
> >> > +KVM will adjust the guest TSC value by the time that passed since the 
> >> > moment
> >> > +CLOCK_REALTIME timestamp was saved in the struct and current value of
> >> > +CLOCK_REALTIME, and set the guest's TSC to the new value.
> >> 
> >> This introduces the wraparound bug in Linux timekeeping, doesnt it?
> 
> Which bug?

max_cycles overflow. Sent a message to Maxim describing it.

> 
> > It does.
> > Could you prepare a reproducer for this bug so I get a better idea about
> > what are you talking about?
> >
> > I assume you need very long (like days worth) jump to trigger this bug
> > and for such case we can either work around it in qemu / kernel 
> > or fix it in the guest kernel and I strongly prefer the latter.
> >
> > Thomas, what do you think about it?
> 
> For one I have no idea which bug you are talking about and if the bug is
> caused by the VMM then why would you "fix" it in the guest kernel.

1) Stop guest, save TSC value of cpu-0 = V.
2) Wait for some amount of time = W.
3) Start guest, load TSC value with V+W.

This can cause an overflow in Linux timekeeping.
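
To put rough numbers on it (illustrative only; the exact limits depend on
the clocksource mult/shift): the timekeeping core only handles per-update
deltas up to the clocksource's max_cycles, which for the TSC corresponds
to the max_idle_ns value printed at boot, typically a few hundred
seconds. If W above is, say, one hour on a 2.6 GHz TSC, the first update
after step 3 has to convert roughly 3600 * 2.6e9 ~= 9.4e12 cycles in one
go, well beyond that limit, so the (delta * mult) >> shift conversion can
overflow 64 bits, which is what CONFIG_DEBUG_TIMEKEEPING's
timekeeping_check_update() warns about.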

> Aside of that I think I made it pretty clear what the right thing to do
> is.

Sure: the notion of a "unique TSC offset" already exists (it is detected
by the TSC-write logic, though it is not explicit in the interface).

But AFAIK it works pretty well.

Exposing a single TSC value at the interface level seems alright to
me...



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Tue, Dec 08, 2020 at 04:50:53PM +0200, Maxim Levitsky wrote:
> On Mon, 2020-12-07 at 20:29 -0300, Marcelo Tosatti wrote:
> > On Thu, Dec 03, 2020 at 07:11:16PM +0200, Maxim Levitsky wrote:
> > > These two new ioctls allow to more precisly capture and
> > > restore guest's TSC state.
> > > 
> > > Both ioctls are meant to be used to accurately migrate guest TSC
> > > even when there is a significant downtime during the migration.
> > > 
> > > Suggested-by: Paolo Bonzini 
> > > Signed-off-by: Maxim Levitsky 
> > > ---
> > >  Documentation/virt/kvm/api.rst | 65 ++
> > >  arch/x86/kvm/x86.c | 73 ++
> > >  include/uapi/linux/kvm.h   | 15 +++
> > >  3 files changed, 153 insertions(+)
> > > 
> > > diff --git a/Documentation/virt/kvm/api.rst 
> > > b/Documentation/virt/kvm/api.rst
> > > index 70254eaa5229f..ebecfe4b414ce 100644
> > > --- a/Documentation/virt/kvm/api.rst
> > > +++ b/Documentation/virt/kvm/api.rst
> > > @@ -4826,6 +4826,71 @@ If a vCPU is in running state while this ioctl is 
> > > invoked, the vCPU may
> > >  experience inconsistent filtering behavior on MSR accesses.
> > >  
> > >  
> > > +4.127 KVM_GET_TSC_STATE
> > > +
> > > +
> > > +:Capability: KVM_CAP_PRECISE_TSC
> > > +:Architectures: x86
> > > +:Type: vcpu ioctl
> > > +:Parameters: struct kvm_tsc_state
> > > +:Returns: 0 on success, < 0 on error
> > > +
> > > +::
> > > +
> > > +  #define KVM_TSC_STATE_TIMESTAMP_VALID 1
> > > +  #define KVM_TSC_STATE_TSC_ADJUST_VALID 2
> > > +  struct kvm_tsc_state {
> > > + __u32 flags;
> > > + __u64 nsec;
> > > + __u64 tsc;
> > > + __u64 tsc_adjust;
> > > +  };
> > > +
> > > +flags values for ``struct kvm_tsc_state``:
> > > +
> > > +``KVM_TSC_STATE_TIMESTAMP_VALID``
> > > +
> > > +  ``nsec`` contains nanoseconds from unix epoch.
> > > +Always set by KVM_GET_TSC_STATE, might be omitted in 
> > > KVM_SET_TSC_STATE
> > > +
> > > +``KVM_TSC_STATE_TSC_ADJUST_VALID``
> > > +
> > > +  ``tsc_adjust`` contains valid IA32_TSC_ADJUST value
> > > +
> > > +
> > > +This ioctl allows the user space to read the guest's 
> > > IA32_TSC,IA32_TSC_ADJUST,
> > > +and the current value of host's CLOCK_REALTIME clock in nanoseconds 
> > > since unix
> > > +epoch.
> > 
> > Why is CLOCK_REALTIME necessary at all? kvmclock uses the host clock as
> > a time base, but for TSC it should not be necessary.
> 
> 
> CLOCK_REALTIME is used as an absolute time reference that should match
> on both computers. I could have used CLOCK_TAI instead for example.
> 
> The reference allows to account for time passed between saving and restoring
> the TSC as explained above.

As mentioned, we don't want this due to the overflow.

Again, I think the higher priority is to allow enabling the invariant TSC
by default (to disable kvmclock).

> > > +
> > > +
> > > +4.128 KVM_SET_TSC_STATE
> > > +
> > > +
> > > +:Capability: KVM_CAP_PRECISE_TSC
> > > +:Architectures: x86
> > > +:Type: vcpu ioctl
> > > +:Parameters: struct kvm_tsc_state
> > > +:Returns: 0 on success, < 0 on error
> > > +
> > > +::
> > > +
> > > +This ioctl allows to reconstruct the guest's IA32_TSC and TSC_ADJUST 
> > > value
> > > +from the state obtained in the past by KVM_GET_TSC_STATE on the same 
> > > vCPU.
> > > +
> > > +If 'KVM_TSC_STATE_TIMESTAMP_VALID' is set in flags,
> > > +KVM will adjust the guest TSC value by the time that passed since the 
> > > moment
> > > +CLOCK_REALTIME timestamp was saved in the struct and current value of
> > > +CLOCK_REALTIME, and set the guest's TSC to the new value.
> > 
> > This introduces the wraparound bug in Linux timekeeping, doesnt it?
> 
> It does.
> Could you prepare a reproducer for this bug so I get a better idea about
> what are you talking about?

Enable CONFIG_DEBUG_TIMEKEEPING and check what max_cycles is for the TSC
clocksource:

#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */

static void timekeeping_check_update(struct timekeeper *tk, u64 offset)
{

	u64 max_cycles = tk->tkr_mono.clock->max_cycles;

Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Mon, Dec 07, 2020 at 10:04:45AM -0800, Andy Lutomirski wrote:
> 
> > On Dec 7, 2020, at 9:00 AM, Maxim Levitsky  wrote:
> > 
> > On Mon, 2020-12-07 at 08:53 -0800, Andy Lutomirski wrote:
>  On Dec 7, 2020, at 8:38 AM, Thomas Gleixner  wrote:
> >>> 
> >>> On Mon, Dec 07 2020 at 14:16, Maxim Levitsky wrote:
> > On Sun, 2020-12-06 at 17:19 +0100, Thomas Gleixner wrote:
> > From a timekeeping POV and the guests expectation of TSC this is
> > fundamentally wrong:
> > 
> > tscguest = scaled(hosttsc) + offset
> > 
> > The TSC has to be viewed systemwide and not per CPU. It's systemwide
> > used for timekeeping and for that to work it has to be synchronized. 
> > 
> > Why would this be different on virt? Just because it's virt or what? 
> > 
> > Migration is a guest wide thing and you're not migrating single vCPUs.
> > 
> > This hackery just papers over he underlying design fail that KVM looks
> > at the TSC per vCPU which is the root cause and that needs to be fixed.
>  
>  I don't disagree with you.
>  As far as I know the main reasons that kvm tracks TSC per guest are
>  
>  1. cases when host tsc is not stable 
>  (hopefully rare now, and I don't mind making
>  the new API just refuse to work when this is detected, and revert to old 
>  way
>  of doing things).
> >>> 
> >>> That's a trainwreck to begin with and I really would just not support it
> >>> for anything new which aims to be more precise and correct.  TSC has
> >>> become pretty reliable over the years.
> >>> 
>  2. (theoretical) ability of the guest to introduce per core tsc offfset
>  by either using TSC_ADJUST (for which I got recently an idea to stop
>  advertising this feature to the guest), or writing TSC directly which
>  is allowed by Intel's PRM:
> >>> 
> >>> For anything halfways modern the write to TSC is reflected in TSC_ADJUST
> >>> which means you get the precise offset.
> >>> 
> >>> The general principle still applies from a system POV.
> >>> 
> >>>TSC base (systemwide view) - The sane case
> >>> 
> >>>TSC CPU  = TSC base + TSC_ADJUST
> >>> 
> >>> The guest TSC base is a per guest constant offset to the host TSC.
> >>> 
> >>>TSC guest base = TSC host base + guest base offset
> >>> 
> >>> If the guest want's this different per vCPU by writing to the MSR or to
> >>> TSC_ADJUST then you still can have a per vCPU offset in TSC_ADJUST which
> >>> is the offset to the TSC base of the guest.
> >> 
> >> How about, if the guest wants to write TSC_ADJUST, it can turn off all 
> >> paravirt features and keep both pieces?
> >> 
> > 
> > This is one of the things I had in mind recently.
> > 
> > Even better, we can stop advertising TSC_ADJUST in CPUID to the guest 
> > and forbid it from writing it at all.
> 
> Seems reasonable to me.
> 
> It also seems okay for some MSRs to stop working after the guest enabled new 
> PV timekeeping.
> 
> I do have a feature request, though: IMO it would be quite nifty if the new 
> kvmclock structure could also expose NTP corrections. In other words, if you 
> could expose enough info to calculate CLOCK_MONOTONIC_RAW, CLOCK_MONOTONIC, 
> and CLOCK_REALTIME, then we could have paravirt NTP.

Hi Andy,

Any reason why drivers/ptp/ptp_kvm.c does not work for you?

> Bonus points if whatever you do for CLOCK_REALTIME also exposes leap seconds 
> in a race free way :). But I suppose that just exposing TAI and letting the 
> guest deal with the TAI - UTC offset itself would get the job done just fine.



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Thu, Dec 03, 2020 at 07:11:16PM +0200, Maxim Levitsky wrote:
> These two new ioctls allow to more precisly capture and
> restore guest's TSC state.
> 
> Both ioctls are meant to be used to accurately migrate guest TSC
> even when there is a significant downtime during the migration.
> 
> Suggested-by: Paolo Bonzini 
> Signed-off-by: Maxim Levitsky 
> ---
>  Documentation/virt/kvm/api.rst | 65 ++
>  arch/x86/kvm/x86.c | 73 ++
>  include/uapi/linux/kvm.h   | 15 +++
>  3 files changed, 153 insertions(+)
> 
> diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
> index 70254eaa5229f..ebecfe4b414ce 100644
> --- a/Documentation/virt/kvm/api.rst
> +++ b/Documentation/virt/kvm/api.rst
> @@ -4826,6 +4826,71 @@ If a vCPU is in running state while this ioctl is 
> invoked, the vCPU may
>  experience inconsistent filtering behavior on MSR accesses.
>  
>  
> +4.127 KVM_GET_TSC_STATE
> +
> +
> +:Capability: KVM_CAP_PRECISE_TSC
> +:Architectures: x86
> +:Type: vcpu ioctl
> +:Parameters: struct kvm_tsc_state
> +:Returns: 0 on success, < 0 on error
> +
> +::
> +
> +  #define KVM_TSC_STATE_TIMESTAMP_VALID 1
> +  #define KVM_TSC_STATE_TSC_ADJUST_VALID 2
> +  struct kvm_tsc_state {
> + __u32 flags;
> + __u64 nsec;
> + __u64 tsc;
> + __u64 tsc_adjust;
> +  };
> +
> +flags values for ``struct kvm_tsc_state``:
> +
> +``KVM_TSC_STATE_TIMESTAMP_VALID``
> +
> +  ``nsec`` contains nanoseconds from unix epoch.
> +Always set by KVM_GET_TSC_STATE, might be omitted in KVM_SET_TSC_STATE
> +
> +``KVM_TSC_STATE_TSC_ADJUST_VALID``
> +
> +  ``tsc_adjust`` contains valid IA32_TSC_ADJUST value
> +
> +
> +This ioctl allows the user space to read the guest's 
> IA32_TSC,IA32_TSC_ADJUST,
> +and the current value of host's CLOCK_REALTIME clock in nanoseconds since 
> unix
> +epoch.

Why is CLOCK_REALTIME necessary at all? kvmclock uses the host clock as
a time base, but for TSC it should not be necessary.

> +
> +
> +4.128 KVM_SET_TSC_STATE
> +
> +
> +:Capability: KVM_CAP_PRECISE_TSC
> +:Architectures: x86
> +:Type: vcpu ioctl
> +:Parameters: struct kvm_tsc_state
> +:Returns: 0 on success, < 0 on error
> +
> +::
> +
> +This ioctl allows to reconstruct the guest's IA32_TSC and TSC_ADJUST value
> +from the state obtained in the past by KVM_GET_TSC_STATE on the same vCPU.
> +
> +If 'KVM_TSC_STATE_TIMESTAMP_VALID' is set in flags,
> +KVM will adjust the guest TSC value by the time that passed since the moment
> +CLOCK_REALTIME timestamp was saved in the struct and current value of
> +CLOCK_REALTIME, and set the guest's TSC to the new value.

This introduces the wraparound bug in Linux timekeeping, doesnt it?

> +
> +Otherwise KVM will set the guest TSC value to the exact value as given
> +in the struct.
> +
> +if KVM_TSC_STATE_TSC_ADJUST_VALID is set, and guest supports 
> IA32_MSR_TSC_ADJUST,
> +then its value will be set to the given value from the struct.
> +
> +It is assumed that either both ioctls will be run on the same machine,
> +or that source and destination machines have synchronized clocks.



>  5. The kvm_run structure
>  
>  
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index a3fdc16cfd6f3..9b8a2fe3a2398 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2438,6 +2438,21 @@ static bool kvm_get_walltime_and_clockread(struct 
> timespec64 *ts,
>  
>   return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
>  }
> +
> +
> +static void kvm_get_walltime(u64 *walltime_ns, u64 *host_tsc)
> +{
> + struct timespec64 ts;
> +
> > + if (kvm_get_walltime_and_clockread(&ts, host_tsc)) {
> > + *walltime_ns = timespec64_to_ns(&ts);
> + return;
> + }
> +
> + *host_tsc = rdtsc();
> + *walltime_ns = ktime_get_real_ns();
> +}
> +
>  #endif
>  
>  /*
> @@ -3757,6 +3772,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
> ext)
>   case KVM_CAP_X86_USER_SPACE_MSR:
>   case KVM_CAP_X86_MSR_FILTER:
>   case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
> +#ifdef CONFIG_X86_64
> + case KVM_CAP_PRECISE_TSC:
> +#endif
>   r = 1;
>   break;
>   case KVM_CAP_SYNC_REGS:
> @@ -4999,6 +5017,61 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>   case KVM_GET_SUPPORTED_HV_CPUID:
>   r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
>   break;
> +#ifdef CONFIG_X86_64
> + case KVM_GET_TSC_STATE: {
> + struct kvm_tsc_state __user *user_tsc_state = argp;
> + u64 host_tsc;
> +
> + struct kvm_tsc_state tsc_state = {
> + .flags = KVM_TSC_STATE_TIMESTAMP_VALID
> + };
> +
> > + kvm_get_walltime(&tsc_state.nsec, &host_tsc);
> + tsc_state.tsc = kvm_read_l1_tsc(vcpu, host_tsc);
> +
> + if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
> +  

Re: [PATCH v2 0/3] RFC: Precise TSC migration

2020-12-08 Thread Marcelo Tosatti
On Thu, Dec 03, 2020 at 07:11:15PM +0200, Maxim Levitsky wrote:
> Hi!
> 
> This is the second version of the work to make TSC migration more accurate,
> as was defined by Paulo at:
> https://www.spinics.net/lists/kvm/msg225525.html

Maxim,

Can you please describe the practical problem that is being fixed,
preferably with instructions on how to reproduce it?

> I omitted most of the semi-offtopic points I raised related to TSC
> in the previous RFC where we can continue the discussion.
> 
> I do want to raise another thing that I almost forgot.
> 
> On AMD systems, the Linux kernel will mark the guest tsc as
> unstable unless invtsc is set which is set on recent AMD
> hardware.
> 
> Take a look at 'unsynchronized_tsc()' to verify this.
> 
> This is another thing that IMHO should be fixed at least when
> running under KVM.
> 
> Note that I forgot to mention that
> X86_FEATURE_TSC_RELIABLE also short-circuits this code,
> thus giving another reason to enable it under KVM.
> 
> Changes from V1:
> 
> - added KVM_TSC_STATE_TIMESTAMP_VALID instead of testing ns == 0
> - allow diff < 0, because it is still better that capping it to 0
> - updated tsc_msr_test unit test to cover this feature
> - refactoring
> 
> Patches to enable this feature in qemu are in the process of
> being sent to qemu-devel mailing list.
> 
> Best regards,
> Maxim Levitsky
> 
> Maxim Levitsky (3):
>   KVM: x86: implement KVM_{GET|SET}_TSC_STATE
>   KVM: x86: introduce KVM_X86_QUIRK_TSC_HOST_ACCESS
>   kvm/selftests: update tsc_msrs_test to cover
> KVM_X86_QUIRK_TSC_HOST_ACCESS
> 
>  Documentation/virt/kvm/api.rst| 65 +
>  arch/x86/include/uapi/asm/kvm.h   |  1 +
>  arch/x86/kvm/x86.c| 92 ++-
>  include/uapi/linux/kvm.h  | 15 +++
>  .../selftests/kvm/x86_64/tsc_msrs_test.c  | 79 ++--
>  5 files changed, 237 insertions(+), 15 deletions(-)
> 
> -- 
> 2.26.2
> 



Re: [PATCH v2 1/3] KVM: x86: implement KVM_{GET|SET}_TSC_STATE

2020-12-08 Thread Marcelo Tosatti
On Sun, Dec 06, 2020 at 05:19:16PM +0100, Thomas Gleixner wrote:
> On Thu, Dec 03 2020 at 19:11, Maxim Levitsky wrote:
> > +   case KVM_SET_TSC_STATE: {
> > +   struct kvm_tsc_state __user *user_tsc_state = argp;
> > +   struct kvm_tsc_state tsc_state;
> > +   u64 host_tsc, wall_nsec;
> > +
> > +   u64 new_guest_tsc, new_guest_tsc_offset;
> > +
> > +   r = -EFAULT;
> > +   if (copy_from_user(&tsc_state, user_tsc_state, 
> > sizeof(tsc_state)))
> > +   goto out;
> > +
> > +   kvm_get_walltime(&wall_nsec, &host_tsc);
> > +   new_guest_tsc = tsc_state.tsc;
> > +
> > +   if (tsc_state.flags & KVM_TSC_STATE_TIMESTAMP_VALID) {
> > +   s64 diff = wall_nsec - tsc_state.nsec;
> > +   if (diff >= 0)
> > +   new_guest_tsc += nsec_to_cycles(vcpu, diff);
> > +   else
> > +   new_guest_tsc -= nsec_to_cycles(vcpu, -diff);
> > +   }
> > +
> > +   new_guest_tsc_offset = new_guest_tsc - kvm_scale_tsc(vcpu, 
> > host_tsc);
> > +   kvm_vcpu_write_tsc_offset(vcpu, new_guest_tsc_offset);
> 
> >From a timekeeping POV and the guests expectation of TSC this is
> fundamentally wrong:
> 
>   tscguest = scaled(hosttsc) + offset
> 
> The TSC has to be viewed systemwide and not per CPU. It's systemwide
> used for timekeeping and for that to work it has to be synchronized. 
> 
> Why would this be different on virt? Just because it's virt or what? 
> 
> Migration is a guest wide thing and you're not migrating single vCPUs.
> 
> This hackery just papers over he underlying design fail that KVM looks
> at the TSC per vCPU which is the root cause and that needs to be fixed.

It already does that: the unified TSC offset is kept in kvm->arch.cur_tsc_offset.




Re: [PATCH 0/2] RFC: Precise TSC migration

2020-12-04 Thread Marcelo Tosatti


On Thu, Dec 03, 2020 at 01:39:42PM +0200, Maxim Levitsky wrote:
> On Tue, 2020-12-01 at 16:48 -0300, Marcelo Tosatti wrote:
> > On Tue, Dec 01, 2020 at 02:30:39PM +0200, Maxim Levitsky wrote:
> > > On Mon, 2020-11-30 at 16:16 -0300, Marcelo Tosatti wrote:
> > > > Hi Maxim,
> > > > 
> > > > On Mon, Nov 30, 2020 at 03:35:57PM +0200, Maxim Levitsky wrote:
> > > > > Hi!
> > > > > 
> > > > > This is the first version of the work to make TSC migration more 
> > > > > accurate,
> > > > > as was defined by Paulo at:
> > > > > https://www.spinics.net/lists/kvm/msg225525.html
> > > > 
> > > > Description from Oliver's patch:
> > > > 
> > > > "To date, VMMs have typically restored the guest's TSCs by value using
> > > > the KVM_SET_MSRS ioctl for each vCPU. However, restoring the TSCs by
> > > > value introduces some challenges with synchronization as the TSCs
> > > > continue to tick throughout the restoration process. As such, KVM has
> > > > some heuristics around TSC writes to infer whether or not the guest or
> > > > host is attempting to synchronize the TSCs."
> > > > 
> > > > Not really. The synchronization logic tries to sync TSCs during
> > > > BIOS boot (and CPU hotplug), because the TSC values are loaded
> > > > sequentially, say:
> > > > 
> > > > CPU realtimeTSC val
> > > > vcpu0   0 usec  0
> > > > vcpu1   100 usec0
> > > > vcpu2   200 usec0
> > > > ...
> > > > 
> > > > And we'd like to see all vcpus to read the same value at all times.
> > > > 
> > > > Other than that, comment makes sense. The problem with live migration
> > > > is as follows:
> > > > 
> > > > We'd like the TSC value to be written, ideally, just before the first
> > > > VM-entry a vCPU (because at the moment the TSC_OFFSET has been written, 
> > > > the vcpus tsc is ticking, which will cause a visible forward jump
> > > > in vcpus tsc time).
> > > > 
> > > > Before the first VM-entry is the farthest point in time before guest
> > > > entry that one could do that.
> > > > 
> > > > The window (or forward jump) between KVM_SET_TSC and VM-entry was about
> > > > 100ms last time i checked (which results in a 100ms time jump forward), 
> > > > See QEMU's 6053a86fe7bd3d5b07b49dae6c05f2cd0d44e687.
> > > > 
> > > > Have we measured any improvement with this patchset?
> > > 
> > > Its not about this window. 
> > > It is about time that passes between the point that we read the 
> > > TSC on source system (and we do it in qemu each time the VM is paused) 
> > > and the moment that we set the same TSC value on the target. 
> > > That time is unbounded.
> > 
> > OK. Well, its the same problem: ideally you'd want to do that just
> > before VCPU-entry.
> > 
> > > Also this patchset should decrease TSC skew that happens
> > > between restoring it on multiple vCPUs as well, since 
> > > KVM_SET_TSC_STATE doesn't have to happen at the same time,
> > > as it accounts for time passed on each vCPU.
> > > 
> > > 
> > > Speaking of kvmclock, somewhat offtopic since this is a different issue,
> > > I found out that qemu reads the kvmclock value on each pause, 
> > > and then 'restores' on unpause, using
> > > KVM_SET_CLOCK (this modifies the global kvmclock offset)
> > > 
> > > This means (and I tested it) that if guest uses kvmclock
> > > for time reference, it will not account for time passed in
> > > the paused state.
> > 
> > Yes, this is necessary because otherwise there might be an overflow
> > in the kernel time accounting code (if the clock delta is too large).
> 
> Could you elaborate on this? Do you mean that guest kernel can crash,
> when the time 'jumps' too far forward in one go?

It can crash (there will be an overflow and time will jump backwards).

> If so this will happen with kernel using TSC as well, 
> since we do let the virtual TSC to 'keep running' while VM is suspended, 
> and the goal of this patchset is to let it 'run' even while
> the VM is migrating.

True. For the overflow one, perhaps the TSC can be stopped (and restored)
similarly to KVMCLOCK.

See QEMU's commit 00f4d64ee76e873be881a82d893a591487aa7950.

> And if there 

Re: [PATCH 0/2] RFC: Precise TSC migration

2020-12-01 Thread Marcelo Tosatti
On Tue, Dec 01, 2020 at 02:30:39PM +0200, Maxim Levitsky wrote:
> On Mon, 2020-11-30 at 16:16 -0300, Marcelo Tosatti wrote:
> > Hi Maxim,
> > 
> > On Mon, Nov 30, 2020 at 03:35:57PM +0200, Maxim Levitsky wrote:
> > > Hi!
> > > 
> > > This is the first version of the work to make TSC migration more accurate,
> > > as was defined by Paulo at:
> > > https://www.spinics.net/lists/kvm/msg225525.html
> > 
> > Description from Oliver's patch:
> > 
> > "To date, VMMs have typically restored the guest's TSCs by value using
> > the KVM_SET_MSRS ioctl for each vCPU. However, restoring the TSCs by
> > value introduces some challenges with synchronization as the TSCs
> > continue to tick throughout the restoration process. As such, KVM has
> > some heuristics around TSC writes to infer whether or not the guest or
> > host is attempting to synchronize the TSCs."
> > 
> > Not really. The synchronization logic tries to sync TSCs during
> > BIOS boot (and CPU hotplug), because the TSC values are loaded
> > sequentially, say:
> > 
> > CPU realtimeTSC val
> > vcpu0   0 usec  0
> > vcpu1   100 usec0
> > vcpu2   200 usec0
> > ...
> > 
> > And we'd like to see all vcpus to read the same value at all times.
> > 
> > Other than that, comment makes sense. The problem with live migration
> > is as follows:
> > 
> > We'd like the TSC value to be written, ideally, just before the first
> > VM-entry a vCPU (because at the moment the TSC_OFFSET has been written, 
> > the vcpus tsc is ticking, which will cause a visible forward jump
> > in vcpus tsc time).
> > 
> > Before the first VM-entry is the farthest point in time before guest
> > entry that one could do that.
> > 
> > The window (or forward jump) between KVM_SET_TSC and VM-entry was about
> > 100ms last time i checked (which results in a 100ms time jump forward), 
> > See QEMU's 6053a86fe7bd3d5b07b49dae6c05f2cd0d44e687.
> > 
> > Have we measured any improvement with this patchset?
> 
> Its not about this window. 
> It is about time that passes between the point that we read the 
> TSC on source system (and we do it in qemu each time the VM is paused) 
> and the moment that we set the same TSC value on the target. 
> That time is unbounded.

OK. Well, it's the same problem: ideally you'd want to do that just
before VCPU-entry.

> Also this patchset should decrease TSC skew that happens
> between restoring it on multiple vCPUs as well, since 
> KVM_SET_TSC_STATE doesn't have to happen at the same time,
> as it accounts for time passed on each vCPU.
> 
> 
> Speaking of kvmclock, somewhat offtopic since this is a different issue,
> I found out that qemu reads the kvmclock value on each pause, 
> and then 'restores' on unpause, using
> KVM_SET_CLOCK (this modifies the global kvmclock offset)
> 
> This means (and I tested it) that if guest uses kvmclock
> for time reference, it will not account for time passed in
> the paused state.

Yes, this is necessary because otherwise there might be an overflow
in the kernel time accounting code (if the clock delta is too large).

> 
> > 
> > Then Paolo mentions (with >), i am replying as usual.
> > 
> > > Ok, after looking more at the code with Maxim I can confidently say that
> > > it's a total mess.  And a lot of the synchronization code is dead
> > > because 1) as far as we could see no guest synchronizes the TSC using
> > > MSR_IA32_TSC; 
> > 
> > Well, recent BIOS'es take care of synchronizing the TSC. So when Linux
> > boots, it does not have to synchronize TSC in software. 
> 
> Do you have an example of such BIOS? I tested OVMF which I compiled
> from git master a few weeks ago, and I also tested this with seabios 
> from qemu repo, and I have never seen writes to either TSC, or TSC_ADJUST
> from BIOS.

Oh, well, QEMU then.

> Or do you refer to the native BIOS on the host doing TSC synchronization?

No, virt.

> > However, upon migration (and initialization), the KVM_SET_TSC's do 
> > not happen at exactly the same time (the MSRs for each vCPU are loaded
> > in sequence). The synchronization code in kvm_set_tsc() is for those cases.
> 
> I agree with that, and this is one of the issues that KVM_SET_TSC_STATE
> is going to fix, since it accounts for it.
> 
> 
> > 
> > > and 2) writing to MSR_IA32_TSC_ADJUST does not trigger the
> > > synchronization code in kvm_write_tsc.
> > 
> > Not familiar how guests are using MSR_IA32_TSC_ADJUST (o

Re: [PATCH 0/2] RFC: Precise TSC migration

2020-12-01 Thread Marcelo Tosatti
On Tue, Dec 01, 2020 at 02:48:11PM +0100, Thomas Gleixner wrote:
> On Mon, Nov 30 2020 at 16:16, Marcelo Tosatti wrote:
> >> Besides, Linux guests don't sync the TSC via IA32_TSC write,
> >> but rather use IA32_TSC_ADJUST which currently doesn't participate
> >> in the tsc sync heruistics.
> >
> > Linux should not try to sync the TSC with IA32_TSC_ADJUST. It expects
> > the BIOS to boot with synced TSCs.
> 
> That's wishful thinking.
> 
> Reality is that BIOS tinkerers fail to get it right. TSC_ADJUST allows
> us to undo the wreckage they create.
> 
> Thanks,
> 
> tglx

I have not seen any multicore Dell/HP systems that require that.

Anyway, for QEMU/KVM it should be synced (unless there is a bug
in the sync logic in the first place).



Re: [PATCH 0/2] RFC: Precise TSC migration

2020-11-30 Thread Marcelo Tosatti
Hi Maxim,

On Mon, Nov 30, 2020 at 03:35:57PM +0200, Maxim Levitsky wrote:
> Hi!
> 
> This is the first version of the work to make TSC migration more accurate,
> as was defined by Paulo at:
> https://www.spinics.net/lists/kvm/msg225525.html

Description from Oliver's patch:

"To date, VMMs have typically restored the guest's TSCs by value using
the KVM_SET_MSRS ioctl for each vCPU. However, restoring the TSCs by
value introduces some challenges with synchronization as the TSCs
continue to tick throughout the restoration process. As such, KVM has
some heuristics around TSC writes to infer whether or not the guest or
host is attempting to synchronize the TSCs."

Not really. The synchronization logic tries to sync TSCs during
BIOS boot (and CPU hotplug), because the TSC values are loaded
sequentially, say:

CPU realtimeTSC val
vcpu0   0 usec  0
vcpu1   100 usec0
vcpu2   200 usec0
...

And we'd like all vcpus to read the same value at all times.

Other than that, the comment makes sense. The problem with live migration
is as follows:

We'd like the TSC value to be written, ideally, just before the first
VM-entry of a vCPU (because from the moment the TSC_OFFSET has been written,
the vCPU's TSC is ticking, which will cause a visible forward jump
in the vCPU's TSC time).

Before the first VM-entry is the farthest point in time before guest
entry that one could do that.

The window (or forward jump) between KVM_SET_TSC and VM-entry was about
100ms last time I checked (which results in a 100ms forward time jump).
See QEMU's 6053a86fe7bd3d5b07b49dae6c05f2cd0d44e687.
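
To put a number on it (a back-of-the-envelope sketch; the 2.5 GHz guest TSC
frequency below is only an illustrative assumption):

	/* cycles = ns * tsc_khz / 10^6 */
	u64 jump_cycles = window_ns * tsc_khz / 1000000ULL;

	/*
	 * 100ms window, 2.5 GHz guest TSC:
	 * 100,000,000 ns * 2,500,000 kHz / 1,000,000 = 250,000,000 cycles,
	 * i.e. the guest observes its TSC jump forward by ~100ms worth
	 * of cycles.
	 */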

Have we measured any improvement with this patchset?

Then Paolo mentions (with >); I am replying as usual.

> Ok, after looking more at the code with Maxim I can confidently say that
> it's a total mess.  And a lot of the synchronization code is dead
> because 1) as far as we could see no guest synchronizes the TSC using
> MSR_IA32_TSC; 

Well, recent BIOSes take care of synchronizing the TSC. So when Linux
boots, it does not have to synchronize the TSC in software.

However, upon migration (and initialization), the KVM_SET_TSC's do 
not happen at exactly the same time (the MSRs for each vCPU are loaded
in sequence). The synchronization code in kvm_set_tsc() is for those cases.
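
For reference, that heuristic (kvm_write_tsc() in arch/x86/kvm/x86.c) is
roughly the following, paraphrased (exact field names and tolerances vary
by kernel version): a TSC write that lands within about one second of
virtual cycle time of the expected value (last written value plus cycles
elapsed since then) is treated as a synchronization attempt and matched to
the existing generation.

	u64 tsc_exp = kvm->arch.last_tsc_write + nsec_to_cycles(vcpu, elapsed);
	u64 tsc_hz  = vcpu->arch.virtual_tsc_khz * 1000LL;

	/* "close enough" => treat the write as a sync attempt */
	synchronizing = data < tsc_exp + tsc_hz && data + tsc_hz > tsc_exp;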

> and 2) writing to MSR_IA32_TSC_ADJUST does not trigger the
> synchronization code in kvm_write_tsc.

I'm not familiar with how guests (or Linux) are using MSR_IA32_TSC_ADJUST...
Let's see:


/*
 * Freshly booted CPUs call into this:
 */
void check_tsc_sync_target(void)
{
struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
unsigned int cpu = smp_processor_id();
cycles_t cur_max_warp, gbl_max_warp;
int cpus = 2;

/* Also aborts if there is no TSC. */
if (unsynchronized_tsc())
return;

/*
 * Store, verify and sanitize the TSC adjust register. If
 * successful skip the test.
 *
 * The test is also skipped when the TSC is marked reliable. This
 * is true for SoCs which have no fallback clocksource. On these
 * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
 * register might have been wreckaged by the BIOS..
 */
if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable) {
atomic_inc(&skip_test);
return;
}

retry:

I'd force that synchronization path to be taken as a test-case.


> I have a few thoughts about the kvm masterclock synchronization,
> which relate to the Paulo's proposal that I implemented.
> 
> The idea of masterclock is that when the host TSC is synchronized
> (or as kernel call it, stable), and the guest TSC is synchronized as well,
> then we can base the kvmclock, on the same pair of
> (host time in nsec, host tsc value), for all vCPUs.

We _have_ to base it on that. See the comment which starts with

"Assuming a stable TSC across physical CPUS, and a stable TSC"

at x86.c.

> 
> This makes the random error in calculation of this value invariant
> across vCPUS, and allows the guest to do kvmclock calculation in userspace
> (vDSO) since kvmclock parameters are vCPU invariant.

Actually, without synchronized host TSCs (and the masterclock scheme,
with a single base read from a vCPU), kvmclock in kernel is buggy as
well:

u64 pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
unsigned version;
u64 ret;
u64 last;
u8 flags;

do {
version = pvclock_read_begin(src);
ret = __pvclock_read_cycles(src, rdtsc_ordered());
flags = src->flags;
} while (pvclock_read_retry(src, version));

if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
src->flags &= ~PVCLOCK_GUEST_STOPPED;
pvclock_touch_watchdogs();
}

if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
(flags & 

Re: [PATCH] cpuidle: Allow configuration of the polling interval before cpuidle enters a c-state

2020-11-27 Thread Marcelo Tosatti
On Thu, Nov 26, 2020 at 07:24:41PM +0100, Rafael J. Wysocki wrote:
> On Thu, Nov 26, 2020 at 6:25 PM Mel Gorman  
> wrote:
> >
> > It was noted that a few workloads that idle rapidly regressed when commit
> > 36fcb4292473 ("cpuidle: use first valid target residency as poll time")
> > was merged. The workloads in question were heavy communicators that idle
> > rapidly and were impacted by the c-state exit latency as the active CPUs
> > were not polling at the time of wakeup. As they were not particularly
> > realistic workloads, it was not considered to be a major problem.
> >
> > Unfortunately, a bug was then reported for a real workload in a production
> > environment that relied on large numbers of threads operating in a worker
> > pool pattern. These threads would idle for periods of time slightly
> > longer than the C1 exit latency and so incurred the c-state exit latency.
> > The application is sensitive to wakeup latency and appears to indirectly
> > rely on behaviour prior to commit on a37b969a61c1 ("cpuidle: poll_state:
> > Add time limit to poll_idle()") to poll for long enough to avoid the exit
> > latency cost.
> 
> Well, this means that it depends on the governor to mispredict short
> idle durations (so it selects "poll" over "C1" when it should select
> "C1" often enough) and on the lack of a polling limit (or a large
> enough one).
> 
> While the latter can be kind of addressed by increasing the polling
> limit, the misprediction in the governor really isn't guaranteed to
> happen and it really is necessary to have a PM QoS request in place to
> ensure a suitable latency.
> 
> > The current behaviour favours power consumption over wakeup latency
> > and it is reasonable behaviour but it should be tunable.
> 
> Only if there is no way to cover all of the relevant use cases in a
> generally acceptable way without adding more module params etc.
> 
> In this particular case, it should be possible to determine a polling
> limit acceptable to everyone.
> 
> BTW, I admit that using the exit latency of the lowest enabled C-state
> was kind of arbitrary and it was based on the assumption that it would
> make more sense to try to enter C1 instead of polling for that much
> time, but C1 is an exception, because it is often artificially made
> particularly attractive to the governors (by reducing its target
> residency as much as possible).  Also making the polling limit that
> short distorts the governor statistics somewhat.
> 
> So the polling limit equal to the target residency of C1 really may be
> overly aggressive and something tick-based may work better in general
> (e.g. 1/8 or 1/16 of the tick period).
> 
> In principle, a multiple of C1 target residency could be used too.
> 
> > In theory applications could use /dev/cpu_dma_latency but not all 
> > applications
> > are aware of cpu_dma_latency. Similarly, a tool could be installed
> > that opens cpu_dma_latency for the whole system but such a tool is not
> > always available, is not always known to the sysadmin or the tool can have
> > unexpected side-effects if it tunes more than cpu_dma_latency. In practice,
> > it is more common for sysadmins to try idle=poll (which is x86 specific)
> 
> And really should be avoided if one cares about turbo or wants to
> avoid thermal issues.
> 
> > or try disabling c-states and hope for the best.
> >
> > This patch makes it straight-forward to configure how long a CPU should
> > poll before entering a c-state.
> 
> Well, IMV this is not straightforward at all.
> 
> It requires the admin to know how cpuidle works and why this
> particular polling limit is likely to be suitable for the given
> workload.  And whether or not the default polling limit should be
> changed at all.

KVM polling (virt/kvm/kvm_main.c grow_halt_poll_ns/shrink_halt_poll_ns)
tries to adjust the polling window based on poll success/failure. 

The cpuidle haltpoll governor (for KVM guests) uses the same adjustment
logic.

Perhaps a similar (or improved) scheme can be adapted to baremetal.

https://www.kernel.org/doc/Documentation/virtual/kvm/halt-polling.txt
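
(For reference, the adjustment scheme is essentially multiplicative grow /
divisive shrink of a per-vCPU polling window; a simplified sketch of the
idea, not the exact kvm_main.c code, with illustrative parameter values:)

	static unsigned int poll_ns;			/* current poll window */
	static const unsigned int grow = 2;		/* cf. halt_poll_ns_grow */
	static const unsigned int shrink = 2;		/* cf. halt_poll_ns_shrink */
	static const unsigned int poll_ns_start = 10000;
	static const unsigned int poll_ns_max = 200000;	/* cf. halt_poll_ns */

	static void adjust_poll_window(bool woken_while_polling)
	{
		if (woken_while_polling)
			/* Polling paid off: grow the window (capped). */
			poll_ns = poll_ns ? min(poll_ns * grow, poll_ns_max)
					  : poll_ns_start;
		else
			/* Had to halt anyway: polling was wasted, shrink. */
			poll_ns = shrink ? poll_ns / shrink : 0;
	}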
> 
> Honestly, nobody knows that in advance (with all due respect) and this
> would cause people to try various settings at random and stick to the
> one that they feel works best for them without much understanding.
> 
> > By default, there is no behaviour change.
> > At build time a decision can be made to favour performance over power
> > by default even if that potentially impacts turbo boosting for workloads
> > that are sensitive to wakeup latency. In the event the kernel default is
> > not suitable, the kernel command line can be used as a substitute for
> > implementing cpu_dma_latency support in an application or requiring an
> > additional tool to be installed.
> >
> > Note that it is not expected that tuning for longer polling times will be a
> > universal win. For example, extra polling might prevent a turbo state being
> > used or prevent hyperthread resources being 

Re: [PATCH] iio: adc: ad7292: remove unneeded spi_set_drvdata()

2020-11-20 Thread Marcelo Schmitt
LGTM.
Tested on the Raspberry Pi kernel - rpi-5.9.y.

ad7292 was heavily based on ad7768-1.
This might apply to ad7768-1 as well.

Reviewed-by: Marcelo Schmitt 
Tested-by: Marcelo Schmitt 

On 11/19, Alexandru Ardelean wrote:
> This seems to have been copied from a driver that calls spi_set_drvdata()
> but doesn't call spi_get_drvdata().
> Setting a private object on the SPI device's object isn't necessary if it
> won't be accessed.
> This change removes the spi_set_drvdata() call.
> 
> Signed-off-by: Alexandru Ardelean 
> ---
>  drivers/iio/adc/ad7292.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/drivers/iio/adc/ad7292.c b/drivers/iio/adc/ad7292.c
> index ab204e9199e9..70e33dd1c9f7 100644
> --- a/drivers/iio/adc/ad7292.c
> +++ b/drivers/iio/adc/ad7292.c
> @@ -276,8 +276,6 @@ static int ad7292_probe(struct spi_device *spi)
>   return -EINVAL;
>   }
>  
> - spi_set_drvdata(spi, indio_dev);
> -
>   st->reg = devm_regulator_get_optional(&spi->dev, "vref");
>   if (!IS_ERR(st->reg)) {
>   ret = regulator_enable(st->reg);
> -- 
> 2.17.1
> 


Re: [PATCH] sctp: Fix sending when PMTU is less than SCTP_DEFAULT_MINSEGMENT

2020-11-06 Thread Marcelo Ricardo Leitner
On Fri, Nov 06, 2020 at 10:48:24AM +0100, Petr Malat wrote:
> On Fri, Nov 06, 2020 at 05:46:34AM -0300, Marcelo Ricardo Leitner wrote:
> > On Thu, Nov 05, 2020 at 11:39:47AM +0100, Petr Malat wrote:
> > > Function sctp_dst_mtu() never returns lower MTU than
> > > SCTP_TRUNC4(SCTP_DEFAULT_MINSEGMENT) even when the actual MTU is less,
> > > in which case we rely on the IP fragmentation and must enable it.
> > 
> > This should be being handled at sctp_packet_will_fit():
> 
> sctp_packet_will_fit() does something a little bit different, it
> allows fragmentation, if the packet must be longer than the pathmtu
> set in SCTP structures, which is never less than 512 (see
> sctp_dst_mtu()) even when the actual mtu is less than 512.
> 
> One can test it by setting mtu of an interface to e.g. 300,
> and sending a longer packet (e.g. 400B):
> >   psize = packet->size;
> >   if (packet->transport->asoc)
> >   pmtu = packet->transport->asoc->pathmtu;
> >   else
> >   pmtu = packet->transport->pathmtu;
> here the returned pmtu will be 512

Thing is, your patch is using the same vars to check for it:
+   pmtu = tp->asoc ? tp->asoc->pathmtu : tp->pathmtu;

> 
> > 
> >   /* Decide if we need to fragment or resubmit later. */
> >   if (psize + chunk_len > pmtu) {
> This branch will not be taken as the packet length is less then 512

Right, ok. Your patch will then catch it, because pmtu will be
SCTP_DEFAULT_MINSEGMENT and it is checking with '<='.

> 
> >}
> > 
> And the whole function will return SCTP_XMIT_OK without setting
> ipfragok.
> 
> I think the idea of never going bellow 512 in sctp_dst_mtu() is to
> reduce overhead of SCTP headers, which is fine, but when we do that,
> we must be sure to allow the IP fragmentation, which is currently
> missing.

Hmm. IP fragmentation is probably just worse than the higher header/payload
overhead.

> 
> The other option would be to keep track of the real MTU in pathmtu
> and perform max(512, pathmtu) in sctp_packet_will_fit() function.

I need to check where this 512 came from. I don't recall it off the top
of my head and it's from before git history. Maybe we should just drop
this limit, if it's artificial. IPV4_MIN_MTU is 68.
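
For the record, the alternative Petr mentions above (keep the real PMTU in
transport->pathmtu and only apply the 512-byte floor when sizing packets)
would look roughly like this in sctp_packet_will_fit() (an untested sketch,
not a submitted patch; it assumes pathmtu holds the real PMTU):

	if (packet->transport->asoc)
		pmtu = packet->transport->asoc->pathmtu;
	else
		pmtu = packet->transport->pathmtu;

	if (pmtu < SCTP_DEFAULT_MINSEGMENT) {
		/* Real PMTU below the floor: rely on IP fragmentation. */
		packet->ipfragok = 1;
		pmtu = SCTP_DEFAULT_MINSEGMENT;
	}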

> 
> Not sure when exactly this got broken, but using MTU less than 512
> used to work in 4.9.

Uhh, that's a bit old already. If you could narrow it down, that would
be nice.

  Marcelo


Re: [PATCH] sctp: Fix sending when PMTU is less than SCTP_DEFAULT_MINSEGMENT

2020-11-06 Thread Marcelo Ricardo Leitner
On Thu, Nov 05, 2020 at 11:39:47AM +0100, Petr Malat wrote:
> Function sctp_dst_mtu() never returns lower MTU than
> SCTP_TRUNC4(SCTP_DEFAULT_MINSEGMENT) even when the actual MTU is less,
> in which case we rely on the IP fragmentation and must enable it.

This should be being handled at sctp_packet_will_fit():

  psize = packet->size;
  if (packet->transport->asoc)
  pmtu = packet->transport->asoc->pathmtu;
  else
  pmtu = packet->transport->pathmtu;

  /* Decide if we need to fragment or resubmit later. */
  if (psize + chunk_len > pmtu) {
  /* It's OK to fragment at IP level if any one of the following
   * is true:
   *  1. The packet is empty (meaning this chunk is greater
   * the MTU)
   *  2. The packet doesn't have any data in it yet and data
   * requires authentication.
   */
  if (sctp_packet_empty(packet) ||
  (!packet->has_data && chunk->auth)) {
  /* We no longer do re-fragmentation.
   * Just fragment at the IP layer, if we
   * actually hit this condition
   */
  packet->ipfragok = 1;
  goto out;
  }

Why doesn't the above already handle it?



Re: [PATCH v36 00/13] /dev/random - a new approach

2020-11-04 Thread Marcelo Henrique Cerri
On Mon, Nov 02, 2020 at 02:44:35PM +0100, Torsten Duwe wrote:
> On Wed, 28 Oct 2020 19:07:28 +0100
> Greg Kroah-Hartman  wrote:
> 
> > On Wed, Oct 28, 2020 at 06:51:17PM +0100, Torsten Duwe wrote:
> > > On Mon, 19 Oct 2020 21:28:50 +0200
> > > Stephan Müller  wrote:
> > > [...]
> > > > * Sole use of crypto for data processing:
> > > [...]
> > > >  - The LRNG uses only properly defined and implemented
> > > > cryptographic algorithms unlike the use of the SHA-1
> > > > transformation in the existing /dev/random implementation.
> > > > 
> > > >  - Hash operations use NUMA-node-local hash instances to benefit
> > > > large parallel systems.
> > > > 
> > > >  - LRNG uses limited number of data post-processing steps
> > > [...]
> > > > * Performance
> > > > 
> > > >  - Faster by up to 75% in the critical code path of the interrupt
> > > > handler depending on data collection size configurable at kernel
> > > > compile time - the default is about equal in performance with
> > > > existing /dev/random as outlined in [2] section 4.2.
> > > 
> > > [...]
> > > >  - ChaCha20 DRNG is significantly faster as implemented in the
> > > > existing /dev/random as demonstrated with [2] table 2.
> > > > 
> > > >  - Faster entropy collection during boot time to reach fully
> > > > seeded level, including on virtual systems or systems with SSDs as
> > > > outlined in [2] section 4.1.
> > > > 
> > > > * Testing
> > > [...]
> > > 
> > > So we now have 2 proposals for a state-of-the-art RNG, and over a
> > > month without a single comment on-topic from any `get_maintainer.pl`
> > > 
> > > I don't want to emphasise the certification aspects so much. The
> > > interrelation is rather that those certifications require certain
> > > code features, features which are reasonable per se. But the
> > > current code is lagging way behind.
> > > 
> > > I see the focus namely on performance, scalability, testability and
> > > virtualisation. And it certainly is an advantage to use the code
> > > already present under crypto, with its optimisations, and not rely
> > > on some home brew.
> > > 
> > > Can we please have a discussion about how to proceed?
> > > Ted, Greg, Arnd: which approach would you prefer?
> > 
> > Greg and Arnd are not the random driver maintainers, as is now
> > correctly shown in the 5.10-rc1 MAINTAINERS file, so I doubt we (well
> > at least I) have any say here, sorry.
> 
> No problem. get_maintainer (for the proposals) works on paths, not on
> topics and I didn't want to leave anybody out.
> 
> Ted, if you don't have the time any more to take care of /dev/random,
> it's not a shame to hand over maintainership, especially given your
> long history of Linux contributions.
> 
> Please do seriously consider to hand it over to someone new. This would
> be a good opportunity.
> 
>   Torsten
>

I'd like to help with any solution upstream decides to follow, either with
testing or with code. I understand some of the concerns the community
has regarding FIPS, but that doesn't make it less relevant, and it's
totally possible to improve /dev/random while allowing its users to
decide if they want to comply with SP 800-90B. I believe the main
blocker now is the lack of direction.

-- 
Regards,
Marcelo





Re: [PATCH] sctp: Fix COMM_LOST/CANT_STR_ASSOC err reporting on big-endian platforms

2020-11-02 Thread Marcelo Ricardo Leitner
On Fri, Oct 30, 2020 at 02:26:33PM +0100, Petr Malat wrote:
> Commit 978aa0474115 ("sctp: fix some type cast warnings introduced since
> very beginning")' broke err reading from sctp_arg, because it reads the
> value as 32-bit integer, although the value is stored as 16-bit integer.
> Later this value is passed to the userspace in 16-bit variable, thus the
> user always gets 0 on big-endian platforms. Fix it by reading the __u16
> field of sctp_arg union, as reading err field would produce a sparse
> warning.

Makes sense.

> 
> Signed-off-by: Petr Malat 

Then, it also needs:
Fixes: 978aa0474115 ("sctp: fix some type cast warnings introduced since very beginning")

Acked-by: Marcelo Ricardo Leitner 
(If the maintainers can't add the Fixes tag above, please keep the ack
on the v2)

Thanks.


Re: [PATCH v4 4/4] PCI: Limit pci_alloc_irq_vectors() to housekeeping CPUs

2020-10-27 Thread Marcelo Tosatti
On Mon, Oct 26, 2020 at 06:22:29PM -0400, Nitesh Narayan Lal wrote:
> 
> On 10/26/20 5:50 PM, Thomas Gleixner wrote:
> > On Mon, Oct 26 2020 at 14:11, Jacob Keller wrote:
> >> On 10/26/2020 1:11 PM, Thomas Gleixner wrote:
> >>> On Mon, Oct 26 2020 at 12:21, Jacob Keller wrote:
> >>>> Are there drivers which use more than one interrupt per queue? I know
> >>>> drivers have multiple management interrupts.. and I guess some drivers
> >>>> do combined 1 interrupt per pair of Tx/Rx..  It's also plausible to to
> >>>> have multiple queues for one interrupt .. I'm not sure how a single
> >>>> queue with multiple interrupts would work though.
> >>> For block there is always one interrupt per queue. Some Network drivers
> >>> seem to have seperate RX and TX interrupts per queue.
> >> That's true when thinking of Tx and Rx as a single queue. Another way to
> >> think about it is "one rx queue" and "one tx queue" each with their own
> >> interrupt...
> >>
> >> Even if there are devices which force there to be exactly queue pairs,
> >> you could still think of them as separate entities?
> > Interesting thought.
> >
> > But as Jakub explained networking queues are fundamentally different
> > from block queues on the RX side. For block the request issued on queue
> > X will raise the complete interrupt on queue X.
> >
> > For networking the TX side will raise the TX interrupt on the queue on
> > which the packet was queued obviously or should I say hopefully. :)
> 
> This is my impression as well.
> 
> > But incoming packets will be directed to some receive queue based on a
> > hash or whatever crystallball logic the firmware decided to implement.
> >
> > Which makes this not really suitable for the managed interrupt and
> > spreading approach which is used by block-mq. Hrm...
> >
> > But I still think that for curing that isolation stuff we want at least
> > some information from the driver. Alternative solution would be to grant
> > the allocation of interrupts and queues and have some sysfs knob to shut
> > down queues at runtime. If that shutdown results in releasing the queue
> > interrupt (via free_irq()) then the vector exhaustion problem goes away.
> 
> I think this is close to what I and Marcelo were discussing earlier today
> privately.
> 
> I don't think there is currently a way to control the enablement/disablement 
> of
> interrupts from the userspace.

As long as the interrupt obeys the "trigger when a request has been
performed by the local CPU" rule (#1) (for MSI-type interrupts, where the
driver allocates one I/O interrupt per CPU), I don't see a need for the
interface.

For other types of interrupts, interrupt controller should be programmed
to not include the isolated CPU on its "destination CPU list".

About the block vs. network discussion, what we are trying to do at the skb
level (Paolo Abeni CC'ed, author of the suggestion) is to use RPS to
avoid skbs being queued to an isolated CPU (on RX), and to queue skbs
on housekeeping CPUs for processing (TX).

However, if per-CPU interrupts are not disabled, then the (for example)
network device is free to include the CPU in its list of destinations.
Which would require one to say, configure RPS (or whatever mechanism
is distributing interrupts).

Hmm, it would feel safer (rather than trusting the #1 rule to be valid
in all cases) to ask the driver to disable the interrupt (after shutting
down the queue) for that particular CPU.

BTW, Thomas, software is free to configure a particular MSI-X interrupt
to point to any CPU:

10.11 MESSAGE SIGNALLED INTERRUPTS
The PCI Local Bus Specification, Rev 2.2 (www.pcisig.com) introduces the 
concept of message signalled interrupts.
As the specification indicates:
“Message signalled interrupts (MSI) is an optional feature that enables PCI 
devices to request
service by writing a system-specified message to a system-specified address 
(PCI DWORD memory
write transaction). The transaction address specifies the message destination 
while the transaction
data specifies the message. System software is expected to initialize the 
message destination and
message during device configuration, allocating one or more non-shared messages 
to each MSI
capable function.”

Fields in the Message Address Register are as follows:
1. Bits 31-20 — These bits contain a fixed value for interrupt messages 
(0FEEH). This value locates interrupts at
the 1-MByte area with a base address of 4G – 18M. All accesses to this region 
are directed as interrupt
messages. Care must to be taken to ensure that no other device claims the 
region as I/O space.
2. Destination ID — This field contains an 8-bit destination 
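
In code terms, the destination encoding quoted above is simply part of the
MSI address the OS programs into the device; a simplified sketch using the
classic x86 MSI address layout (constant names are illustrative):

	#define MSI_ADDR_BASE		0xfee00000u	/* bits 31-20 = 0FEEh */
	#define MSI_ADDR_DEST_ID_SHIFT	12		/* bits 19-12 = destination APIC ID */

	static inline u32 msi_addr_for_cpu(u8 apic_id)
	{
		return MSI_ADDR_BASE | ((u32)apic_id << MSI_ADDR_DEST_ID_SHIFT);
	}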

Re: [PATCH v4 4/4] PCI: Limit pci_alloc_irq_vectors() to housekeeping CPUs

2020-10-27 Thread Marcelo Tosatti
On Mon, Oct 26, 2020 at 08:00:39PM +0100, Thomas Gleixner wrote:
> On Mon, Oct 26 2020 at 14:30, Marcelo Tosatti wrote:
> > On Fri, Oct 23, 2020 at 11:00:52PM +0200, Thomas Gleixner wrote:
> >> So without information from the driver which tells what the best number
> >> of interrupts is with a reduced number of CPUs, this cutoff will cause
> >> more problems than it solves. Regressions guaranteed.
> >
> > One might want to move from one interrupt per isolated app core
> > to zero, or vice versa. It seems that "best number of interrupts 
> > is with reduced number of CPUs" information, is therefore in userspace, 
> > not in driver...
> 
> How does userspace know about the driver internals? Number of management
> interrupts, optimal number of interrupts per queue?
> 
> >> Managed interrupts base their interrupt allocation and spreading on
> >> information which is handed in by the individual driver and not on crude
> >> assumptions. They are not imposing restrictions on the use case.
> >> 
> >> It's perfectly fine for isolated work to save a data set to disk after
> >> computation has finished and that just works with the per-cpu I/O queue
> >> which is otherwise completely silent. 
> >
> > Userspace could only change the mask of interrupts which are not 
> > triggered by requests from the local CPU (admin, error, mgmt, etc),
> > to avoid the vector exhaustion problem.
> >
> > However, there is no explicit way for userspace to know that, as far as
> > i know.
> >
> >  130:  34845  0  0  0  0  0  0  0  IR-PCI-MSI 33554433-edge  nvme0q1
> >  131:  0  27062  0  0  0  0  0  0  IR-PCI-MSI 33554434-edge  nvme0q2
> >  132:  0  0  24393  0  0  0  0  0  IR-PCI-MSI 33554435-edge  nvme0q3
> >  133:  0  0  0  24313  0  0  0  0  IR-PCI-MSI 33554436-edge  nvme0q4
> >  134:  0  0  0  0  20608  0  0  0  IR-PCI-MSI 33554437-edge  nvme0q5
> >  135:  0  0  0  0  0  22163  0  0  IR-PCI-MSI 33554438-edge  nvme0q6
> >  136:  0  0  0  0  0  0  23020  0  IR-PCI-MSI 33554439-edge  nvme0q7
> >  137:  0  0  0  0  0  0  0  24285  IR-PCI-MSI 33554440-edge  nvme0q8
> >
> > Can that be retrieved from PCI-MSI information, or drivers
> > have to inform this?
> 
> The driver should use a different name for the admin queues.

Works for me.

Sounds more like a heuristic which can break, so documenting this 
as an "interface" seems appropriate.



Re: [PATCH v4 4/4] PCI: Limit pci_alloc_irq_vectors() to housekeeping CPUs

2020-10-26 Thread Marcelo Tosatti
On Fri, Oct 23, 2020 at 11:00:52PM +0200, Thomas Gleixner wrote:
> On Fri, Oct 23 2020 at 09:10, Nitesh Narayan Lal wrote:
> > On 10/23/20 4:58 AM, Peter Zijlstra wrote:
> >> On Thu, Oct 22, 2020 at 01:47:14PM -0400, Nitesh Narayan Lal wrote:
> >> So shouldn't we then fix the drivers / interface first, to get rid of
> >> this inconsistency?
> >>
> > Considering we agree that excess vector is a problem that needs to be
> > solved across all the drivers and that you are comfortable with the other
> > three patches in the set. If I may suggest the following:
> >
> > - We can pick those three patches for now, as that will atleast fix a
> >   driver that is currently impacting RT workloads. Is that a fair
> >   expectation?
> 
> No. Blindly reducing the maximum vectors to the number of housekeeping
> CPUs is patently wrong. The PCI core _cannot_ just nilly willy decide
> what the right number of interrupts for this situation is.
> 
> Many of these drivers need more than queue interrupts, admin, error
> interrupt and some operate best with seperate RX/TX interrupts per
> queue. They all can "work" with a single PCI interrupt of course, but
> the price you pay is performance.
> 
> An isolated setup, which I'm familiar with, has two housekeeping
> CPUs. So far I restricted the number of network queues with a module
> argument to two, which allocates two management interrupts for the
> device and two interrupts (RX/TX) per queue, i.e. a total of six.
> 
> Now I reduced the number of available interrupts to two according to
> your hack, which makes it use one queue RX/TX combined and one
> management interrupt. Guess what happens? Network performance tanks to
> the points that it breaks a carefully crafted setup.
> 
> The same applies to a device which is application specific and wants one
> channel including an interrupt per isolated application core. Today I
> can isolate 8 out of 12 CPUs and let the device create 8 channels and
> set one interrupt and channel affine to each isolated CPU. With your
> hack, I get only 4 interrupts and channels. Fail!

Good point.

> You cannot declare that all this is perfectly fine, just because it does
> not matter for your particular use case.
> 
> So without information from the driver which tells what the best number
> of interrupts is with a reduced number of CPUs, this cutoff will cause
> more problems than it solves. Regressions guaranteed.

One might want to move from one interrupt per isolated app core
to zero, or vice versa. It seems that the "best number of interrupts
with a reduced number of CPUs" information is therefore in userspace,
not in the driver...

No?

> Managed interrupts base their interrupt allocation and spreading on
> information which is handed in by the individual driver and not on crude
> assumptions. They are not imposing restrictions on the use case.
> 
> It's perfectly fine for isolated work to save a data set to disk after
> computation has finished and that just works with the per-cpu I/O queue
> which is otherwise completely silent. 

Userspace could only change the mask of interrupts which are not 
triggered by requests from the local CPU (admin, error, mgmt, etc),
to avoid the vector exhaustion problem.

However, there is no explicit way for userspace to know that, as far as
I know.

 130:  34845  0  0  0  0  0  0  0  IR-PCI-MSI 33554433-edge  nvme0q1
 131:  0  27062  0  0  0  0  0  0  IR-PCI-MSI 33554434-edge  nvme0q2
 132:  0  0  24393  0  0  0  0  0  IR-PCI-MSI 33554435-edge  nvme0q3
 133:  0  0  0  24313  0  0  0  0  IR-PCI-MSI 33554436-edge  nvme0q4
 134:  0  0  0  0  20608  0  0  0  IR-PCI-MSI 33554437-edge  nvme0q5
 135:  0  0  0  0  0  22163  0  0  IR-PCI-MSI 33554438-edge  nvme0q6
 136:  0  0  0  0  0  0  23020  0  IR-PCI-MSI 33554439-edge  nvme0q7
 137:  0  0  0  0  0  0  0  24285  IR-PCI-MSI 33554440-edge  nvme0q8


Can that be retrieved from PCI-MSI information, or do drivers
have to provide it?

> All isolated workers can do the
> same in parallel without trampling on each other toes by competing for a
> reduced number of queues which are affine to the housekeeper CPUs.
> 
> Unfortunately network multi-queue is substantially different from block
> multi-queue (as I learned in this conversation), so the concept cannot
> be applied one-to-one to networking as is. But there are certainly part
> of it which can be reused.
> 
> This needs a lot more thought than just these crude hacks.
> 
> Especially under the aspect that there 

Re: [PATCH v4 4/4] PCI: Limit pci_alloc_irq_vectors() to housekeeping CPUs

2020-10-22 Thread Marcelo Tosatti
On Wed, Oct 21, 2020 at 10:25:48PM +0200, Thomas Gleixner wrote:
> On Tue, Oct 20 2020 at 20:07, Thomas Gleixner wrote:
> > On Tue, Oct 20 2020 at 12:18, Nitesh Narayan Lal wrote:
> >> However, IMHO we would still need a logic to prevent the devices from
> >> creating excess vectors.
> >
> > Managed interrupts are preventing exactly that by pinning the interrupts
> > and queues to one or a set of CPUs, which prevents vector exhaustion on
> > CPU hotplug.
> >
> > Non-managed, yes that is and always was a problem. One of the reasons
> > why managed interrupts exist.
> 
> But why is this only a problem for isolation? The very same problem
> exists vs. CPU hotplug and therefore hibernation.
> 
> On x86 we have at max. 204 vectors available for device interrupts per
> CPU. So assumed the only device interrupt in use is networking then any
> machine which has more than 204 network interrupts (queues, aux ...)
> active will prevent the machine from hibernation.
> 
> Aside of that it's silly to have multiple queues targeted at a single
> CPU in case of hotplug. And that's not a theoretical problem.  Some
> power management schemes shut down sockets when the utilization of a
> system is low enough, e.g. outside of working hours.

Exactly. It seems the proper way to handle this is to disable
individual vectors rather than moving them. And that is needed for
dynamic isolate / unisolate anyway...

> The whole point of multi-queue is to have locality so that traffic from
> a CPU goes through the CPU local queue. What's the point of having two
> or more queues on a CPU in case of hotplug?
> 
> The right answer to this is to utilize managed interrupts and have
> according logic in your network driver to handle CPU hotplug. When a CPU
> goes down, then the queue which is associated to that CPU is quiesced
> and the interrupt core shuts down the relevant interrupt instead of
> moving it to an online CPU (which causes the whole vector exhaustion
> problem on x86). When the CPU comes online again, then the interrupt is
> reenabled in the core and the driver reactivates the queue.

Aha... But it would be necessary to do that from userspace (for runtime
isolate/unisolate).



Re: [patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-20 Thread Marcelo Tosatti
On Thu, Oct 15, 2020 at 01:40:53AM +0200, Frederic Weisbecker wrote:
> On Wed, Oct 14, 2020 at 10:33:21AM +0200, Peter Zijlstra wrote:
> > On Tue, Oct 13, 2020 at 02:13:28PM -0300, Marcelo Tosatti wrote:
> > 
> > > > Yes but if the task isn't running, run_posix_cpu_timers() doesn't have
> > > > anything to elapse. So indeed we can spare the IPI if the task is not
> > > > running. Provided ordering makes sure that the task sees the new 
> > > > dependency
> > > > when it schedules in of course.
> > > 
> > > True.
> > > 
> > >  * p->on_cpu <- { 0, 1 }:
> > >  *
> > >  *   is set by prepare_task() and cleared by finish_task() such that it 
> > > will be
> > >  *   set before p is scheduled-in and cleared after p is scheduled-out, 
> > > both
> > >  *   under rq->lock. Non-zero indicates the task is running on its CPU.
> > > 
> > > 
> > > CPU-0 (tick_set_dep)CPU-1 (task switch)
> > > 
> > > STORE p->tick_dep_mask
> > > smp_mb() (atomic_fetch_or())
> > > LOAD p->on_cpu
> > > 
> > > 
> > > context_switch(prev, next)
> > > STORE next->on_cpu = 1
> > > ... [*]
> > > 
> > > LOAD current->tick_dep_mask
> > > 
> > 
> > That load is in tick_nohz_task_switch() right? (which BTW is placed
> > completely wrong) You could easily do something like the below I
> > suppose.
> > 
> > diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> > index 81632cd5e3b7..2a5fafe66bb0 100644
> > --- a/kernel/time/tick-sched.c
> > +++ b/kernel/time/tick-sched.c
> > @@ -410,6 +410,14 @@ void __tick_nohz_task_switch(void)
> > ts = this_cpu_ptr(&tick_cpu_sched);
> >  
> > if (ts->tick_stopped) {
> > +   /*
> > +* tick_set_dep()   (this)
> > +*
> > +* STORE p->tick_dep_mask   STORE p->on_cpu
> > +* smp_mb() smp_mb()
> > +* LOAD p->on_cpu   LOAD p->tick_dep_mask
> > +*/
> > +   smp_mb();
> > if (atomic_read(&current->tick_dep_mask) ||
> > atomic_read(&current->signal->tick_dep_mask))
> > tick_nohz_full_kick();
> 
> It would then need to be unconditional (whatever value of ts->tick_stopped).
> Assuming the tick isn't stopped, we may well have an interrupt firing right
> after schedule() which doesn't see the new value of tick_dep_map.
> 
> Alternatively, we could rely on p->on_rq which is set to TASK_ON_RQ_QUEUED
> at wake up time, prior to the schedule() full barrier. Of course that doesn't
> mean that the task is actually the one running on the CPU but it's a good 
> sign,
> considering that we are running in nohz_full mode and it's usually optimized
> for single task mode.

Unfortunately that would require locklessly reading p->on_rq, which is
internal to the scheduler.

(I can surely do that if you prefer!)

> 
> Also setting a remote task's tick dependency is only used by posix cpu timer
> in case the user has the bad taste to enqueue on a task running in nohz_full
> mode. It shouldn't deserve an unconditional full barrier in the schedule path.
> 
> If the target is current, as is used by RCU, I guess we can keep a special
> treatment.

To answer PeterZ's original question:

"So we need to kick the CPU unconditionally, or only when the task is
actually running? AFAICT we only care about current->tick_dep_mask."

If there is a task sharing signals executing on a remote CPU, then yes,
that remote CPU should be woken up.

We could skip the IPI if the remote process is not running; however, for
the purposes of low-latency isolated processes, this optimization is
not necessary.
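
(A sketch of that optimization, not part of the posted series: bail out of
tick_nohz_kick_task() when the task is not currently running, relying on
the tick_dep_mask vs. on_cpu ordering discussed earlier in this thread.)

	static void tick_nohz_kick_task(struct task_struct *tsk)
	{
		int cpu;

		/*
		 * Sketch only: a task that is not running will observe the
		 * new tick_dep_mask when it schedules in, so the IPI can be
		 * skipped.
		 */
		if (!READ_ONCE(tsk->on_cpu))
			return;

		cpu = task_cpu(tsk);
		preempt_disable();
		if (cpu_online(cpu))
			tick_nohz_full_kick_cpu(cpu);
		preempt_enable();
	}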

So the last posted patchset is good enough for isolated low latency processes.

Do you guys want me to do something or can you take the series as it is?

> > re tick_nohz_task_switch() being placed wrong, it should probably be
> > placed before finish_lock_switch(). Something like so.
> > 
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index cf044580683c..5c92c959824f 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -4084,6 +4084,7 @@ static struct rq *finish_task_switch(struct 
> > task_struct *prev)
> > vtime_task_switch(prev);
> > perf_event_task_sched_in(prev, current);
> > finish_task(prev);
> > +   tick_nohz_task_switch();
> > finish_lock_switch(rq);
> > finish_arch_post_lock_switch();
> > kcov_finish_switch(current);
> > @@ -4121,7 +4122,6 @@ static struct rq *finish_task_switch(struct 
> > task_struct *prev)
> > put_task_struct_rcu_user(prev);
> > }
> >  
> > -   tick_nohz_task_switch();
> 
> IIRC, we wanted to keep it outside rq lock because it shouldn't it...



Re: [PATCH v4 4/4] PCI: Limit pci_alloc_irq_vectors() to housekeeping CPUs

2020-10-19 Thread Marcelo Tosatti
On Mon, Oct 19, 2020 at 01:11:37PM +0200, Peter Zijlstra wrote:
> On Sun, Oct 18, 2020 at 02:14:46PM -0400, Nitesh Narayan Lal wrote:
> > >> +hk_cpus = housekeeping_num_online_cpus(HK_FLAG_MANAGED_IRQ);
> > >> +
> > >> +/*
> > >> + * If we have isolated CPUs for use by real-time tasks, to keep 
> > >> the
> > >> + * latency overhead to a minimum, device-specific IRQ vectors 
> > >> are moved
> > >> + * to the housekeeping CPUs from the userspace by changing their
> > >> + * affinity mask. Limit the vector usage to keep housekeeping 
> > >> CPUs from
> > >> + * running out of IRQ vectors.
> > >> + */
> > >> +if (hk_cpus < num_online_cpus()) {
> > >> +if (hk_cpus < min_vecs)
> > >> +max_vecs = min_vecs;
> > >> +else if (hk_cpus < max_vecs)
> > >> +max_vecs = hk_cpus;
> > > is that:
> > >
> > >   max_vecs = clamp(hk_cpus, min_vecs, max_vecs);
> > 
> > Yes, I think this will do.
> > 
> > >
> > > Also, do we really need to have that conditional on hk_cpus <
> > > num_online_cpus()? That is, why can't we do this unconditionally?
> > 
> > FWIU most of the drivers using this API already restricts the number of
> > vectors based on the num_online_cpus, if we do it unconditionally we can
> > unnecessary duplicate the restriction for cases where we don't have any
> > isolated CPUs.
> 
> unnecessary isn't really a concern here, this is a slow path. What's
> important is code clarity.
> 
> > Also, different driver seems to take different factors into consideration
> > along with num_online_cpus while finding the max_vecs to request, for
> > example in the case of mlx5:
> > MLX5_CAP_GEN(dev, num_ports) * num_online_cpus() +
> >    MLX5_EQ_VEC_COMP_BASE
> > 
> > Having hk_cpus < num_online_cpus() helps us ensure that we are only
> > changing the behavior when we have isolated CPUs.
> > 
> > Does that make sense?
> 
> That seems to want to allocate N interrupts per cpu (plus some random
> static amount, which seems weird, but whatever). This patch breaks that.

On purpose. For the isolated CPUs we don't want network device 
interrupts (in this context).

> So I think it is important to figure out what that driver really wants
> in the nohz_full case. If it wants to retain N interrupts per CPU, and
> only reduce the number of CPUs, the proposed interface is wrong.

It wants N interrupts per non-isolated (AKA housekeeping) CPU,
and zero interrupts for isolated CPUs.

> > > And what are the (desired) semantics vs hotplug? Using a cpumask without
> > > excluding hotplug is racy.
> > 
> > The housekeeping_mask should still remain constant, isn't?
> > In any case, I can double check this.
> 
> The goal is very much to have that dynamically configurable.

Yes, but this patch is a fix for a customer bug in the old, static on-boot
CPU isolation configuration.

---

Discussing the dynamic configuration (not this patch!) case:

We would need to enable/disable interrupts for a particular device
on a per-CPU basis. Such an interface does not exist yet.

Perhaps that is what you were looking for when writing "proposed interface
is wrong", Peter?





Re: [patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-13 Thread Marcelo Tosatti
On Thu, Oct 08, 2020 at 09:54:44PM +0200, Frederic Weisbecker wrote:
> On Thu, Oct 08, 2020 at 02:54:09PM -0300, Marcelo Tosatti wrote:
> > On Thu, Oct 08, 2020 at 02:22:56PM +0200, Peter Zijlstra wrote:
> > > On Wed, Oct 07, 2020 at 03:01:52PM -0300, Marcelo Tosatti wrote:
> > > > When adding a tick dependency to a task, its necessary to
> > > > wakeup the CPU where the task resides to reevaluate tick
> > > > dependencies on that CPU.
> > > > 
> > > > However the current code wakes up all nohz_full CPUs, which 
> > > > is unnecessary.
> > > > 
> > > > Switch to waking up a single CPU, by using ordering of writes
> > > > to task->cpu and task->tick_dep_mask.
> > > > 
> > > > From: Frederic Weisbecker 
> > > > Suggested-by: Peter Zijlstra 
> > > > Signed-off-by: Frederic Weisbecker 
> > > > Signed-off-by: Marcelo Tosatti 
> > > > 
> > > > Index: linux-2.6/kernel/time/tick-sched.c
> > > > ===
> > > > --- linux-2.6.orig/kernel/time/tick-sched.c
> > > > +++ linux-2.6/kernel/time/tick-sched.c
> > > > @@ -274,6 +274,31 @@ void tick_nohz_full_kick_cpu(int cpu)
> > > > irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
> > > >  }
> > > >  
> > > > +static void tick_nohz_kick_task(struct task_struct *tsk)
> > > > +{
> > > > +   int cpu = task_cpu(tsk);
> > > > +
> > > > +   /*
> > > > +* If the task concurrently migrates to another cpu,
> > > > +* we guarantee it sees the new tick dependency upon
> > > > +* schedule.
> > > > +*
> > > > +*
> > > > +* set_task_cpu(p, cpu);
> > > > +*   STORE p->cpu = @cpu
> > > > +* __schedule() (switch to task 'p')
> > > > +*   LOCK rq->lock
> > > > +*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
> > > > +*   tick_nohz_task_switch()smp_mb() 
> > > > (atomic_fetch_or())
> > > > +*  LOAD p->tick_dep_mask   LOAD p->cpu
> > > > +*/
> > > > +
> > > > +   preempt_disable();
> > > > +   if (cpu_online(cpu))
> > > > +   tick_nohz_full_kick_cpu(cpu);
> > > > +   preempt_enable();
> > > > +}
> > > 
> > > So we need to kick the CPU unconditionally, or only when the task is
> > > actually running? AFAICT we only care about current->tick_dep_mask.
> > 
> > tick is necessary to execute run_posix_cpu_timers, from tick interrupt, 
> > even if task is not running.
> 
> Yes but if the task isn't running, run_posix_cpu_timers() doesn't have
> anything to elapse. So indeed we can spare the IPI if the task is not
> running. Provided ordering makes sure that the task sees the new dependency
> when it schedules in of course.

True.

 * p->on_cpu <- { 0, 1 }:
 *
 *   is set by prepare_task() and cleared by finish_task() such that it will be
 *   set before p is scheduled-in and cleared after p is scheduled-out, both
 *   under rq->lock. Non-zero indicates the task is running on its CPU.


CPU-0 (tick_set_dep)CPU-1 (task switch)

STORE p->tick_dep_mask
smp_mb() (atomic_fetch_or())
LOAD p->on_cpu


context_switch(prev, next)
STORE next->on_cpu = 1
... [*]

LOAD current->tick_dep_mask


I don't see any explicit memory barrier in the [*] section?



[patch 2/2] nohz: change signal tick dependency to wakeup CPUs of member tasks

2020-10-08 Thread Marcelo Tosatti
Rather than waking up all nohz_full CPUs on the system, only wake up
the target CPUs of member threads of the signal.

Reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -396,9 +396,17 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
  * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
  * per process timers.
  */
-void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
+void tick_nohz_dep_set_signal(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-   tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+   int prev;
+   struct task_struct *p;
+
+   prev = atomic_fetch_or(BIT(bit), &tsk->signal->tick_dep_mask);
+   if (!prev) {
+   lockdep_assert_held(&tsk->sighand->siglock);
+   for_each_thread(tsk, p)
+   tick_nohz_kick_task(p);
+   }
 }
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
Index: linux-2.6/include/linux/tick.h
===
--- linux-2.6.orig/include/linux/tick.h
+++ linux-2.6/include/linux/tick.h
@@ -207,7 +207,7 @@ extern void tick_nohz_dep_set_task(struc
   enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
 enum tick_dep_bits bit);
-extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
 enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
   enum tick_dep_bits bit);
@@ -252,11 +252,11 @@ static inline void tick_dep_clear_task(s
if (tick_nohz_full_enabled())
tick_nohz_dep_clear_task(tsk, bit);
 }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
   enum tick_dep_bits bit)
 {
if (tick_nohz_full_enabled())
-   tick_nohz_dep_set_signal(signal, bit);
+   tick_nohz_dep_set_signal(tsk, bit);
 }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit)
@@ -284,7 +284,7 @@ static inline void tick_dep_set_task(str
 enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_task(struct task_struct *tsk,
   enum tick_dep_bits bit) { }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
   enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit) { }
Index: linux-2.6/kernel/time/posix-cpu-timers.c
===
--- linux-2.6.orig/kernel/time/posix-cpu-timers.c
+++ linux-2.6/kernel/time/posix-cpu-timers.c
@@ -523,7 +523,7 @@ static void arm_timer(struct k_itimer *t
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
-   tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(p, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 /*
@@ -1358,7 +1358,7 @@ void set_process_cpu_timer(struct task_s
if (*newval < *nextevt)
*nextevt = *newval;
 
-   tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(tsk, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,




[patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-08 Thread Marcelo Tosatti
When adding a tick dependency to a task, it's necessary to
wake up the CPU where the task resides to reevaluate tick
dependencies on that CPU.

However, the current code wakes up all nohz_full CPUs, which
is unnecessary.

Switch to waking up a single CPU, by using ordering of writes
to task->cpu and task->tick_dep_mask.

From: Frederic Weisbecker 
Suggested-by: Peter Zijlstra 
Signed-off-by: Frederic Weisbecker 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -274,6 +274,31 @@ void tick_nohz_full_kick_cpu(int cpu)
	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task concurrently migrates to another cpu,
+* we guarantee it sees the new tick dependency upon
+* schedule.
+*
+*
+* set_task_cpu(p, cpu);
+*   STORE p->cpu = @cpu
+* __schedule() (switch to task 'p')
+*   LOCK rq->lock
+*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
+*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
+*  LOAD p->tick_dep_mask   LOAD p->cpu
+*/
+
+   preempt_disable();
+   if (cpu_online(cpu))
+   tick_nohz_full_kick_cpu(cpu);
+   preempt_enable();
+}
+
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
@@ -356,19 +381,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cp
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
-   if (tsk == current) {
-   preempt_disable();
-   tick_nohz_full_kick();
-   preempt_enable();
-   } else {
-   /*
-* Some future tick_nohz_full_kick_task()
-* should optimize this.
-*/
-   tick_nohz_full_kick_all();
-   }
-   }
+   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+   tick_nohz_kick_task(tsk);
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 




[patch 0/2] nohz_full: only wakeup target CPUs when notifying new tick dependency (v3)

2020-10-08 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs
to not use POSIX timers (eg DPDK). This patch changes the notification
to only IPI the target CPUs where the task(s) whose tick dependencies
are being updated are executing.

This reduces interruptions to nohz_full= CPUs.

---

v3: replace superfluous rcu_read_lock with lockdep_assert (PeterZ)




Re: [patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-08 Thread Marcelo Tosatti
On Thu, Oct 08, 2020 at 02:22:56PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 07, 2020 at 03:01:52PM -0300, Marcelo Tosatti wrote:
> > When adding a tick dependency to a task, its necessary to
> > wakeup the CPU where the task resides to reevaluate tick
> > dependencies on that CPU.
> > 
> > However the current code wakes up all nohz_full CPUs, which 
> > is unnecessary.
> > 
> > Switch to waking up a single CPU, by using ordering of writes
> > to task->cpu and task->tick_dep_mask.
> > 
> > From: Frederic Weisbecker 
> > Suggested-by: Peter Zijlstra 
> > Signed-off-by: Frederic Weisbecker 
> > Signed-off-by: Marcelo Tosatti 
> > 
> > Index: linux-2.6/kernel/time/tick-sched.c
> > ===
> > --- linux-2.6.orig/kernel/time/tick-sched.c
> > +++ linux-2.6/kernel/time/tick-sched.c
> > @@ -274,6 +274,31 @@ void tick_nohz_full_kick_cpu(int cpu)
> > irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
> >  }
> >  
> > +static void tick_nohz_kick_task(struct task_struct *tsk)
> > +{
> > +   int cpu = task_cpu(tsk);
> > +
> > +   /*
> > +* If the task concurrently migrates to another cpu,
> > +* we guarantee it sees the new tick dependency upon
> > +* schedule.
> > +*
> > +*
> > +* set_task_cpu(p, cpu);
> > +*   STORE p->cpu = @cpu
> > +* __schedule() (switch to task 'p')
> > +*   LOCK rq->lock
> > +*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
> > +*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
> > +*  LOAD p->tick_dep_mask   LOAD p->cpu
> > +*/
> > +
> > +   preempt_disable();
> > +   if (cpu_online(cpu))
> > +   tick_nohz_full_kick_cpu(cpu);
> > +   preempt_enable();
> > +}
> 
> So we need to kick the CPU unconditionally, or only when the task is
> actually running? AFAICT we only care about current->tick_dep_mask.

The tick is necessary to execute run_posix_cpu_timers(), from the tick
interrupt, even if the task is not running.
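
(For reference, the per-task/per-process POSIX CPU timers only elapse from
the tick path, roughly as in the simplified sketch below. This is an
illustration, not an excerpt: the exact signature of run_posix_cpu_timers()
differs between kernel versions.)

/* kernel/time/timer.c, heavily simplified */
void update_process_times(int user_tick)
{
	struct task_struct *p = current;

	account_process_tick(p, user_tick);
	run_local_timers();
	...
	/* The only place per-task/per-process CPU timers are evaluated,
	 * hence the tick dependency (and the kick) on the task's CPU. */
	run_posix_cpu_timers();
}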



Re: [patch 2/2] nohz: change signal tick dependency to wakeup CPUs of member tasks

2020-10-08 Thread Marcelo Tosatti
On Thu, Oct 08, 2020 at 02:35:44PM +0200, Peter Zijlstra wrote:
> On Wed, Oct 07, 2020 at 03:01:53PM -0300, Marcelo Tosatti wrote:
> > Rather than waking up all nohz_full CPUs on the system, only wakeup 
> > the target CPUs of member threads of the signal.
> > 
> > Reduces interruptions to nohz_full CPUs.
> > 
> > Signed-off-by: Marcelo Tosatti 
> > 
> > Index: linux-2.6/kernel/time/tick-sched.c
> > ===
> > --- linux-2.6.orig/kernel/time/tick-sched.c
> > +++ linux-2.6/kernel/time/tick-sched.c
> > @@ -398,7 +398,15 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
> >   */
> >  void tick_nohz_dep_set_signal(struct signal_struct *sig, enum 
> > tick_dep_bits bit)
> >  {
> > -   tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
> > +   int prev;
> > +
> > +   prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
> > +   if (!prev) {
> > +   rcu_read_lock();
> > +   for_each_thread(sig, t)
> > +   tick_nohz_kick_task(t);
> > +   rcu_read_unlock();
> > +   }
> >  }
> 
> AFAICT, and this makes perfect sense, this function is only ever used
> while holding sighand->siglock, which makes the RCU read lock
> superfluous.
> 
> Would it make sense to change the signal_struct argument to task_struct,
> such that we can write:
> 
>   lockdep_assert_held(&p->sighand->siglock);
>   for_each_thread(p->signal, t)
>   tick_nohz_kick_task(t);
> 
> ?

Makes sense, resending -v3.



Re: [patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-08 Thread Marcelo Tosatti
On Thu, Oct 08, 2020 at 10:59:40AM -0400, Peter Xu wrote:
> On Wed, Oct 07, 2020 at 03:01:52PM -0300, Marcelo Tosatti wrote:
> > +static void tick_nohz_kick_task(struct task_struct *tsk)
> > +{
> > +   int cpu = task_cpu(tsk);
> > +
> > +   /*
> > +* If the task concurrently migrates to another cpu,
> > +* we guarantee it sees the new tick dependency upon
> > +* schedule.
> > +*
> > +*
> > +* set_task_cpu(p, cpu);
> > +*   STORE p->cpu = @cpu
> > +* __schedule() (switch to task 'p')
> > +*   LOCK rq->lock
> > +*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
> > +*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
> > +*  LOAD p->tick_dep_mask   LOAD p->cpu
> > +*/
> > +
> > +   preempt_disable();
> 
> Pure question: is preempt_disable() required here?  Same question to
> tick_nohz_full_kick_all().

Hi Peter,

Don't see why: irq_work_queue_on() disables preemption if necessary.

> 
> > +   if (cpu_online(cpu))
> > +   tick_nohz_full_kick_cpu(cpu);
> > +   preempt_enable();
> > +}
> 
> -- 
> Peter Xu



[patch 2/2] nohz: change signal tick dependency to wakeup CPUs of member tasks

2020-10-07 Thread Marcelo Tosatti
Rather than waking up all nohz_full CPUs on the system, only wake up
the target CPUs of member threads of the signal.

Reduces interruptions to nohz_full CPUs.

Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -398,7 +398,15 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
  */
 void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)
 {
-   tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
+   int prev;
+
+   prev = atomic_fetch_or(BIT(bit), &sig->tick_dep_mask);
+   if (!prev) {
+   rcu_read_lock();
+   for_each_thread(sig, t)
+   tick_nohz_kick_task(t);
+   rcu_read_unlock();
+   }
 }
 
 void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits 
bit)




[patch 1/2] nohz: only wakeup a single target cpu when kicking a task

2020-10-07 Thread Marcelo Tosatti
When adding a tick dependency to a task, it's necessary to
wake up the CPU where the task resides to re-evaluate tick
dependencies on that CPU.

However the current code wakes up all nohz_full CPUs, which 
is unnecessary.

Switch to waking up a single CPU, by using ordering of writes
to task->cpu and task->tick_dep_mask.

From: Frederic Weisbecker 
Suggested-by: Peter Zijlstra 
Signed-off-by: Frederic Weisbecker 
Signed-off-by: Marcelo Tosatti 

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -274,6 +274,31 @@ void tick_nohz_full_kick_cpu(int cpu)
	irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
 }
 
+static void tick_nohz_kick_task(struct task_struct *tsk)
+{
+   int cpu = task_cpu(tsk);
+
+   /*
+* If the task concurrently migrates to another cpu,
+* we guarantee it sees the new tick dependency upon
+* schedule.
+*
+*
+* set_task_cpu(p, cpu);
+*   STORE p->cpu = @cpu
+* __schedule() (switch to task 'p')
+*   LOCK rq->lock
+*   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
+*   tick_nohz_task_switch()smp_mb() (atomic_fetch_or())
+*  LOAD p->tick_dep_mask   LOAD p->cpu
+*/
+
+   preempt_disable();
+   if (cpu_online(cpu))
+   tick_nohz_full_kick_cpu(cpu);
+   preempt_enable();
+}
+
 /*
  * Kick all full dynticks CPUs in order to force these to re-evaluate
  * their dependency on the tick and restart it if necessary.
@@ -356,19 +381,8 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cp
  */
 void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
 {
-   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
-   if (tsk == current) {
-   preempt_disable();
-   tick_nohz_full_kick();
-   preempt_enable();
-   } else {
-   /*
-* Some future tick_nohz_full_kick_task()
-* should optimize this.
-*/
-   tick_nohz_full_kick_all();
-   }
-   }
+   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask))
+   tick_nohz_kick_task(tsk);
 }
 EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);
 




[patch 0/2] nohz_full: only wakeup target CPUs when notifying new tick dependency (v2)

2020-10-07 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs
to not use POSIX timers (eg DPDK). This patch changes the notification
to only IPI the target CPUs where the task(s) whose tick dependencies
are being updated are executing.

This reduces interruptions to nohz_full= CPUs.





Re: [RFC][Patch v1 2/3] i40e: limit msix vectors based on housekeeping CPUs

2020-09-11 Thread Marcelo Tosatti
On Wed, Sep 09, 2020 at 11:08:17AM -0400, Nitesh Narayan Lal wrote:
> In a realtime environment, it is essential to isolate unwanted IRQs from
> isolated CPUs to prevent latency overheads. Creating MSIX vectors only
> based on the online CPUs could lead to a potential issue on an RT setup
> that has several isolated CPUs but a very few housekeeping CPUs. This is
> because in these kinds of setups an attempt to move the IRQs to the
> limited housekeeping CPUs from isolated CPUs might fail due to the per
> CPU vector limit. This could eventually result in latency spikes because
> of the IRQ threads that we fail to move from isolated CPUs.
> 
> This patch prevents i40e to add vectors only based on available
> housekeeping CPUs by using num_housekeeping_cpus().
> 
> Signed-off-by: Nitesh Narayan Lal 
> ---
>  drivers/net/ethernet/intel/i40e/i40e_main.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
> b/drivers/net/ethernet/intel/i40e/i40e_main.c
> index 2e433fdbf2c3..3b4cd4b3de85 100644
> --- a/drivers/net/ethernet/intel/i40e/i40e_main.c
> +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
> @@ -5,6 +5,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  /* Local includes */
> @@ -11002,7 +11003,7 @@ static int i40e_init_msix(struct i40e_pf *pf)
>* will use any remaining vectors to reach as close as we can to the
>* number of online CPUs.
>*/
> - cpus = num_online_cpus();
> + cpus = num_housekeeping_cpus();
>   pf->num_lan_msix = min_t(int, cpus, vectors_left / 2);
>   vectors_left -= pf->num_lan_msix;
>  
> -- 
> 2.27.0

For patches 1 and 2:

Reviewed-by: Marcelo Tosatti 



Re: [RFC][Patch v1 3/3] PCI: Limit pci_alloc_irq_vectors as per housekeeping CPUs

2020-09-10 Thread Marcelo Tosatti
On Wed, Sep 09, 2020 at 11:08:18AM -0400, Nitesh Narayan Lal wrote:
> This patch limits the pci_alloc_irq_vectors max vectors that is passed on
> by the caller based on the available housekeeping CPUs by only using the
> minimum of the two.
> 
> A minimum of the max_vecs passed and available housekeeping CPUs is
> derived to ensure that we don't create excess vectors which can be
> problematic specifically in an RT environment. This is because for an RT
> environment unwanted IRQs are moved to the housekeeping CPUs from
> isolated CPUs to keep the latency overhead to a minimum. If the number of
> housekeeping CPUs are significantly lower than that of the isolated CPUs
> we can run into failures while moving these IRQs to housekeeping due to
> per CPU vector limit.
> 
> Signed-off-by: Nitesh Narayan Lal 
> ---
>  include/linux/pci.h | 16 
>  1 file changed, 16 insertions(+)
> 
> diff --git a/include/linux/pci.h b/include/linux/pci.h
> index 835530605c0d..750ba927d963 100644
> --- a/include/linux/pci.h
> +++ b/include/linux/pci.h
> @@ -38,6 +38,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include 
> @@ -1797,6 +1798,21 @@ static inline int
>  pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
> unsigned int max_vecs, unsigned int flags)
>  {
> + unsigned int num_housekeeping = num_housekeeping_cpus();
> + unsigned int num_online = num_online_cpus();
> +
> + /*
> +  * Try to be conservative and at max only ask for the same number of
> +  * vectors as there are housekeeping CPUs. However, skip any
> +  * modification to the of max vectors in two conditions:
> +  * 1. If the min_vecs requested are higher than that of the
> +  *housekeeping CPUs as we don't want to prevent the initialization
> +  *of a device.
> +  * 2. If there are no isolated CPUs as in this case the driver should
> +  *already have taken online CPUs into consideration.
> +  */
> + if (min_vecs < num_housekeeping && num_housekeeping != num_online)
> + max_vecs = min_t(int, max_vecs, num_housekeeping);
>   return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
> NULL);
>  }

If min_vecs > num_housekeeping, for example:

/* PCI MSI/MSIx support */
#define XGBE_MSI_BASE_COUNT 4
#define XGBE_MSI_MIN_COUNT  (XGBE_MSI_BASE_COUNT + 1)

Then the protection fails.

How about reducing max_vecs down to min_vecs, if min_vecs >
num_housekeeping?
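
Something along these lines (an untested sketch on top of the hunk quoted
above, not a submitted patch):

	/* Never ask for more vectors than the housekeeping CPUs can take,
	 * but don't go below what the driver needs to initialize at all. */
	if (num_housekeeping != num_online) {
		if (min_vecs < num_housekeeping)
			max_vecs = min_t(int, max_vecs, num_housekeeping);
		else
			max_vecs = min_vecs;
	}
	return pci_alloc_irq_vectors_affinity(dev, min_vecs, max_vecs, flags,
					      NULL);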




Re: [patch 2/2] nohz: try to avoid IPI when setting tick dependency for task

2020-09-10 Thread Marcelo Tosatti
On Thu, Sep 03, 2020 at 05:01:53PM +0200, Frederic Weisbecker wrote:
> On Tue, Aug 25, 2020 at 03:41:49PM -0300, Marcelo Tosatti wrote:
> > When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
> > performed (to re-read the dependencies and possibly not re-enter
> > nohz_full on a given CPU).
> > 
> > A common case is for applications that run on nohz_full= CPUs
> > to not use POSIX timers (eg DPDK).
> > 
> > This patch optimizes tick_nohz_dep_set_task to avoid kicking
> > all nohz_full= CPUs in case the task allowed mask does not
> > intersect with nohz_full= CPU mask,
> > when going through tick_nohz_dep_set_task.
> > 
> > This reduces interruptions to nohz_full= CPUs.
> > 
> > ---
> >  kernel/time/tick-sched.c |9 +++--
> >  1 file changed, 7 insertions(+), 2 deletions(-)
> > 
> > Index: linux-2.6/kernel/time/tick-sched.c
> > ===
> > --- linux-2.6.orig/kernel/time/tick-sched.c
> > +++ linux-2.6/kernel/time/tick-sched.c
> > @@ -383,11 +383,16 @@ void tick_nohz_dep_set_task(struct task_
> > tick_nohz_full_kick();
> > preempt_enable();
> > } else {
> > +   unsigned long flags;
> > +
> > /*
> >  * Some future tick_nohz_full_kick_task()
> > -* should optimize this.
> > +* should further optimize this.
> >  */
> > -   tick_nohz_full_kick_all();
> > +   raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> > +   if (cpumask_intersects(&tsk->cpus_mask, 
> > tick_nohz_full_mask))
> > +   tick_nohz_full_kick_all();
> > +   raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
> > }
> > }
> >  }
> > 
> > 
> 
> Not long ago, Peterz suggested that we simply do:
> 
> diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
> index f0199a4ba1ad..42ce8e458013 100644
> --- a/kernel/time/tick-sched.c
> +++ b/kernel/time/tick-sched.c
> @@ -357,17 +357,26 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_cpu);
>  void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
>  {
>   if (!atomic_fetch_or(BIT(bit), &tsk->tick_dep_mask)) {
> - if (tsk == current) {
> - preempt_disable();
> - tick_nohz_full_kick();
> - preempt_enable();
> - } else {
> - /*
> -  * Some future tick_nohz_full_kick_task()
> -  * should optimize this.
> -  */
> - tick_nohz_full_kick_all();
> - }
> + int cpu = task_cpu(tsk);
> +
> + /*
> +  * If the task concurrently migrates to another cpu,
> +  * we guarantee it sees the new tick dependency upon
> +  * schedule.
> +  *
> +  * set_task_cpu(p, cpu);
> +  *   STORE p->cpu = @cpu
> +  * __schedule() (switch to task 'p')
> +  *   LOCK rq->lock
> +  *   smp_mb__after_spin_lock()  STORE p->tick_dep_mask
> +  *   tick_nohz_task_switch()smp_mb() 
> (atomic_fetch_or())
> +  *  LOAD p->tick_dep_mask   LOAD p->cpu
> +  */
> +
> + preempt_disable();
> + if (cpu_online(cpu))
> + tick_nohz_full_kick_cpu(cpu);
> + preempt_enable();
>   }
>  }
>  EXPORT_SYMBOL_GPL(tick_nohz_dep_set_task);

This can also be used for the signal case... thanks.



Re: Requirements to control kernel isolation/nohz_full at runtime

2020-09-03 Thread Marcelo Tosatti
On Thu, Sep 03, 2020 at 02:36:36PM -0400, Phil Auld wrote:
> On Thu, Sep 03, 2020 at 03:30:15PM -0300 Marcelo Tosatti wrote:
> > On Thu, Sep 03, 2020 at 03:23:59PM -0300, Marcelo Tosatti wrote:
> > > On Tue, Sep 01, 2020 at 12:46:41PM +0200, Frederic Weisbecker wrote:
> > > > Hi,
> > > 
> > > Hi Frederic,
> > > 
> > > Thanks for the summary! Looking forward to your comments...
> > > 
> > > > I'm currently working on making nohz_full/nohz_idle runtime toggable
> > > > and some other people seem to be interested as well. So I've dumped
> > > > a few thoughts about some pre-requirements to achieve that for those
> > > > interested.
> > > > 
> > > > As you can see, there is a bit of hard work in the way. I'm iterating
> > > > that in https://pad.kernel.org/p/isolation, feel free to edit:
> > > > 
> > > > 
> > > > == RCU nocb ==
> > > > 
> > > > Currently controllable with "rcu_nocbs=" boot parameter and/or through 
> > > > nohz_full=/isolcpus=nohz
> > > > We need to make it toggeable at runtime. Currently handling that:
> > > > v1: https://lwn.net/Articles/820544/
> > > > v2: coming soon
> > > 
> > > Nice.
> > > 
> > > > == TIF_NOHZ ==
> > > > 
> > > > Need to get rid of that in order not to trigger syscall slowpath on 
> > > > CPUs that don't want nohz_full.
> > > > Also we don't want to iterate all threads and clear the flag when the 
> > > > last nohz_full CPU exits nohz_full
> > > > mode. Prefer static keys to call context tracking on archs. x86 does 
> > > > that well.
> > > > 
> > > > == Proper entry code ==
> > > > 
> > > > We must make sure that a given arch never calls exception_enter() / 
> > > > exception_exit().
> > > > This saves the previous state of context tracking and switch to kernel 
> > > > mode (from context tracking POV)
> > > > temporarily. Since this state is saved on the stack, this prevents us 
> > > > from turning off context tracking
> > > > entirely on a CPU: The tracking must be done on all CPUs and that takes 
> > > > some cycles.
> > > > 
> > > > This means that, considering early entry code (before the call to 
> > > > context tracking upon kernel entry,
> > > > and after the call to context tracking upon kernel exit), we must take 
> > > > care of few things:
> > > > 
> > > > 1) Make sure early entry code can't trigger exceptions. Or if it does, 
> > > > the given exception can't schedule
> > > > or use RCU (unless it calls rcu_nmi_enter()). Otherwise the exception 
> > > > must call exception_enter()/exception_exit()
> > > > which we don't want.
> > > > 
> > > > 2) No call to schedule_user().
> > > > 
> > > > 3) Make sure early entry code is not interruptible or 
> > > > preempt_schedule_irq() would rely on
> > > > exception_entry()/exception_exit()
> > > > 
> > > > 4) Make sure early entry code can't be traced (no call to 
> > > > preempt_schedule_notrace()), or if it does it
> > > > can't schedule
> > > > 
> > > > I believe x86 does most of that well. In the end we should remove 
> > > > exception_enter()/exit implementations
> > > > in x86 and replace it with a check that makes sure context_tracking 
> > > > state is not in USER. An arch meeting
> > > > all the above conditions would earn a 
> > > > CONFIG_ARCH_HAS_SANE_CONTEXT_TRACKING. Being able to toggle nohz_full
> > > > at runtime would depend on that.
> > > > 
> > > > 
> > > > == Cputime accounting ==
> > > > 
> > > > Both write and read side must switch to tick based accounting and drop 
> > > > the use of seqlock in task_cputime(),
> > > > task_gtime(), kcpustat_field(), kcpustat_cpu_fetch(). Special 
> > > > ordering/state machine is required to make that without races.
> > > > 
> > > > == Nohz ==
> > > > 
> > > > Switch from nohz_full to nohz_idle. Mind a few details:
> > > > 
> > > > 1) Turn off 1Hz offlined tick handled in housekeeping
> > > > 2) Handle tick dependencies, take care of racing CPUs 
> > > > setting/clearing tick dependenc

Re: Requirements to control kernel isolation/nohz_full at runtime

2020-09-03 Thread Marcelo Tosatti
On Thu, Sep 03, 2020 at 03:23:59PM -0300, Marcelo Tosatti wrote:
> On Tue, Sep 01, 2020 at 12:46:41PM +0200, Frederic Weisbecker wrote:
> > Hi,
> 
> Hi Frederic,
> 
> Thanks for the summary! Looking forward to your comments...
> 
> > I'm currently working on making nohz_full/nohz_idle runtime toggable
> > and some other people seem to be interested as well. So I've dumped
> > a few thoughts about some pre-requirements to achieve that for those
> > interested.
> > 
> > As you can see, there is a bit of hard work in the way. I'm iterating
> > that in https://pad.kernel.org/p/isolation, feel free to edit:
> > 
> > 
> > == RCU nocb ==
> > 
> > Currently controllable with "rcu_nocbs=" boot parameter and/or through 
> > nohz_full=/isolcpus=nohz
> > We need to make it toggeable at runtime. Currently handling that:
> > v1: https://lwn.net/Articles/820544/
> > v2: coming soon
> 
> Nice.
> 
> > == TIF_NOHZ ==
> > 
> > Need to get rid of that in order not to trigger syscall slowpath on CPUs 
> > that don't want nohz_full.
> > Also we don't want to iterate all threads and clear the flag when the last 
> > nohz_full CPU exits nohz_full
> > mode. Prefer static keys to call context tracking on archs. x86 does that 
> > well.
> > 
> > == Proper entry code ==
> > 
> > We must make sure that a given arch never calls exception_enter() / 
> > exception_exit().
> > This saves the previous state of context tracking and switch to kernel mode 
> > (from context tracking POV)
> > temporarily. Since this state is saved on the stack, this prevents us from 
> > turning off context tracking
> > entirely on a CPU: The tracking must be done on all CPUs and that takes 
> > some cycles.
> > 
> > This means that, considering early entry code (before the call to context 
> > tracking upon kernel entry,
> > and after the call to context tracking upon kernel exit), we must take care 
> > of few things:
> > 
> > 1) Make sure early entry code can't trigger exceptions. Or if it does, the 
> > given exception can't schedule
> > or use RCU (unless it calls rcu_nmi_enter()). Otherwise the exception must 
> > call exception_enter()/exception_exit()
> > which we don't want.
> > 
> > 2) No call to schedule_user().
> > 
> > 3) Make sure early entry code is not interruptible or 
> > preempt_schedule_irq() would rely on
> > exception_entry()/exception_exit()
> > 
> > 4) Make sure early entry code can't be traced (no call to 
> > preempt_schedule_notrace()), or if it does it
> > can't schedule
> > 
> > I believe x86 does most of that well. In the end we should remove 
> > exception_enter()/exit implementations
> > in x86 and replace it with a check that makes sure context_tracking state 
> > is not in USER. An arch meeting
> > all the above conditions would earn a 
> > CONFIG_ARCH_HAS_SANE_CONTEXT_TRACKING. Being able to toggle nohz_full
> > at runtime would depend on that.
> > 
> > 
> > == Cputime accounting ==
> > 
> > Both write and read side must switch to tick based accounting and drop the 
> > use of seqlock in task_cputime(),
> > task_gtime(), kcpustat_field(), kcpustat_cpu_fetch(). Special 
> > ordering/state machine is required to make that without races.
> > 
> > == Nohz ==
> > 
> > Switch from nohz_full to nohz_idle. Mind a few details:
> > 
> > 1) Turn off 1Hz offlined tick handled in housekeeping
> > 2) Handle tick dependencies, take care of racing CPUs setting/clearing 
> > tick dependency. It's much trickier when
> > we switch from nohz_idle to nohz_full
> > 
> > == Unbound affinity ==
> > 
> > Restore kernel threads, workqueue, timers, etc... wide affinity. But take 
> > care of cpumasks that have been set through other
> > interfaces: sysfs, procfs, etc...
> 
> We were looking at a userspace interface: what would be a proper
> (unified, similar to isolcpus= interface) and its implementation:
> 
> The simplest idea for interface seemed to be exposing the integer list of
> CPUs and isolation flags to userspace (probably via sysfs).
> 
> The scheme would allow flags to be separately enabled/disabled, 
> with not all flags being necessary toggable (could for example
> disallow nohz_full= toggling until it is implemented, but allow for
> other isolation features to be toggable).
> 
> This would require per flag housekeeping_masks (instead of a single).
> 
> Back to the userspace interface, you mentioned earlier that cpu

Re: Requirements to control kernel isolation/nohz_full at runtime

2020-09-03 Thread Marcelo Tosatti
On Tue, Sep 01, 2020 at 12:46:41PM +0200, Frederic Weisbecker wrote:
> Hi,

Hi Frederic,

Thanks for the summary! Looking forward to your comments...

> I'm currently working on making nohz_full/nohz_idle runtime toggable
> and some other people seem to be interested as well. So I've dumped
> a few thoughts about some pre-requirements to achieve that for those
> interested.
> 
> As you can see, there is a bit of hard work in the way. I'm iterating
> that in https://pad.kernel.org/p/isolation, feel free to edit:
> 
> 
> == RCU nocb ==
> 
> Currently controllable with "rcu_nocbs=" boot parameter and/or through 
> nohz_full=/isolcpus=nohz
> We need to make it toggeable at runtime. Currently handling that:
> v1: https://lwn.net/Articles/820544/
> v2: coming soon

Nice.

> == TIF_NOHZ ==
> 
> Need to get rid of that in order not to trigger syscall slowpath on CPUs that 
> don't want nohz_full.
> Also we don't want to iterate all threads and clear the flag when the last 
> nohz_full CPU exits nohz_full
> mode. Prefer static keys to call context tracking on archs. x86 does that 
> well.
> 
> == Proper entry code ==
> 
> We must make sure that a given arch never calls exception_enter() / 
> exception_exit().
> This saves the previous state of context tracking and switch to kernel mode 
> (from context tracking POV)
> temporarily. Since this state is saved on the stack, this prevents us from 
> turning off context tracking
> entirely on a CPU: The tracking must be done on all CPUs and that takes some 
> cycles.
> 
> This means that, considering early entry code (before the call to context 
> tracking upon kernel entry,
> and after the call to context tracking upon kernel exit), we must take care 
> of few things:
> 
> 1) Make sure early entry code can't trigger exceptions. Or if it does, the 
> given exception can't schedule
> or use RCU (unless it calls rcu_nmi_enter()). Otherwise the exception must 
> call exception_enter()/exception_exit()
> which we don't want.
> 
> 2) No call to schedule_user().
> 
> 3) Make sure early entry code is not interruptible or preempt_schedule_irq() 
> would rely on
> exception_entry()/exception_exit()
> 
> 4) Make sure early entry code can't be traced (no call to 
> preempt_schedule_notrace()), or if it does it
> can't schedule
> 
> I believe x86 does most of that well. In the end we should remove 
> exception_enter()/exit implementations
> in x86 and replace it with a check that makes sure context_tracking state is 
> not in USER. An arch meeting
> all the above conditions would earn a CONFIG_ARCH_HAS_SANE_CONTEXT_TRACKING. 
> Being able to toggle nohz_full
> at runtime would depend on that.
> 
> 
> == Cputime accounting ==
> 
> Both write and read side must switch to tick based accounting and drop the 
> use of seqlock in task_cputime(),
> task_gtime(), kcpustat_field(), kcpustat_cpu_fetch(). Special ordering/state 
> machine is required to make that without races.
> 
> == Nohz ==
> 
> Switch from nohz_full to nohz_idle. Mind a few details:
> 
> 1) Turn off 1Hz offlined tick handled in housekeeping
> 2) Handle tick dependencies, take care of racing CPUs setting/clearing 
> tick dependency. It's much trickier when
> we switch from nohz_idle to nohz_full
> 
> == Unbound affinity ==
> 
> Restore kernel threads, workqueue, timers, etc... wide affinity. But take 
> care of cpumasks that have been set through other
> interfaces: sysfs, procfs, etc...

We were looking at a userspace interface: what would be a proper one
(unified, similar to the isolcpus= interface) and its implementation:

The simplest idea for interface seemed to be exposing the integer list of
CPUs and isolation flags to userspace (probably via sysfs).

The scheme would allow flags to be separately enabled/disabled,
with not all flags necessarily being toggleable (it could, for example,
disallow nohz_full= toggling until that is implemented, but allow
other isolation features to be toggled).

This would require per flag housekeeping_masks (instead of a single).

Back to the userspace interface, you mentioned earlier that cpusets
was a possibility for it. However:

"Cpusets provide a Linux kernel mechanism to constrain which CPUs and
Memory Nodes are used by a process or set of processes.

The Linux kernel already has a pair of mechanisms to specify on which
CPUs a task may be scheduled (sched_setaffinity) and on which Memory
Nodes it may obtain memory (mbind, set_mempolicy).

Cpusets extends these two mechanisms as follows:"

The isolation flags do not necessarily have anything to do with
tasks, but with CPUs: a given feature is disabled or enabled on a
given CPU. 
No?

---

Regarding locking of the masks: since the housekeeping masks can be read
from hot paths (e.g. get_nohz_timer_target()), RCU seems a natural fit,
so userspace would:

1) use interface to change cpumask for a given feature:

-> set_rcu_pointer
-> wait for grace period

2) proceed to trigger actions that rely on 
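
A minimal sketch of that scheme (illustrative only; the per-flag array,
the flag enum and hk_lock are made-up names for this sketch, not an
existing kernel API):

static DEFINE_MUTEX(hk_lock);				/* serializes writers */
static struct cpumask __rcu *hk_masks[HK_NR_FLAGS];	/* one mask per isolation flag */

/* Hot-path reader (e.g. get_nohz_timer_target()); caller holds rcu_read_lock(). */
static inline const struct cpumask *housekeeping_mask(int flag)
{
	return rcu_dereference(hk_masks[flag]);
}

/* Writer, called from the userspace interface. */
static int housekeeping_update(int flag, const struct cpumask *new)
{
	struct cpumask *copy, *old;

	copy = kzalloc(cpumask_size(), GFP_KERNEL);
	if (!copy)
		return -ENOMEM;
	cpumask_copy(copy, new);

	mutex_lock(&hk_lock);
	old = rcu_dereference_protected(hk_masks[flag], lockdep_is_held(&hk_lock));
	rcu_assign_pointer(hk_masks[flag], copy);
	mutex_unlock(&hk_lock);

	synchronize_rcu();	/* 1) wait for a grace period ...                  */
	kfree(old);		/* 2) ... then trigger whatever relies on the mask */
	return 0;
}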

Re: [patch 1/2] nohz: try to avoid IPI when configuring per-CPU posix timer

2020-09-03 Thread Marcelo Tosatti
On Wed, Sep 02, 2020 at 01:38:59AM +0200, Frederic Weisbecker wrote:
> On Tue, Aug 25, 2020 at 03:41:48PM -0300, Marcelo Tosatti wrote:
> > When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
> > performed (to re-read the dependencies and possibly not re-enter
> > nohz_full on a given CPU).
> > 
> > A common case is for applications that run on nohz_full= CPUs 
> > to not use POSIX timers (eg DPDK). This patch skips the IPI 
> > in case the task allowed mask does not intersect with nohz_full= CPU mask,
> > when going through tick_nohz_dep_set_signal.
> > 
> > This reduces interruptions to nohz_full= CPUs.
> > 
> > Signed-off-by: Marcelo Tosatti 
> [...]
> >  /*
> > + * Set bit on nohz full dependency, kicking all cpus
> > + * only if task can run on nohz full CPUs.
> > + */
> > +static void tick_nohz_dep_set_all_cond(struct task_struct *tsk,
> > +  atomic_t *dep,
> > +  enum tick_dep_bits bit)
> > +{
> > +   int prev;
> > +   unsigned long flags;
> > +
> > +   prev = atomic_fetch_or(BIT(bit), dep);
> > +   if (prev)
> > +   return;
> > +
> > +   raw_spin_lock_irqsave(&tsk->pi_lock, flags);
> > +   if (cpumask_intersects(&tsk->cpus_mask, tick_nohz_full_mask))
> > +   tick_nohz_full_kick_all();
> 
> So that's for one task but what about the other threads in that
> process? We are setting the tick dependency on all tasks sharing that
> struct signal.

Hi Frederic,

Yep, fixing in -v2, thanks.




[patch 1/2] nohz: try to avoid IPI when configuring per-CPU posix timer

2020-08-25 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs 
to not use POSIX timers (eg DPDK). This patch skips the IPI 
in case the task allowed mask does not intersect with nohz_full= CPU mask,
when going through tick_nohz_dep_set_signal.

This reduces interruptions to nohz_full= CPUs.

Signed-off-by: Marcelo Tosatti 

---
 include/linux/tick.h   |   11 +++
 kernel/time/posix-cpu-timers.c |4 ++--
 kernel/time/tick-sched.c   |   27 +--
 3 files changed, 34 insertions(+), 8 deletions(-)

Index: linux-2.6/include/linux/tick.h
===
--- linux-2.6.orig/include/linux/tick.h
+++ linux-2.6/include/linux/tick.h
@@ -207,7 +207,8 @@ extern void tick_nohz_dep_set_task(struc
   enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_task(struct task_struct *tsk,
 enum tick_dep_bits bit);
-extern void tick_nohz_dep_set_signal(struct signal_struct *signal,
+extern void tick_nohz_dep_set_signal(struct task_struct *tsk,
+struct signal_struct *signal,
 enum tick_dep_bits bit);
 extern void tick_nohz_dep_clear_signal(struct signal_struct *signal,
   enum tick_dep_bits bit);
@@ -252,11 +253,12 @@ static inline void tick_dep_clear_task(s
if (tick_nohz_full_enabled())
tick_nohz_dep_clear_task(tsk, bit);
 }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
+  struct signal_struct *signal,
   enum tick_dep_bits bit)
 {
if (tick_nohz_full_enabled())
-   tick_nohz_dep_set_signal(signal, bit);
+   tick_nohz_dep_set_signal(tsk, signal, bit);
 }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit)
@@ -284,7 +286,8 @@ static inline void tick_dep_set_task(str
 enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_task(struct task_struct *tsk,
   enum tick_dep_bits bit) { }
-static inline void tick_dep_set_signal(struct signal_struct *signal,
+static inline void tick_dep_set_signal(struct task_struct *tsk,
+  struct signal_struct *signal,
   enum tick_dep_bits bit) { }
 static inline void tick_dep_clear_signal(struct signal_struct *signal,
 enum tick_dep_bits bit) { }
Index: linux-2.6/kernel/time/posix-cpu-timers.c
===
--- linux-2.6.orig/kernel/time/posix-cpu-timers.c
+++ linux-2.6/kernel/time/posix-cpu-timers.c
@@ -523,7 +523,7 @@ static void arm_timer(struct k_itimer *t
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
-   tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(p, p->signal, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 /*
@@ -1358,7 +1358,7 @@ void set_process_cpu_timer(struct task_s
if (*newval < *nextevt)
*nextevt = *newval;
 
-   tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
+   tick_dep_set_signal(tsk, tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
 }
 
 static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -302,6 +302,27 @@ static void tick_nohz_dep_set_all(atomic
 }
 
 /*
+ * Set bit on nohz full dependency, kicking all cpus
+ * only if task can run on nohz full CPUs.
+ */
+static void tick_nohz_dep_set_all_cond(struct task_struct *tsk,
+  atomic_t *dep,
+  enum tick_dep_bits bit)
+{
+   int prev;
+   unsigned long flags;
+
+   prev = atomic_fetch_or(BIT(bit), dep);
+   if (prev)
+   return;
+
+   raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+   if (cpumask_intersects(&tsk->cpus_mask, tick_nohz_full_mask))
+   tick_nohz_full_kick_all();
+   raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
+}
+
+/*
  * Set a global tick dependency. Used by perf events that rely on freq and
  * by unstable clock.
  */
@@ -382,9 +403,11 @@ EXPORT_SYMBOL_GPL(tick_nohz_dep_clear_ta
  * Set a per-taskgroup tick dependency. Posix CPU timers need this 

[patch 2/2] nohz: try to avoid IPI when setting tick dependency for task

2020-08-25 Thread Marcelo Tosatti
When enabling per-CPU posix timers, an IPI to nohz_full CPUs might be
performed (to re-read the dependencies and possibly not re-enter
nohz_full on a given CPU).

A common case is for applications that run on nohz_full= CPUs
to not use POSIX timers (eg DPDK).

This patch optimizes tick_nohz_dep_set_task to avoid kicking
all nohz_full= CPUs in case the task allowed mask does not
intersect with nohz_full= CPU mask,
when going through tick_nohz_dep_set_task.

This reduces interruptions to nohz_full= CPUs.

---
 kernel/time/tick-sched.c |9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

Index: linux-2.6/kernel/time/tick-sched.c
===
--- linux-2.6.orig/kernel/time/tick-sched.c
+++ linux-2.6/kernel/time/tick-sched.c
@@ -383,11 +383,16 @@ void tick_nohz_dep_set_task(struct task_
tick_nohz_full_kick();
preempt_enable();
} else {
+   unsigned long flags;
+
/*
 * Some future tick_nohz_full_kick_task()
-* should optimize this.
+* should further optimize this.
 */
-   tick_nohz_full_kick_all();
+   raw_spin_lock_irqsave(&tsk->pi_lock, flags);
+   if (cpumask_intersects(&tsk->cpus_mask, 
tick_nohz_full_mask))
+   tick_nohz_full_kick_all();
+   raw_spin_unlock_irqrestore(&tsk->pi_lock, flags);
}
}
 }




[patch 0/2] posix-timers: avoid nohz_full= IPIs via task cpu masks

2020-08-25 Thread Marcelo Tosatti
This patchset avoids IPIs to nohz_full= CPUs when the intersection 
between the set of nohz_full CPUs and task allowed cpus is null.

See individual patches for details.




Re: [PATCH 00/13] lib/generic-radix-tree: genradix bug fix and optimisations.

2020-08-25 Thread 'Marcelo Ricardo Leitner'
On Tue, Aug 25, 2020 at 02:52:34PM +, David Laight wrote:
> The genradix code is used by SCTP for accessing per-stream data.
> This means there are quite a lot of lookups but the code wasn't
> really optimised at all.

My test box is down for the moment and will bring it on later today or
tomorrow, so I can't test it yet. What should we expect as performance
gains here?

  Marcelo


Re: Oops on current Raspian when closing an SCTP connection

2020-08-17 Thread Marcelo Ricardo Leitner
On Sun, Aug 16, 2020 at 06:06:24PM -0500, Corey Minyard wrote:
> I'm seeing the following when an SCTP connection terminates.  This is on
> Raspian on a Raspberry Pi, version is Linux version 5.4.51-v7+.  That's
> 32-bit ARM.
> 
> I haven't looked into it yet, I thought I would report before trying to
> chase anything down.  I'm not seeing it on 5.4 x86_64 systems.
> 
> Aug 16 17:59:00 access kernel: [510640.326415] Unable to handle kernel NULL 
> pointer dereference at virtual address 0008
> Aug 16 17:59:00 access kernel: [510640.341624] pgd = c00fc16c
> Aug 16 17:59:00 access kernel: [510640.347834] [0008] *pgd=355ef835, 
> *pte=, *ppte=
> Aug 16 17:59:00 access kernel: [510640.357731] Internal error: Oops: 17 [#22] 
> SMP ARM
> Aug 16 17:59:01 access kernel: [510640.365931] Modules linked in: md5 sctp 
> ftdi_sio cp210x usbserial raspberrypi_hwmon bcm2835_codec(C) v4l2_mem2mem 
> bcm2835_isp(C) bcm2835_v4l2(C) bcm2835_mmal_vchiq(C) videobuf2_vmalloc 
> videobuf2_dma_contig videobuf2_memops videobuf2_v4l2 snd_bcm2835(C) 
> videobuf2_common i2c_bcm2835 snd_pcm snd_timer videodev snd mc vc_sm_cma(C) 
> uio_pdrv_genirq uio fixed nf_nat_pptp nf_conntrack_pptp nf_nat nf_conntrack 
> nf_defrag_ipv4 rtc_ds1307 regmap_i2c i2c_dev ip_tables x_tables ipv6 
> nf_defrag_ipv6
> Aug 16 17:59:01 access kernel: [510640.425420] CPU: 1 PID: 4592 Comm: gtlsshd 
> Tainted: G  D  C5.4.51-v7+ #1327
> Aug 16 17:59:01 access kernel: [510640.438008] Hardware name: BCM2835
> Aug 16 17:59:01 access kernel: [510640.443823] PC is at 
> sctp_ulpevent_free+0x38/0xa0 [sctp]
> Aug 16 17:59:01 access kernel: [510640.451498] LR is at 
> sctp_queue_purge_ulpevents+0x34/0x50 [sctp]

Not ringing a bell here. Can you pinpoint on which line this crash
was? It seems, by the 0x8 offset and these function offsets, that this
could be when it was trying to access event->rmem_len, but if event
was NULL then it should have crashed earlier.

  Marcelo


Re: general protection fault in sctp_ulpevent_notify_peer_addr_change

2020-08-10 Thread Marcelo Ricardo Leitner
On Mon, Aug 10, 2020 at 08:37:18AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:fffe3ae0 Merge tag 'for-linus-hmm' of git://git.kernel.org..
> git tree:   upstream
> console output: https://syzkaller.appspot.com/x/log.txt?x=12f34d3a90
> kernel config:  https://syzkaller.appspot.com/x/.config?x=50463ec6729f9706
> dashboard link: https://syzkaller.appspot.com/bug?extid=8f2165a7b1f2820feffc
> compiler:   gcc (GCC) 10.1.0-syz 20200507
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=1517701c90
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=11b7e0e290
> 
> IMPORTANT: if you fix the issue, please add the following tag to the commit:
> Reported-by: syzbot+8f2165a7b1f2820fe...@syzkaller.appspotmail.com
> 
> general protection fault, probably for non-canonical address 
> 0xdc4c:  [#1] PREEMPT SMP KASAN
> KASAN: null-ptr-deref in range [0x0260-0x0267]
> CPU: 0 PID: 12765 Comm: syz-executor391 Not tainted 5.8.0-syzkaller #0
> Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 
> rel-1.12.0-59-gc9ba5276e321-prebuilt.qemu.org 04/01/2014
> RIP: 0010:sctp_ulpevent_notify_peer_addr_change+0xa9/0xad0 
> net/sctp/ulpevent.c:346

Crashed in code added by 45ebf73ebcec ("sctp: check assoc before
SCTP_ADDR_{MADE_PRIM, ADDED} event"), but it would have crashed a
couple of instructions later anyway.

I can't reproduce this crash with the same commit and kernel config.
I'm not seeing how transport->asoc can be NULL there.

While trying to reproduce this, when I aborted a test, I actually
triggered:

[ 1527.736212][ T8008] team0 (unregistering): Port device team_slave_1 removed
[ 1527.896902][ T8008] team0 (unregistering): Port device team_slave_0 removed
[ 1528.053936][ T8008] bond0 (unregistering): (slave bond_slave_1): Releasing 
backup interface
[ 1528.445113][ T8008] bond0 (unregistering): (slave bond_slave_0): Releasing 
backup interface
[ 1528.915669][ T8008] bond0 (unregistering): Released all slaves
[ 1530.531179][ T8008] [ cut here ]
[ 1530.666414][ T8008] ODEBUG: free active (active state 0) object type: 
timer_list hint: delayed_work_timer_fn+0x0/0x90
[ 1530.913574][ T8008] WARNING: CPU: 11 PID: 8008 at lib/debugobjects.c:485 
debug_print_object+0x160/0x250
[ 1531.165944][ T8008] Kernel panic - not syncing: panic_on_warn set ...
[ 1531.291997][ T8008] CPU: 11 PID: 8008 Comm: kworker/u48:8 Not tainted 5.8.0+ 
#6
[ 1531.554397][ T8008] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), 
BIOS 1.13.0-2.fc32 04/01/2014
[ 1531.842844][ T8008] Workqueue: netns cleanup_net
[ 1531.983054][ T8008] Call Trace:
[ 1532.122433][ T8008]  dump_stack+0x18f/0x20d
[ 1532.257582][ T8008]  panic+0x2e3/0x75c
[ 1532.385158][ T8008]  ? __warn_printk+0xf3/0xf3
[ 1532.520152][ T8008]  ? console_unlock+0x7f0/0xf30
[ 1532.643891][ T8008]  ? __warn.cold+0x5/0x45
[ 1532.763171][ T8008]  ? __warn+0xd6/0x1f2
[ 1532.884107][ T8008]  ? debug_print_object+0x160/0x250
[ 1533.011290][ T8008]  __warn.cold+0x20/0x45
[ 1533.132625][ T8008]  ? wake_up_klogd.part.0+0x8c/0xc0
[ 1533.248423][ T8008]  ? debug_print_object+0x160/0x250
[ 1533.370165][ T8008]  report_bug+0x1bd/0x210
[ 1533.492858][ T8008]  handle_bug+0x38/0x90
[ 1533.614108][ T8008]  exc_invalid_op+0x14/0x40
[ 1533.730968][ T8008]  asm_exc_invalid_op+0x12/0x20
[ 1533.851289][ T8008] RIP: 0010:debug_print_object+0x160/0x250
[ 1533.964027][ T8008] Code: dd 40 b8 93 88 48 89 fa 48 c1 ea 03 80 3c 02 00 0f 
85 bf 00 00 00 48 8b 14 dd 40 b8 93 88 48 c7 c7 a0 ad 93 88 e8 02 66 a9 fd <0f> 
0b 83 05 73 9f 13 07 01 48 83 c4 20 5b 5d 41 5c 41 5d c3 48 89
[ 1534.313398][ T8008] RSP: 0018:c9e378a8 EFLAGS: 00010086
[ 1534.432053][ T8008] RAX:  RBX: 0003 RCX: 

[ 1534.677101][ T8008] RDX: 8881331a2300 RSI: 815d8e17 RDI: 
f520001c6f07
[ 1534.930977][ T8008] RBP: 0001 R08: 0001 R09: 
888142fa0fcb
[ 1535.180403][ T8008] R10:  R11: 8026 R12: 
89bce120
[ 1535.424399][ T8008] R13: 81636500 R14: dead0100 R15: 
dc00
[ 1535.678140][ T8008]  ? calc_wheel_index+0x3f0/0x3f0
[ 1535.808026][ T8008]  ? vprintk_func+0x97/0x1a6
[ 1535.939928][ T8008]  ? debug_print_object+0x160/0x250
[ 1536.072538][ T8008]  debug_check_no_obj_freed+0x301/0x41c
[ 1536.203742][ T8008]  ? dev_attr_show+0x90/0x90
[ 1536.343659][ T8008]  kfree+0xf0/0x2c0
[ 1536.484984][ T8008]  ? dev_attr_show+0x90/0x90
[ 1536.620853][ T8008]  kvfree+0x42/0x50
[ 1536.752990][ T8008]  ? netdev_class_remove_file_ns+0x30/0x30
[ 1536.886457][ T8008]  device_release+0x71/0x200
[ 1537.015419][ T8008]  ? dev_attr_show+0x90/0x90
[ 1537.142315][ T8008]  kobject_put+0x171/0x270
[ 1537.269426][ T8008]  netdev_run_todo+0x765/0xac0
[ 1537.402993][ T8008]  ? dev_xdp_uninstall+0x3f0/0x3f0
[ 1537.542007][ T8008]  ? 

[PATCH] kunit: tool: adjust parse regex

2020-07-28 Thread Marcelo Schmitt
kunit config subcommand terminates with error if .config has a
configuration assigned with a string, for instance:

CONFIG_CC_VERSION_TEXT="gcc (distro package version) ..."

This patch adjusts the parse regex to consider such string assignments.

Signed-off-by: Marcelo Schmitt 
---
 tools/testing/kunit/kunit_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/kunit/kunit_config.py 
b/tools/testing/kunit/kunit_config.py
index e75063d603b5..8e55693fe812 100644
--- a/tools/testing/kunit/kunit_config.py
+++ b/tools/testing/kunit/kunit_config.py
@@ -10,7 +10,7 @@ import collections
 import re
 
 CONFIG_IS_NOT_SET_PATTERN = r'^# CONFIG_(\w+) is not set$'
-CONFIG_PATTERN = r'^CONFIG_(\w+)=(\S+)$'
+CONFIG_PATTERN = r'^CONFIG_(\w+)=((\S+)|(".*"))$'
 
 KconfigEntryBase = collections.namedtuple('KconfigEntry', ['name', 'value'])
 
-- 
2.27.0



Re: [PATCH] sctp: remove redundant initialization of variable status

2020-07-24 Thread Marcelo Ricardo Leitner
On Fri, Jul 24, 2020 at 01:17:53PM +0100, Colin King wrote:
> From: Colin Ian King 
> 
> The variable status is being initialized with a value that is never read
> and it is being updated later with a new value.  The initialization is
> redundant and can be removed.
> 
> Addresses-Coverity: ("Unused value")
> Signed-off-by: Colin Ian King 

Acked-by: Marcelo Ricardo Leitner 

Are you willing to send another patch to fix the var ordering into
reverse christmas tree in there?

> ---
>  net/sctp/protocol.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> index 7ecaf7d575c0..a0448f7c64b9 100644
> --- a/net/sctp/protocol.c
> +++ b/net/sctp/protocol.c
> @@ -1368,7 +1368,7 @@ static struct pernet_operations sctp_ctrlsock_ops = {
>  static __init int sctp_init(void)
>  {
>   int i;
> - int status = -EINVAL;
> + int status;
>   unsigned long goal;
>   unsigned long limit;
>   unsigned long nr_pages = totalram_pages();
> -- 
> 2.27.0
> 
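
(For reference, "reverse christmas tree" means ordering the local
declarations from longest line to shortest; applied to the locals visible
in the hunk above, that would look roughly like:)

static __init int sctp_init(void)
{
	unsigned long nr_pages = totalram_pages();
	unsigned long limit;
	unsigned long goal;
	int status;
	int i;
	...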


Re: KASAN: slab-out-of-bounds Write in sctp_setsockopt

2020-07-22 Thread Marcelo Ricardo Leitner
On Wed, Jul 22, 2020 at 11:22:23AM -0700, syzbot wrote:
> Hello,
> 
> syzbot found the following issue on:
> 
> HEAD commit:4f1b4da5 Merge branch 'net-atlantic-various-features'
> git tree:   net-next
> console output: https://syzkaller.appspot.com/x/log.txt?x=14b3a04090
> kernel config:  https://syzkaller.appspot.com/x/.config?x=2b7b67c0c1819c87
> dashboard link: https://syzkaller.appspot.com/bug?extid=0e4699d000d8b874d8dc
> compiler:   gcc (GCC) 10.1.0-syz 20200507
> syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=14c9335890
> C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=14ab61f090

The syz repo has:
setsockopt$inet_sctp6_SCTP_MAX_BURST(r0, 0x84, 0x10, 
&(0x7f000100)=@assoc_value, 0x8)
  ^^   

#define SCTP_DELAYED_ACK_TIME   16
#define SCTP_DELAYED_ACK SCTP_DELAYED_ACK_TIME
#define SCTP_DELAYED_SACK SCTP_DELAYED_ACK_TIME
#define SCTP_MAX_BURST  20  /* Set/Get max burst */

C repro has:
  syscall(__NR_setsockopt, r[0], 0x84, 0x10, 0x2100ul, 8ul);
   

So I'm wondering, what was the real intention of the call?


Anyhow, the issue is real, introduced by ebb25defdc17 ("sctp: pass a
kernel pointer to sctp_setsockopt_delayed_ack"). It used to use a
local storage bigger than the data provided by the user and used
one struct to read another's content on top of it. Quite masked.
I'll cook a fix.
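
Roughly the following (a sketch of the hazard only, not the kernel code;
sizes taken from the uapi structs, where struct sctp_assoc_value is 8 bytes
and struct sctp_sack_info is 12):

/* After the conversion series, @params points at a kernel buffer of
 * exactly @optlen bytes instead of an on-stack struct sctp_sack_info. */
static int sctp_setsockopt_delayed_ack(struct sock *sk,
				       struct sctp_sack_info *params,
				       unsigned int optlen)
{
	if (optlen == sizeof(struct sctp_assoc_value)) {
		/* Legacy 8-byte form: reinterpreted in place. */
		struct sctp_assoc_value *v = (struct sctp_assoc_value *)params;

		if (v->assoc_value) {
			params->sack_delay = v->assoc_value;	/* offset 4: ok */
			params->sack_freq  = 1;			/* offset 8: one word
								 * past the 8-byte
								 * allocation */
		}
	}
	...
}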

  Marcelo


Re: [PATCH for v5.9] sctp: Replace HTTP links with HTTPS ones

2020-07-20 Thread Marcelo Ricardo Leitner
On Sun, Jul 19, 2020 at 10:26:44PM +0200, Alexander A. Klimov wrote:
> Rationale:
> Reduces attack surface on kernel devs opening the links for MITM
> as HTTPS traffic is much harder to manipulate.
> 
> Deterministic algorithm:
> For each file:
>   If not .svg:
> For each line:
>   If doesn't contain `\bxmlns\b`:
> For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`:
> If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`:
> If both the HTTP and HTTPS versions
> return 200 OK and serve the same content:
>   Replace HTTP with HTTPS.
> 
> Signed-off-by: Alexander A. Klimov 
> ---
>  Continuing my work started at 93431e0607e5.
>  See also: git log --oneline '--author=Alexander A. Klimov 
> ' v5.7..master
>  (Actually letting a shell for loop submit all this stuff for me.)
> 
>  If there are any URLs to be removed completely
>  or at least not (just) HTTPSified:
>  Just clearly say so and I'll *undo my change*.
>  See also: https://lkml.org/lkml/2020/6/27/64
> 
>  If there are any valid, but yet not changed URLs:
>  See: https://lkml.org/lkml/2020/6/26/837
> 
>  If you apply the patch, please let me know.
> 
>  Sorry again to all maintainers who complained about subject lines.
>  Now I realized that you want an actually perfect prefixes,
>  not just subsystem ones.
>  I tried my best...
>  And yes, *I could* (at least half-)automate it.
>  Impossible is nothing! :)

The subject prefix is right for sctp, but the patch tag should have
been "PATCH net-next" instead. :-)

Thankfully, they can fix it for us.

Acked-by: Marcelo Ricardo Leitner 


Re: [PATCH v5 2/8] lib/mpi: Extend the MPI library

2020-07-10 Thread Marcelo Henrique Cerri
Hi, Tianjia.

On Thu, Jul 09, 2020 at 04:40:09PM +0800, Tianjia Zhang wrote:
> Expand the mpi library based on libgcrypt, and the ECC algorithm of
> mpi based on libgcrypt requires these functions.
> Some other algorithms will be developed based on mpi ecc, such as SM2.
> 
> Signed-off-by: Tianjia Zhang 
> ---
>  include/linux/mpi.h|  88 +++
>  lib/mpi/Makefile   |   5 +
>  lib/mpi/mpi-add.c  | 207 +
>  lib/mpi/mpi-bit.c  | 251 ++
>  lib/mpi/mpi-cmp.c  |  46 --
>  lib/mpi/mpi-div.c  | 238 +
>  lib/mpi/mpi-internal.h |  53 +++
>  lib/mpi/mpi-inv.c  | 143 ++
>  lib/mpi/mpi-mod.c  | 155 +++
>  lib/mpi/mpi-mul.c  |  94 
>  lib/mpi/mpicoder.c | 336 +
>  lib/mpi/mpih-div.c | 294 
>  lib/mpi/mpih-mul.c |  25 +++
>  lib/mpi/mpiutil.c  | 204 +
>  14 files changed, 2129 insertions(+), 10 deletions(-)
>  create mode 100644 lib/mpi/mpi-add.c
>  create mode 100644 lib/mpi/mpi-div.c
>  create mode 100644 lib/mpi/mpi-inv.c
>  create mode 100644 lib/mpi/mpi-mod.c
>  create mode 100644 lib/mpi/mpi-mul.c
> 
> diff --git a/include/linux/mpi.h b/include/linux/mpi.h
> index 7bd6d8af0004..2dddf4c6e011 100644
> --- a/include/linux/mpi.h
> +++ b/include/linux/mpi.h
> @@ -40,21 +40,79 @@ struct gcry_mpi {
>  typedef struct gcry_mpi *MPI;
>  
>  #define mpi_get_nlimbs(a) ((a)->nlimbs)
> +#define mpi_has_sign(a)   ((a)->sign)
>  
>  /*-- mpiutil.c --*/
>  MPI mpi_alloc(unsigned nlimbs);
> +void mpi_clear(MPI a);
>  void mpi_free(MPI a);
>  int mpi_resize(MPI a, unsigned nlimbs);
>  
> +static inline MPI mpi_new(unsigned int nbits)
> +{
> + return mpi_alloc((nbits + BITS_PER_MPI_LIMB - 1) / BITS_PER_MPI_LIMB);
> +}
> +
> +MPI mpi_copy(MPI a);
> +MPI mpi_alloc_like(MPI a);
> +void mpi_snatch(MPI w, MPI u);
> +MPI mpi_set(MPI w, MPI u);
> +MPI mpi_set_ui(MPI w, unsigned long u);
> +MPI mpi_alloc_set_ui(unsigned long u);
> +void mpi_swap_cond(MPI a, MPI b, unsigned long swap);
> +
> +/* Constants used to return constant MPIs.  See mpi_init if you
> + * want to add more constants.
> + */
> +#define MPI_NUMBER_OF_CONSTANTS 6
> +enum gcry_mpi_constants {
> + MPI_C_ZERO,
> + MPI_C_ONE,
> + MPI_C_TWO,
> + MPI_C_THREE,
> + MPI_C_FOUR,
> + MPI_C_EIGHT
> +};
> +
> +MPI mpi_const(enum gcry_mpi_constants no);
> +
>  /*-- mpicoder.c --*/
> +
> +/* Different formats of external big integer representation. */
> +enum gcry_mpi_format {
> + GCRYMPI_FMT_NONE = 0,
> + GCRYMPI_FMT_STD = 1,/* Twos complement stored without length. */
> + GCRYMPI_FMT_PGP = 2,/* As used by OpenPGP (unsigned only). */
> + GCRYMPI_FMT_SSH = 3,/* As used by SSH (like STD but with length). */
> + GCRYMPI_FMT_HEX = 4,/* Hex format. */
> + GCRYMPI_FMT_USG = 5,/* Like STD but unsigned. */
> + GCRYMPI_FMT_OPAQUE = 8  /* Opaque format (some functions only). */
> +};
> +
>  MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes);
>  MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread);
> +int mpi_fromstr(MPI val, const char *str);
> +MPI mpi_scanval(const char *string);
>  MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len);
>  void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign);
>  int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes,
>   int *sign);
>  int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes,
>int *sign);
> +int mpi_print(enum gcry_mpi_format format, unsigned char *buffer,
> + size_t buflen, size_t *nwritten, MPI a);
> +
> +/*-- mpi-mod.c --*/
> +void mpi_mod(MPI rem, MPI dividend, MPI divisor);
> +
> +/* Context used with Barrett reduction.  */
> +struct barrett_ctx_s;
> +typedef struct barrett_ctx_s *mpi_barrett_t;
> +
> +mpi_barrett_t mpi_barrett_init(MPI m, int copy);
> +void mpi_barrett_free(mpi_barrett_t ctx);
> +void mpi_mod_barrett(MPI r, MPI x, mpi_barrett_t ctx);
> +void mpi_mul_barrett(MPI w, MPI u, MPI v, mpi_barrett_t ctx);
>  
>  /*-- mpi-pow.c --*/
>  int mpi_powm(MPI res, MPI base, MPI exp, MPI mod);
> @@ -62,10 +120,40 @@ int mpi_powm(MPI res, MPI base, MPI exp, MPI mod);
>  /*-- mpi-cmp.c --*/
>  int mpi_cmp_ui(MPI u, ulong v);
>  int mpi_cmp(MPI u, MPI v);
> +int mpi_cmpabs(MPI u, MPI v);
>  
>  /*-- mpi-bit.c --*/
>  void mpi_normalize(MPI a);
>  unsigned mpi_get_nbits(MPI a);
> +int mpi_test_bit(MPI a, unsigned int n);
> +void mpi_set_bit(MPI a, unsigned int n);
> +void mpi_set_highbit(MPI a, unsigned int n);
> +void mpi_clear_highbit(MPI a, unsigned int n);
> +void mpi_clear_bit(MPI a, unsigned int n);
> +void mpi_rshift_limbs(MPI a, unsigned int count);
> +void mpi_rshift(MPI x, MPI a, unsigned int n);
> +void mpi_lshift_limbs(MPI a, unsigned int 

[PATCH net] sctp: Don't advertise IPv4 addresses if ipv6only is set on the socket

2020-06-24 Thread Marcelo Ricardo Leitner
If a socket is set ipv6only, it will still send IPv4 addresses in the
INIT and INIT_ACK packets. This potentially misleads the peer into using
them, which then would cause association termination.

The fix is to not add IPv4 addresses to ipv6only sockets.

Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
Reported-by: Corey Minyard 
Signed-off-by: Marcelo Ricardo Leitner 
---
 include/net/sctp/constants.h | 8 +---
 net/sctp/associola.c | 5 -
 net/sctp/bind_addr.c | 1 +
 net/sctp/protocol.c  | 3 ++-
 4 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 
15b4d9aec7ff278e67a7183f10c14be237227d6b..122d9e2d8dfde33b787d575fc42d454732550698
 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -353,11 +353,13 @@ enum {
 ipv4_is_anycast_6to4(a))
 
 /* Flags used for the bind address copy functions.  */
-#define SCTP_ADDR6_ALLOWED 0x0001  /* IPv6 address is allowed by
+#define SCTP_ADDR4_ALLOWED 0x0001  /* IPv4 address is allowed by
   local sock family */
-#define SCTP_ADDR4_PEERSUPP0x0002  /* IPv4 address is supported by
+#define SCTP_ADDR6_ALLOWED 0x0002  /* IPv6 address is allowed by
+  local sock family */
+#define SCTP_ADDR4_PEERSUPP0x0004  /* IPv4 address is supported by
   peer */
-#define SCTP_ADDR6_PEERSUPP0x0004  /* IPv6 address is supported by
+#define SCTP_ADDR6_PEERSUPP0x0008  /* IPv6 address is supported by
   peer */
 
 /* Reasons to retransmit. */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 
72315137d7e7f20d5182291ef4b01102f030078b..8d735461fa196567ab19c583703aad098ef8e240
 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -1565,12 +1565,15 @@ void sctp_assoc_rwnd_decrease(struct sctp_association 
*asoc, unsigned int len)
 int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc,
 enum sctp_scope scope, gfp_t gfp)
 {
+   struct sock *sk = asoc->base.sk;
int flags;
 
/* Use scoping rules to determine the subset of addresses from
 * the endpoint.
 */
-   flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0;
+   flags = (PF_INET6 == sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0;
+   if (!inet_v6_ipv6only(sk))
+   flags |= SCTP_ADDR4_ALLOWED;
if (asoc->peer.ipv4_address)
flags |= SCTP_ADDR4_PEERSUPP;
if (asoc->peer.ipv6_address)
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 
53bc61537f44f4e766c417fcef72234df52ecd04..701c5a4e441d9c248df9472f22db5b78987f9e44
 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -461,6 +461,7 @@ static int sctp_copy_one_addr(struct net *net, struct 
sctp_bind_addr *dest,
 * well as the remote peer.
 */
if ((((AF_INET == addr->sa.sa_family) &&
+ (flags & SCTP_ADDR4_ALLOWED) &&
  (flags & SCTP_ADDR4_PEERSUPP))) ||
(((AF_INET6 == addr->sa.sa_family) &&
  (flags & SCTP_ADDR6_ALLOWED) &&
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 
092d1afdee0d23cd974210839310fbf406dd443f..cde29f3c7fb3c40ee117636fa3b4b7f0a03e4fba
 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -148,7 +148,8 @@ int sctp_copy_local_addr_list(struct net *net, struct 
sctp_bind_addr *bp,
 * sock as well as the remote peer.
 */
if (addr->a.sa.sa_family == AF_INET &&
-   !(copy_flags & SCTP_ADDR4_PEERSUPP))
+   (!(copy_flags & SCTP_ADDR4_ALLOWED) ||
+!(copy_flags & SCTP_ADDR4_PEERSUPP)))
continue;
if (addr->a.sa.sa_family == AF_INET6 &&
(!(copy_flags & SCTP_ADDR6_ALLOWED) ||
-- 
2.25.4



Re: [PATCH] sctp: Don't advertise IPv4 addresses if ipv6only is set on the socket

2020-06-24 Thread Marcelo Ricardo Leitner
On Tue, Jun 23, 2020 at 11:04:17AM -0500, miny...@acm.org wrote:
> From: Corey Minyard 
> 
> If a socket was set ipv6only, it would still send IPv4 addresses in the
> init and init ack packets.  So don't add IPv4 addresses to ipv6only
> sockets.
> 
> Based on a patch by Xin Long 
> 
> Signed-off-by: Corey Minyard 
> ---
> I have tested this and it seem to fix the issue.  However, I'm wondering
> if it might be better to fix it where the addresses are put into the
> association as opposed to where they are put into the message.

Yes, it is. It even highlights why this issue was there in the first
place. Sending a patch right after this email.

  Marcelo


Re: Strange problem with SCTP+IPv6

2020-06-23 Thread Marcelo Ricardo Leitner
On Tue, Jun 23, 2020 at 11:24:59PM +0200, Michael Tuexen wrote:
> > On 23. Jun 2020, at 23:21, Marcelo Ricardo Leitner 
> >  wrote:
> > 
> > On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
> >> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> >>> From: Marcelo Ricardo Leitner
> >>>> Sent: 22 June 2020 19:33
> >>>> On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> >>>>>> On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> >>>>>> 
> >>>>>> On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >>>>>>> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> >>>>>>> wrote:
> >>>>>>>> 
> >>>>>>>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> >>>>>>>> create an
> >>>>>>>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on 
> >>>>>>>> it,
> >>>>>>>> then I make a connection to it using ::1, the connection will drop 
> >>>>>>>> after
> >>>>>>>> 2.5 seconds with an ECONNRESET error.
> >>>>>>>> 
> >>>>>>>> It only happens on SCTP, it doesn't have the issue if you connect to 
> >>>>>>>> a
> >>>>>>>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> >>>>>>>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>>>>>>> I tried on an ARM system and x86_64.
> >>>>>>>> 
> >>>>>>>> I haven't dug into the kernel to see if I could find anything yet, 
> >>>>>>>> but I
> >>>>>>>> thought I would go ahead and report it.  I am attaching a reproducer.
> >>>>>>>> Basically, compile the following code:
> >>>>>>> The code only set IPV6_V6ONLY on server side, so the client side will
> >>>>>>> still bind all the local ipv4 addresses (as you didn't call bind() to
> >>>>>>> bind any specific addresses ). Then after the connection is created,
> >>>>>>> the client will send HB on the v4 paths to the server. The server
> >>>>>>> will abort the connection, as it can't support v4.
> >>>>>>> 
> >>>>>>> So you can work around it by either:
> >>>>>>> 
> >>>>>>> - set IPV6_V6ONLY on client side.
> >>>>>>> 
> >>>>>>> or
> >>>>>>> 
> >>>>>>> - bind to the specific v6 addresses on the client side.
> >>>>>>> 
> >>>>>>> I don't see RFC said something about this.
> >>>>>>> So it may not be a good idea to change the current behaviour
> >>>>>>> to not establish the connection in this case, which may cause 
> >>>>>>> regression.
> >>>>>> 
> >>>>>> Ok, I understand this.  It's a little strange, but I see why it works
> >>>>>> this way.
> >>>>> I don't. I would expect it to work as I described in my email.
> >>>>> Could someone explain me how and why it is behaving different from
> >>>>> my expectation?
> >>>> 
> >>>> It looks like a bug to me. Testing with this test app here, I can see
> >>>> the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> >>>> that's unexpected for a v6only socket. As is, it's the server saying
> >>>> "I'm available at these other addresses too, but not."
> >>> 
> >>> Does it even make sense to mix IPv4 and IPv6 addresses on the same
> >>> connection?
> >>> I don't remember ever seeing both types of address in a message,
> >>> but may not have looked.
> >> 
> >> That's an interesting question.  Do the RFCs say anything?  I would
> >> assume it was ok unless ipv6only was set.
> >> 
> >>> 
> >>> I also wonder whether the connection should be dropped for an error
> >>> response on a path that has never been validated.
> >> 
> >> That actually bothered me a bit more.  Shouldn't it stay up if any path
> >> is up?  That's kind of the whole point of multihoming.

Re: Strange problem with SCTP+IPv6

2020-06-23 Thread 'Marcelo Ricardo Leitner'
On Tue, Jun 23, 2020 at 11:17:56AM -0500, Corey Minyard wrote:
> On Tue, Jun 23, 2020 at 01:17:28PM +, David Laight wrote:
> > From: Marcelo Ricardo Leitner
> > > Sent: 22 June 2020 19:33
> > > On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > > > > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > > > >
> > > > > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> > > > >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  
> > > > >> wrote:
> > > > >>>
> > > > >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I 
> > > > >>> create an
> > > > >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option 
> > > > >>> on it,
> > > > >>> then I make a connection to it using ::1, the connection will drop 
> > > > >>> after
> > > > >>> 2.5 seconds with an ECONNRESET error.
> > > > >>>
> > > > >>> It only happens on SCTP, it doesn't have the issue if you connect 
> > > > >>> to a
> > > > >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> > > > >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> > > > >>> I tried on an ARM system and x86_64.
> > > > >>>
> > > > >>> I haven't dug into the kernel to see if I could find anything yet, 
> > > > >>> but I
> > > > >>> thought I would go ahead and report it.  I am attaching a 
> > > > >>> reproducer.
> > > > >>> Basically, compile the following code:
> > > > >> The code only set IPV6_V6ONLY on server side, so the client side will
> > > > >> still bind all the local ipv4 addresses (as you didn't call bind() to
> > > > >> bind any specific addresses ). Then after the connection is created,
> > > > >> the client will send HB on the v4 paths to the server. The server
> > > > >> will abort the connection, as it can't support v4.
> > > > >>
> > > > >> So you can work around it by either:
> > > > >>
> > > > >>  - set IPV6_V6ONLY on client side.
> > > > >>
> > > > >> or
> > > > >>
> > > > >>  - bind to the specific v6 addresses on the client side.
> > > > >>
> > > > >> I don't see RFC said something about this.
> > > > >> So it may not be a good idea to change the current behaviour
> > > > >> to not establish the connection in this case, which may cause 
> > > > >> regression.
> > > > >
> > > > > Ok, I understand this.  It's a little strange, but I see why it works
> > > > > this way.
> > > > I don't. I would expect it to work as I described in my email.
> > > > Could someone explain me how and why it is behaving different from
> > > > my expectation?
> > > 
> > > It looks like a bug to me. Testing with this test app here, I can see
> > > the INIT_ACK being sent with a bunch of ipv4 addresses in it and
> > > that's unexpected for a v6only socket. As is, it's the server saying
> > > "I'm available at these other addresses too, but not."
> > 
> > Does it even make sense to mix IPv4 and IPv6 addresses on the same
> > connection?
> > I don't remember ever seeing both types of address in a message,
> > but may not have looked.
> 
> That's an interesting question.  Do the RFCs say anything?  I would
> assume it was ok unless ipv6only was set.
> 
> > 
> > I also wonder whether the connection should be dropped for an error
> > response on a path that has never been validated.
> 
> That actually bothered me a bit more.  Shouldn't it stay up if any path
> is up?  That's kind of the whole point of multihoming.

Michael explained it on the other email. What he described is what I
observed in my tests.

> 
> > 
> > OTOH the whole 'multi-homing' part of SCTP sucks.
> 
> I don't think so.
> 
> > The IP addresses a server needs to bind to depend on where the
> > incoming connection will come from.
> > A local connection may be able to use a 192.168.x.x address
> > but a remote connection must not - as it may be defined locally
> > at the remote system.
> > But both connections can come into the public (routable) address.
> > We have to tell customers to explicitly configure the local IP
> > addresses - which means the application has to know what they are.
> > Fortunately these apps are pretty static - usually M3UA.
> 
> Umm, no,  If you have a private address, it better be behind a firewall,
> and the firewall should handle rewriting the packet to fix the addresses.
> 
> It doesn't appear that Linux netfilter does this.  There is a TODO in
> the code for this.  But that's how it *should* work.

Right, we don't support SCTP aware NAT [1].

1.https://tools.ietf.org/html/draft-stewart-behave-sctpnat-04

  Marcelo

> 
> -corey
> 
> > 
> > David
> > 


Re: Strange problem with SCTP+IPv6

2020-06-22 Thread Marcelo Ricardo Leitner
On Mon, Jun 22, 2020 at 08:01:24PM +0200, Michael Tuexen wrote:
> > On 22. Jun 2020, at 18:57, Corey Minyard  wrote:
> > 
> > On Mon, Jun 22, 2020 at 08:01:23PM +0800, Xin Long wrote:
> >> On Sun, Jun 21, 2020 at 11:56 PM Corey Minyard  wrote:
> >>> 
> >>> I've stumbled upon a strange problem with SCTP and IPv6.  If I create an
> >>> sctp listening socket on :: and set the IPV6_V6ONLY socket option on it,
> >>> then I make a connection to it using ::1, the connection will drop after
> >>> 2.5 seconds with an ECONNRESET error.
> >>> 
> >>> It only happens on SCTP, it doesn't have the issue if you connect to a
> >>> full IPv6 address instead of ::1, and it doesn't happen if you don't
> >>> set IPV6_V6ONLY.  I have verified current end of tree kernel.org.
> >>> I tried on an ARM system and x86_64.
> >>> 
> >>> I haven't dug into the kernel to see if I could find anything yet, but I
> >>> thought I would go ahead and report it.  I am attaching a reproducer.
> >>> Basically, compile the following code:
> >> The code only set IPV6_V6ONLY on server side, so the client side will
> >> still bind all the local ipv4 addresses (as you didn't call bind() to
> >> bind any specific addresses ). Then after the connection is created,
> >> the client will send HB on the v4 paths to the server. The server
> >> will abort the connection, as it can't support v4.
> >> 
> >> So you can work around it by either:
> >> 
> >>  - set IPV6_V6ONLY on client side.
> >> 
> >> or
> >> 
> >>  - bind to the specific v6 addresses on the client side.
> >> 
> >> I don't see RFC said something about this.
> >> So it may not be a good idea to change the current behaviour
> >> to not establish the connection in this case, which may cause regression.
> > 
> > Ok, I understand this.  It's a little strange, but I see why it works
> > this way.
> I don't. I would expect it to work as I described in my email.
> Could someone explain me how and why it is behaving different from
> my expectation?

It looks like a bug to me. Testing with this test app here, I can see
the INIT_ACK being sent with a bunch of ipv4 addresses in it and
that's unexpected for a v6only socket. As is, it's the server saying
"I'm available at these other addresses too, but not."

Thanks,
Marcelo
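
As an aside, for anyone hitting this before a kernel-side fix: the first
workaround quoted above (set IPV6_V6ONLY on the client side) is just the
standard sockopt applied before connect().  A minimal, illustrative-only
sketch, error handling omitted:

#include <netinet/in.h>
#include <sys/socket.h>

/* Workaround sketch: a v6only client does not bind IPv4 addresses, so none
 * are advertised toward the v6only server.
 */
static int connect_v6only_sctp(const struct sockaddr_in6 *srv)
{
	int fd = socket(AF_INET6, SOCK_STREAM, IPPROTO_SCTP);
	int on = 1;

	setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &on, sizeof(on));
	connect(fd, (const struct sockaddr *)srv, sizeof(*srv));
	return fd;
}

Binding the client to specific IPv6 addresses (the second workaround above)
has the same effect of keeping IPv4 addresses out of the INIT.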

> 
> Best regards
> Michael
> > 
> > Thanks,
> > 
> > -corey
> > 
> >> 
> >>> 
> >>>  gcc -g -o sctptest -Wall sctptest.c
> >>> 
> >>> and run it in one window as a server:
> >>> 
> >>>  ./sctptest a
> >>> 
> >>> (Pass in any option to be the server) and run the following in another
> >>> window as the client:
> >>> 
> >>>  ./sctptest
> >>> 
> >>> It disconnects after about 2.5 seconds.  If it works, it should just sit
> >>> there forever.
> >>> 
> >>> -corey
> >>> 
> >>> 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> #include 
> >>> 
> >>> static int
> >>> getaddr(const char *addr, const char *port, bool listen,
> >>>struct addrinfo **rai)
> >>> {
> >>>struct addrinfo *ai, hints;
> >>> 
> >>>memset(&hints, 0, sizeof(hints));
> >>>hints.ai_flags = AI_ADDRCONFIG;
> >>>if (listen)
> >>>hints.ai_flags |= AI_PASSIVE;
> >>>hints.ai_family = AF_UNSPEC;
> >>>hints.ai_socktype = SOCK_STREAM;
> >>>hints.ai_protocol = IPPROTO_SCTP;
> >>>if (getaddrinfo(addr, port, &hints, &ai)) {
> >>>perror("getaddrinfo");
> >>>return -1;
> >>>}
> >>> 
> >>>*rai = ai;
> >>>return 0;
> >>> }
> >>> 
> >>> static int
> >>> waitread(int s)
> >>> {
> >>>char data[1];
> >>>ssize_t rv;
> >>> 
> >>>rv = read(s, data, sizeof(data));
> >>>if (rv == -1) {
> >>>perror("read");
> >>>return -1;
> >>>}
> >>
