Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

2019-09-16 Thread Wanpeng Li
On Mon, 16 Sep 2019 at 15:49, Paolo Bonzini  wrote:
>
> On 16/09/19 06:02, Wanpeng Li wrote:
> > Hi Paolo,
> >
> > Something like below? It still fluctuate when running complex guest os
> > like linux. Removing timer_advance_adjust_done will hinder introduce
> > patch v3 2/2 since there is no adjust done flag in each round
> > evaluation.
>
> That's not important, since the adjustment would be continuous.
>
> How much fluctuation can you see?

After I add a trace_printk to observe more closely, the adjustment is
continuous as expected.

>
> > I have two questions here, best-effort tune always as
> > below or periodically revaluate to get conservative value and get
> > best-effort value in each round evaluation as patch v3 2/2, which one
> > do you prefer? The former one can wast time to wait sometimes and the
> > later one can not get the best latency. In addition, can the adaptive
> > tune algorithm be using in all the scenarios contain
> > RT/over-subscribe?
>
> I prefer the former, like the patch below, mostly because of the extra
> complexity of the periodic reevaluation.

How about question two?

Wanpeng


Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

2019-09-16 Thread Paolo Bonzini
On 16/09/19 06:02, Wanpeng Li wrote:
> Hi Paolo,
> 
> Something like below? It still fluctuate when running complex guest os
> like linux. Removing timer_advance_adjust_done will hinder introduce
> patch v3 2/2 since there is no adjust done flag in each round
> evaluation.

That's not important, since the adjustment would be continuous.

How much fluctuation can you see?

> I have two questions here, best-effort tune always as
> below or periodically revaluate to get conservative value and get
> best-effort value in each round evaluation as patch v3 2/2, which one
> do you prefer? The former one can wast time to wait sometimes and the
> later one can not get the best latency. In addition, can the adaptive
> tune algorithm be using in all the scenarios contain
> RT/over-subscribe?

I prefer the former, like the patch below, mostly because of the extra
complexity of the periodic reevaluation.

Paolo

> 
> ---8<---
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index 685d17c..895735b 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -69,6 +69,7 @@
>  #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
>  /* step-by-step approximation to mitigate fluctuation */
>  #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
> +#define LAPIC_TIMER_ADVANCE_FILTER 5000
> 
>  static inline int apic_test_vector(int vec, void *bitmap)
>  {
> @@ -1479,29 +1480,28 @@ static inline void
> adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
>s64 advance_expire_delta)
>  {
>  struct kvm_lapic *apic = vcpu->arch.apic;
> -u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
> -u64 ns;
> +u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
> +
> +if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER ||
> +abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
> +/* filter out random fluctuations */
> +return;
> +}
> 
>  /* too early */
>  if (advance_expire_delta < 0) {
>  ns = -advance_expire_delta * 100ULL;
>  do_div(ns, vcpu->arch.virtual_tsc_khz);
> -timer_advance_ns -= min((u32)ns,
> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> +timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>  } else {
>  /* too late */
>  ns = advance_expire_delta * 100ULL;
>  do_div(ns, vcpu->arch.virtual_tsc_khz);
> -timer_advance_ns += min((u32)ns,
> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> +timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>  }
> 
> -if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> -apic->lapic_timer.timer_advance_adjust_done = true;
> -if (unlikely(timer_advance_ns > 5000)) {
> +if (unlikely(timer_advance_ns > 5000))
>  timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> -apic->lapic_timer.timer_advance_adjust_done = false;
> -}
>  apic->lapic_timer.timer_advance_ns = timer_advance_ns;
>  }
> 
> @@ -1521,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu 
> *vcpu)
>  if (guest_tsc < tsc_deadline)
>  __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);
> 
> -if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
> +if (lapic_timer_advance_ns == -1)
>  adjust_lapic_timer_advance(vcpu,
> apic->lapic_timer.advance_expire_delta);
>  }
> 
> @@ -2298,10 +2298,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int
> timer_advance_ns)
>  apic->lapic_timer.timer.function = apic_timer_fn;
>  if (timer_advance_ns == -1) {
>  apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
> -apic->lapic_timer.timer_advance_adjust_done = false;
>  } else {
>  apic->lapic_timer.timer_advance_ns = timer_advance_ns;
> -apic->lapic_timer.timer_advance_adjust_done = true;
>  }
> 
> 
> diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
> index 50053d2..2aad7e2 100644
> --- a/arch/x86/kvm/lapic.h
> +++ b/arch/x86/kvm/lapic.h
> @@ -35,7 +35,6 @@ struct kvm_timer {
>  s64 advance_expire_delta;
>  atomic_t pending;/* accumulated triggered timers */
>  bool hv_timer_in_use;
> -bool timer_advance_adjust_done;
>  };
> 
>  struct kvm_lapic {
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 93b0bd4..4f65ef1 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -141,7 +141,7 @@
>   * advancement entirely.  Any other value is used as-is and disables adaptive
>   * tuning, i.e. allows priveleged userspace to set an exact advancement time.
>   */
> -static int __read_mostly lapic_timer_advance_ns = -1;
> +int __read_mostly lapic_timer_advance_ns = -1;
>  module_param(lapic_timer_advance_ns, int, S_IRUGO | S_IWUSR);
> 
>  static bool __read_mostly vector_hashing = true;
> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
> index 6594020..2c6ba86 100644
> --- 

Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

2019-09-15 Thread Wanpeng Li
On Thu, 12 Sep 2019 at 20:37, Paolo Bonzini  wrote:
>
> On 12/09/19 02:34, Wanpeng Li wrote:
> >>> -timer_advance_ns -= min((u32)ns,
> >>> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >>> +timer_advance_ns -= ns;
>
> Looking more closely, this assignment...
>
> >>>} else {
> >>>/* too late */
> >>>ns = advance_expire_delta * 100ULL;
> >>>do_div(ns, vcpu->arch.virtual_tsc_khz);
> >>> -timer_advance_ns += min((u32)ns,
> >>> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> >>> +timer_advance_ns += ns;
>
> ... and this one are dead code now.  However...
>
> >>>}
> >>>
> >>> +timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
> >>> +(LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
> >>> +LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> ... you should instead remove this new assignment and just make the
> assignments above just
>
> timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> and
>
> timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;
>
> In fact this whole last assignment is buggy, since advance_expire_delta
> is in TSC units rather than nanoseconds.
>
> >>>if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
> >>>apic->lapic_timer.timer_advance_adjust_done = true;
> >>>if (unlikely(timer_advance_ns > 5000)) {
> >> This looks great.  But instead of patch 2, why not remove
> >> timer_advance_adjust_done altogether?
> > It can fluctuate w/o stop.
>
> Possibly because of the wrong calculation of timer_advance_ns?

Hi Paolo,

Something like below? It still fluctuate when running complex guest os
like linux. Removing timer_advance_adjust_done will hinder introduce
patch v3 2/2 since there is no adjust done flag in each round
evaluation. I have two questions here, best-effort tune always as
below or periodically revaluate to get conservative value and get
best-effort value in each round evaluation as patch v3 2/2, which one
do you prefer? The former one can wast time to wait sometimes and the
later one can not get the best latency. In addition, can the adaptive
tune algorithm be using in all the scenarios contain
RT/over-subscribe?

---8<---
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 685d17c..895735b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -69,6 +69,7 @@
 #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
 /* step-by-step approximation to mitigate fluctuation */
 #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
+#define LAPIC_TIMER_ADVANCE_FILTER 5000

 static inline int apic_test_vector(int vec, void *bitmap)
 {
@@ -1479,29 +1480,28 @@ static inline void
adjust_lapic_timer_advance(struct kvm_vcpu *vcpu,
   s64 advance_expire_delta)
 {
 struct kvm_lapic *apic = vcpu->arch.apic;
-u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
-u64 ns;
+u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
+
+if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER ||
+abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE) {
+/* filter out random fluctuations */
+return;
+}

 /* too early */
 if (advance_expire_delta < 0) {
 ns = -advance_expire_delta * 100ULL;
 do_div(ns, vcpu->arch.virtual_tsc_khz);
-timer_advance_ns -= min((u32)ns,
-timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+timer_advance_ns -= ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
 } else {
 /* too late */
 ns = advance_expire_delta * 100ULL;
 do_div(ns, vcpu->arch.virtual_tsc_khz);
-timer_advance_ns += min((u32)ns,
-timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
+timer_advance_ns += ns/LAPIC_TIMER_ADVANCE_ADJUST_STEP;
 }

-if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
-apic->lapic_timer.timer_advance_adjust_done = true;
-if (unlikely(timer_advance_ns > 5000)) {
+if (unlikely(timer_advance_ns > 5000))
 timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
-apic->lapic_timer.timer_advance_adjust_done = false;
-}
 apic->lapic_timer.timer_advance_ns = timer_advance_ns;
 }

@@ -1521,7 +1521,7 @@ static void __kvm_wait_lapic_expire(struct kvm_vcpu *vcpu)
 if (guest_tsc < tsc_deadline)
 __wait_lapic_expire(vcpu, tsc_deadline - guest_tsc);

-if (unlikely(!apic->lapic_timer.timer_advance_adjust_done))
+if (lapic_timer_advance_ns == -1)
 adjust_lapic_timer_advance(vcpu,
apic->lapic_timer.advance_expire_delta);
 }

@@ -2298,10 +2298,8 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int
timer_advance_ns)
 apic->lapic_timer.timer.function = apic_timer_fn;
 if (timer_advance_ns == -1) {
 apic->lapic_timer.timer_advance_ns = LAPIC_TIMER_ADVANCE_ADJUST_INIT;
-apic->lapic_timer.timer_advance_adjust_done = false;
 } else {
 

Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

2019-09-12 Thread Paolo Bonzini
On 12/09/19 02:34, Wanpeng Li wrote:
>>> -timer_advance_ns -= min((u32)ns,
>>> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>>> +timer_advance_ns -= ns;

Looking more closely, this assignment...

>>>} else {
>>>/* too late */
>>>ns = advance_expire_delta * 100ULL;
>>>do_div(ns, vcpu->arch.virtual_tsc_khz);
>>> -timer_advance_ns += min((u32)ns,
>>> -timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
>>> +timer_advance_ns += ns;

... and this one are dead code now.  However...

>>>}
>>>
>>> +timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
>>> +(LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
>>> +LAPIC_TIMER_ADVANCE_ADJUST_STEP;

... you should instead remove this new assignment and just make the
assignments above just

timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;

and

timer_advance -= ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP;

In fact this whole last assignment is buggy, since advance_expire_delta
is in TSC units rather than nanoseconds.

>>>if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
>>>apic->lapic_timer.timer_advance_adjust_done = true;
>>>if (unlikely(timer_advance_ns > 5000)) {
>> This looks great.  But instead of patch 2, why not remove
>> timer_advance_adjust_done altogether?
> It can fluctuate w/o stop.

Possibly because of the wrong calculation of timer_advance_ns?

Paolo


Re: [PATCH v3 1/2] KVM: LAPIC: Tune lapic_timer_advance_ns smoothly

2019-09-11 Thread Paolo Bonzini
On 28/08/19 10:19, Wanpeng Li wrote:
> From: Wanpeng Li 
> 
> Using a moving average based on per-vCPU lapic_timer_advance_ns to tune 
> smoothly, filter out drastic fluctuation which prevents this before, 
> let's assume it is 1 cycles.
> 
> Cc: Paolo Bonzini 
> Cc: Radim Krčmář 
> Signed-off-by: Wanpeng Li 
> ---
>  arch/x86/kvm/lapic.c | 18 --
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
> index e904ff0..181537a 100644
> --- a/arch/x86/kvm/lapic.c
> +++ b/arch/x86/kvm/lapic.c
> @@ -69,6 +69,7 @@
>  #define LAPIC_TIMER_ADVANCE_ADJUST_INIT 1000
>  /* step-by-step approximation to mitigate fluctuation */
>  #define LAPIC_TIMER_ADVANCE_ADJUST_STEP 8
> +#define LAPIC_TIMER_ADVANCE_FILTER 1
>  
>  static inline int apic_test_vector(int vec, void *bitmap)
>  {
> @@ -1484,23 +1485,28 @@ static inline void adjust_lapic_timer_advance(struct 
> kvm_vcpu *vcpu,
> s64 advance_expire_delta)
>  {
>   struct kvm_lapic *apic = vcpu->arch.apic;
> - u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns;
> - u64 ns;
> + u32 timer_advance_ns = apic->lapic_timer.timer_advance_ns, ns;
> +
> + if (abs(advance_expire_delta) > LAPIC_TIMER_ADVANCE_FILTER)
> + /* filter out drastic fluctuations */
> + return;
>  
>   /* too early */
>   if (advance_expire_delta < 0) {
>   ns = -advance_expire_delta * 100ULL;
>   do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns -= min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns -= ns;
>   } else {
>   /* too late */
>   ns = advance_expire_delta * 100ULL;
>   do_div(ns, vcpu->arch.virtual_tsc_khz);
> - timer_advance_ns += min((u32)ns,
> - timer_advance_ns / LAPIC_TIMER_ADVANCE_ADJUST_STEP);
> + timer_advance_ns += ns;
>   }
>  
> + timer_advance_ns = (apic->lapic_timer.timer_advance_ns *
> + (LAPIC_TIMER_ADVANCE_ADJUST_STEP - 1) + advance_expire_delta) /
> + LAPIC_TIMER_ADVANCE_ADJUST_STEP;
> +
>   if (abs(advance_expire_delta) < LAPIC_TIMER_ADVANCE_ADJUST_DONE)
>   apic->lapic_timer.timer_advance_adjust_done = true;
>   if (unlikely(timer_advance_ns > 5000)) {
> 

This looks great.  But instead of patch 2, why not remove
timer_advance_adjust_done altogether?

Paolo