Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-08 Thread Paolo Bonzini


On 08/01/2015 18:41, Marcelo Tosatti wrote:
 Paolo?
 
  And cover letter is a bit misleading:  The condition does nothing to
  guarantee a TSC-based __delay() loop.  (Right now, __delay() == delay_tsc()
  whenever the hardware has TSC, regardless of stability, thus always.)
 
 OK.

Yes, because of this it is unnecessary to check for stable TSC.  I'll
remove that.

Paolo


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-08 Thread Marcelo Tosatti
On Mon, Jan 05, 2015 at 07:12:36PM +0100, Radim Krcmar wrote:
 2014-12-23 15:58-0500, Marcelo Tosatti:
  For the hrtimer which emulates the tscdeadline timer in the guest,
  add an option to advance expiration, and busy spin on VM-entry waiting
  for the actual expiration time to elapse.
  
  This allows achieving low latencies in cyclictest (or any scenario 
  which requires strict timing regarding timer expiration).
  
  Reduces average cyclictest latency from 12us to 8us
  on Core i5 desktop.
  
  Note: this option requires tuning to find the appropriate value 
  for a particular hardware/guest combination. One method is to measure the 
  average delay between apic_timer_fn and VM-entry. 
  Another method is to start with 1000ns, and increase the value
  in say 500ns increments until avg cyclictest numbers stop decreasing.
  
  Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
 
 Reviewed-by: Radim Krčmář <rkrc...@redhat.com>
 
 (Other patches weren't touched, so my previous Reviewed-by holds.)
 
  +++ kvm/arch/x86/kvm/x86.c
  @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
   static u32 tsc_tolerance_ppm = 250;
   module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
   
  +/* lapic timer advance (tscdeadline mode only) in nanoseconds */
  +unsigned int lapic_timer_advance_ns = 0;
  +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
  +
   static bool backwards_tsc_observed = false;
   
   #define KVM_NR_SHARED_MSRS 16
  @@ -5625,6 +5629,10 @@ static void kvm_timer_init(void)
   __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
  cpu_notifier_register_done();
   
  +   if (check_tsc_unstable() && lapic_timer_advance_ns) {
  +   pr_info("kvm: unstable TSC, disabling "
  "lapic_timer_advance_ns\n");
  +   lapic_timer_advance_ns = 0;
 
 Does unstable TSC invalidate this feature?
 (lapic_timer_advance_ns can be overridden, so we don't differentiate
  workflows that calibrate after starting with 0.)

Hmm, I don't see why. Paolo?

 And cover letter is a bit misleading:  The condition does nothing to
 guarantee a TSC-based __delay() loop.  (Right now, __delay() == delay_tsc()
 whenever the hardware has TSC, regardless of stability, thus always.)

OK.



Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-05 Thread Radim Krcmar
2014-12-23 15:58-0500, Marcelo Tosatti:
 For the hrtimer which emulates the tscdeadline timer in the guest,
 add an option to advance expiration, and busy spin on VM-entry waiting
 for the actual expiration time to elapse.
 
 This allows achieving low latencies in cyclictest (or any scenario 
 which requires strict timing regarding timer expiration).
 
 Reduces average cyclictest latency from 12us to 8us
 on Core i5 desktop.
 
 Note: this option requires tuning to find the appropriate value 
 for a particular hardware/guest combination. One method is to measure the 
 average delay between apic_timer_fn and VM-entry. 
 Another method is to start with 1000ns, and increase the value
 in say 500ns increments until avg cyclictest numbers stop decreasing.
 
 Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

Reviewed-by: Radim Krčmář <rkrc...@redhat.com>

(Other patches weren't touched, so my previous Reviewed-by holds.)

 +++ kvm/arch/x86/kvm/x86.c
 @@ -108,6 +108,10 @@ EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz)
  static u32 tsc_tolerance_ppm = 250;
  module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
  
 +/* lapic timer advance (tscdeadline mode only) in nanoseconds */
 +unsigned int lapic_timer_advance_ns = 0;
 +module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR);
 +
  static bool backwards_tsc_observed = false;
  
  #define KVM_NR_SHARED_MSRS 16
 @@ -5625,6 +5629,10 @@ static void kvm_timer_init(void)
  __register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
   cpu_notifier_register_done();
  
  + if (check_tsc_unstable() && lapic_timer_advance_ns) {
  + pr_info("kvm: unstable TSC, disabling "
  "lapic_timer_advance_ns\n");
 + lapic_timer_advance_ns = 0;

Does unstable TSC invalidate this feature?
(lapic_timer_advance_ns can be overridden, so we don't differentiate
 workflows that calibrate after starting with 0.)

And cover letter is a bit misleading:  The condition does nothing to
guarantee a TSC-based __delay() loop.  (Right now, __delay() == delay_tsc()
whenever the hardware has TSC, regardless of stability, thus always.)


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2015-01-05 Thread Radim Krcmar
2015-01-05 19:12+0100, Radim Krcmar:
 (Right now, __delay() == delay_tsc()
 whenever the hardware has TSC, regardless of stability, thus always.)

(For quantifiers' sake, there is also the 'tsc_disabled' variable.)
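
For context, the x86 __delay() dispatch works roughly like this
(paraphrased from arch/x86/lib/delay.c of that era, not verbatim):
tsc_init() calls use_tsc_delay() whenever the CPU has a usable TSC,
which is what makes __delay(cycles) in wait_lapic_expire() count TSC
cycles.

	static void (*delay_fn)(unsigned long) = delay_loop; /* boot default */

	void use_tsc_delay(void)
	{
		delay_fn = delay_tsc;	/* picked when the TSC is usable */
	}

	void __delay(unsigned long loops)
	{
		delay_fn(loops);	/* delay_tsc() treats loops as TSC cycles */
	}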


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-23 Thread Paolo Bonzini


On 18/12/2014 13:24, Marcelo Tosatti wrote:
 True. I can change to a direct wait if that is preferred.

What about this:

guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
/* We know that __delay is delay_tsc, see kvm_timer_init.  */
if (guest_tsc < tsc_deadline)
__delay(tsc_deadline - guest_tsc);

and a check in kvm_timer_init:

if (check_tsc_unstable() && lapic_timer_advance_ns) {
pr_info("kvm: unstable TSC, disabling "
"lapic_timer_advance_ns\n");
lapic_timer_advance_ns = 0;
}
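
To make the arithmetic concrete, a worked example with hypothetical
numbers (virtual_tsc_khz = 3000000, i.e. 3 GHz, and
lapic_timer_advance_ns = 7000):

	/*
	 * Guest arms a deadline 30000 TSC cycles ahead.  start_apic_timer():
	 *   ns     = 30000 * 1000000ULL / 3000000 = 10000ns
	 *   expire = now + 10000ns - 7000ns = now + 3000ns
	 * The hrtimer fires ~7us early; at VM-entry guest_tsc trails
	 * tsc_deadline by up to ~21000 cycles, and the __delay() above
	 * spins out exactly that remainder.
	 */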

Paolo


[patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-23 Thread Marcelo Tosatti
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.

This allows achieving low latencies in cyclictest (or any scenario 
which requires strict timing regarding timer expiration).

Reduces average cyclictest latency from 12us to 8us
on Core i5 desktop.

Note: this option requires tuning to find the appropriate value 
for a particular hardware/guest combination. One method is to measure the 
average delay between apic_timer_fn and VM-entry. 
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.

Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

Index: kvm/arch/x86/kvm/lapic.c
===
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
 {
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+   struct kvm_timer *ktimer = &apic->lapic_timer;
 
/*
 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,60 @@ static void apic_timer_expired(struct kv
 
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+   if (apic_lvtt_tscdeadline(apic))
+   ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+   if (kvm_apic_hw_enabled(apic)) {
+   int vec = reg & APIC_VECTOR_MASK;
+
+   if (kvm_x86_ops->test_posted_interrupt)
+   return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+   else {
+   if (apic_test_vector(vec, apic->regs + APIC_ISR))
+   return true;
+   }
+   }
+   return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u64 guest_tsc, tsc_deadline;
+
+   if (!kvm_vcpu_has_lapic(vcpu))
+   return;
+
+   if (apic->lapic_timer.expired_tscdeadline == 0)
+   return;
+
+   if (!lapic_timer_int_injected(vcpu))
+   return;
+
+   tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+   apic->lapic_timer.expired_tscdeadline = 0;
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+   if (guest_tsc < tsc_deadline)
+   __delay(tsc_deadline - guest_tsc);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
 
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1188,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+   ktime_t expire;
struct kvm_vcpu *vcpu = apic-vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1203,10 @@ static void start_apic_timer(struct kvm_
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+   expire = ktime_add_ns(now, ns);
+   expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
-   ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
 
Index: kvm/arch/x86/kvm/lapic.h
===
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+   u64 expired_tscdeadline;
atomic_t pending;   /* accumulated triggered timers 
*/
 };
 
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/x86.c
===
--- kvm.orig/arch/x86/kvm/x86.c
+++ kvm/arch/x86/kvm/x86.c
@@ -108,6 +108,10 @@ 

Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-18 Thread Marcelo Tosatti
On Wed, Dec 17, 2014 at 08:36:27PM +0100, Radim Krcmar wrote:
 2014-12-17 15:41-0200, Marcelo Tosatti:
  On Wed, Dec 17, 2014 at 03:58:13PM +0100, Radim Krcmar wrote:
   2014-12-16 09:08-0500, Marcelo Tosatti:
+   tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+   apic->lapic_timer.expired_tscdeadline = 0;
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+   while (guest_tsc < tsc_deadline) {
+   int delay = min(tsc_deadline - guest_tsc, 1000ULL);
   
   Why break the __delay() loop into smaller parts?
  
  So that you can handle interrupts, in case this code ever moves
  outside IRQ protected region.
 
 __delay() works only if it is delay_tsc(), which has this handled ...
 (It even considers rescheduling with unsynchronized TSC.)
 
 delay_tsc(delay) translates roughly to
 
   end = read_tsc() + delay;
    while (read_tsc() < end);
 
 so the code of our while loop has a structure like
 
    while ((guest_tsc = read_tsc()) < tsc_deadline) {
  end = read_tsc() + min(tsc_deadline - guest_tsc, 1000);
  while (read_tsc() < end);
   }
 
 which complicates our original idea of
 
    while (read_tsc() < tsc_deadline);
 
 (but I'm completely fine with it.)

True. I can change to a direct wait if that is preferred.
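
A minimal sketch of that direct wait, using the same TSC read as the
patch (and assuming the loop stays inside the IRQs-disabled VM-entry
path):

	/* direct form of "while (read_tsc() < tsc_deadline);" */
	while (kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()) < tsc_deadline)
		cpu_relax();	/* spin politely for the sibling hyperthread */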

+   __delay(delay);
   
   (Does not have to call delay_tsc, but I guess it won't change.)
   
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, 
native_read_tsc());
+   }
 }
 
   
   Btw. simple automatic delta tuning had worse results?
  
  Haven't tried automatic tuning.
  
   So what happens in a realtime environment is this: you execute a fixed
   number of instructions from interrupt handling all the way to VM-entry.
   
   Well, almost fixed. Fixed is the number of apic_timer_fn plus KVM
   instructions. You can also execute host scheduler and timekeeping
   processing.
   
   In practice, the time to execute that instruction sequence follows a
   bell-shaped normal distribution around the average (the right tail is
   slightly heavier due to host scheduler and timekeeping processing).
   
   You want to advance the timer by the rightmost bucket; that way you
   guarantee the lowest possible latencies (which is the interest here).
 
 (Lower latencies would likely be achieved by having a timer that issues
  posted interrupts from another CPU, and the guest set to busy idle.)

Yes.

   That said, I don't see an advantage in automatic tuning for the use
   case this targets.
 
 Thanks, it doesn't make much difference in the long RT setup checklist.

Exactly.

 ---
 I was asking just because I consider programming to equal automation ...
 If we know that we will always set this to the rightmost bucket anyway,
 it could be done like this
 
   if ((s64)(delta = guest_tsc - tsc_deadline) > 0)
 tsc_deadline_delta += delta;
   ...
   advance_ns = kvm_tsc_to_ns(tsc_deadline_delta);
 
 instead of a script that runs a test and sets the variable.
 (On the other hand, it would probably have to be more complicated to
  reach the same level of flexibility.)

You'd have to guarantee that the vCPUs are never interrupted by other
work, such as processing host interrupts; otherwise you could get high
increments for tsc_deadline_delta.

So to tune that value you do:

1) Boot the guest.
2) Set up certain vCPUs as realtime (large checklist), which includes
pinning and host interrupt routing.
3) Measure with cyclictest on those vCPUs under the realtime conditions.

So it's also a matter of configuration.

But yes, the code above would set advance_ns to the rightmost bucket.
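
For reference, a fleshed-out version of the accumulate-until-no-overshoot
idea (hypothetical code, not part of this series; the function name and
the use of div_u64 are assumptions):

	static u64 tsc_deadline_delta;	/* accumulated overshoot, TSC cycles */

	static void autotune_lapic_advance(struct kvm_vcpu *vcpu,
					   u64 guest_tsc, u64 tsc_deadline)
	{
		s64 delta = guest_tsc - tsc_deadline;

		/* VM-entry happened after the deadline: grow the advance by
		 * the overshoot so it converges on the rightmost bucket. */
		if (delta > 0)
			tsc_deadline_delta += delta;

		/* cycles -> ns, the same conversion start_apic_timer() uses.
		 * Inflated estimates from host-interrupt preemption are the
		 * caveat discussed above. */
		lapic_timer_advance_ns = div_u64(tsc_deadline_delta * 1000000ULL,
						 vcpu->arch.virtual_tsc_khz);
	}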




Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-17 Thread Radim Krcmar
2014-12-16 09:08-0500, Marcelo Tosatti:
 For the hrtimer which emulates the tscdeadline timer in the guest,
 add an option to advance expiration, and busy spin on VM-entry waiting
 for the actual expiration time to elapse.
 
 This allows achieving low latencies in cyclictest (or any scenario 
 which requires strict timing regarding timer expiration).
 
 Reduces average cyclictest latency from 12us to 8us
 on Core i5 desktop.
 
 Note: this option requires tuning to find the appropriate value 
 for a particular hardware/guest combination. One method is to measure the 
 average delay between apic_timer_fn and VM-entry. 
 Another method is to start with 1000ns, and increase the value
 in say 500ns increments until avg cyclictest numbers stop decreasing.
 
 Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

Reviewed-by: Radim Krčmář <rkrc...@redhat.com>

 +++ kvm/arch/x86/kvm/lapic.c
 @@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
[...]
 +/*
 + * On APICv, this test will cause a busy wait
 + * during a higher-priority task.
 + */

(A bit confusing ... this test doesn't busy wait.)

 +
 +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
[...]
 +void wait_lapic_expire(struct kvm_vcpu *vcpu)
 +{
[...]
 + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 + apic->lapic_timer.expired_tscdeadline = 0;
 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 +
 + while (guest_tsc < tsc_deadline) {
 + int delay = min(tsc_deadline - guest_tsc, 1000ULL);

Why break the __delay() loop into smaller parts?

 +
 + __delay(delay);

(Does not have to call delay_tsc, but I guess it won't change.)

 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 + }
  }
  

Btw. simple automatic delta tuning had worse results?


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-17 Thread Marcelo Tosatti
On Wed, Dec 17, 2014 at 03:58:13PM +0100, Radim Krcmar wrote:
 2014-12-16 09:08-0500, Marcelo Tosatti:
  For the hrtimer which emulates the tscdeadline timer in the guest,
  add an option to advance expiration, and busy spin on VM-entry waiting
  for the actual expiration time to elapse.
  
  This allows achieving low latencies in cyclictest (or any scenario 
  which requires strict timing regarding timer expiration).
  
  Reduces average cyclictest latency from 12us to 8us
  on Core i5 desktop.
  
  Note: this option requires tuning to find the appropriate value 
  for a particular hardware/guest combination. One method is to measure the 
  average delay between apic_timer_fn and VM-entry. 
  Another method is to start with 1000ns, and increase the value
  in say 500ns increments until avg cyclictest numbers stop decreasing.
  
  Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
 
 Reviewed-by: Radim Krčmář <rkrc...@redhat.com>
 
  +++ kvm/arch/x86/kvm/lapic.c
  @@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
 [...]
  +/*
  + * On APICv, this test will cause a busy wait
  + * during a higher-priority task.
  + */
 
 (A bit confusing ... this test doesn't busy wait.)
 
  +
  +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 [...]
  +void wait_lapic_expire(struct kvm_vcpu *vcpu)
  +{
 [...]
  +   tsc_deadline = apic->lapic_timer.expired_tscdeadline;
  +   apic->lapic_timer.expired_tscdeadline = 0;
  +   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
  +
  +   while (guest_tsc < tsc_deadline) {
  +   int delay = min(tsc_deadline - guest_tsc, 1000ULL);
 
 Why break the __delay() loop into smaller parts?

So that you can handle interrupts, in case this code ever moves
outside IRQ protected region.

  +   __delay(delay);
 
 (Does not have to call delay_tsc, but I guess it won't change.)
 
  +   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
  +   }
   }
   
 
 Btw. simple automatic delta tuning had worse results?

Haven't tried automatic tuning.

So what happens in a realtime environment is this: you execute a fixed
number of instructions from interrupt handling all the way to VM-entry.

Well, almost fixed. Fixed is the number of apic_timer_fn plus KVM
instructions. You can also execute host scheduler and timekeeping
processing.

In practice, the time to execute that instruction sequence follows a
bell-shaped normal distribution around the average (the right tail is
slightly heavier due to host scheduler and timekeeping processing).

You want to advance the timer by the rightmost bucket; that way you
guarantee the lowest possible latencies (which is the interest here).
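
(Concretely, with hypothetical numbers: if the measured apic_timer_fn to
VM-entry delays bucket into 2us through 7us, advancing by 7000ns lets
even the slowest path complete before the guest's deadline, while the
faster paths burn the difference in the busy wait.)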

That said, I don't see an advantage in automatic tuning for the use
case this targets.




Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-17 Thread Paolo Bonzini


On 17/12/2014 18:41, Marcelo Tosatti wrote:
  Btw. simple automatic delta tuning had worse results?
 Haven't tried automatic tuning.
 
 So what happens in a realtime environment is this: you execute a fixed
 number of instructions from interrupt handling all the way to VM-entry.
 
 Well, almost fixed. Fixed is the number of apic_timer_fn plus KVM
 instructions. You can also execute host scheduler and timekeeping
 processing.
 
 In practice, the time to execute that instruction sequence follows a
 bell-shaped normal distribution around the average (the right tail is
 slightly heavier due to host scheduler and timekeeping processing).
 
 You want to advance the timer by the rightmost bucket; that way you
 guarantee the lowest possible latencies (which is the interest here).
 
 That said, I don't see an advantage in automatic tuning for the use
 case this targets.

Yeah, the value of the parameter can be computed pretty easily from
either the tracepoint or the kvm-unit-tests testcase.
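
For instance, a sketch (the kvm_wait_lapic_expire tracepoint comes from
this series and logs guest_tsc - tsc_deadline in TSC cycles; the numbers
and the helper below are hypothetical): convert the worst observed delta
to nanoseconds and write it to
/sys/module/kvm/parameters/lapic_timer_advance_ns, which the S_IWUSR in
module_param() makes writable at runtime.

	#include <stdio.h>

	/* cycles * 10^6 / tsc_khz = ns (tsc_khz: host TSC frequency in kHz) */
	static unsigned long long cycles_to_advance_ns(unsigned long long cycles,
						       unsigned long tsc_khz)
	{
		return cycles * 1000000ULL / tsc_khz;
	}

	int main(void)
	{
		/* assumed measurement: worst delta of 21000 cycles at 3 GHz */
		printf("%llu\n", cycles_to_advance_ns(21000ULL, 3000000UL));
		return 0;	/* prints 7000 */
	}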

Paolo


Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-17 Thread Radim Krcmar
2014-12-17 15:41-0200, Marcelo Tosatti:
 On Wed, Dec 17, 2014 at 03:58:13PM +0100, Radim Krcmar wrote:
  2014-12-16 09:08-0500, Marcelo Tosatti:
   + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
   + apic->lapic_timer.expired_tscdeadline = 0;
   + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
   +
   + while (guest_tsc < tsc_deadline) {
   + int delay = min(tsc_deadline - guest_tsc, 1000ULL);
  
  Why break the __delay() loop into smaller parts?
 
 So that you can handle interrupts, in case this code ever moves
 outside IRQ protected region.

__delay() works only if it is delay_tsc(), which has this handled ...
(It even considers rescheduling with unsynchronized TSC.)

delay_tsc(delay) translates roughly to

  end = read_tsc() + delay;
  while (read_tsc() < end);

so the code of our while loop has a structure like

  while ((guest_tsc = read_tsc()) < tsc_deadline) {
end = read_tsc() + min(tsc_deadline - guest_tsc, 1000);
while (read_tsc() < end);
  }

which complicates our original idea of

  while (read_tsc() < tsc_deadline);

(but I'm completely fine with it.)

   + __delay(delay);
  
  (Does not have to call delay_tsc, but I guess it won't change.)
  
   + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
   + }
}

  
  Btw. simple automatic delta tuning had worse results?
 
 Haven't tried automatic tuning.
 
  So what happens in a realtime environment is this: you execute a fixed
  number of instructions from interrupt handling all the way to VM-entry.
  
  Well, almost fixed. Fixed is the number of apic_timer_fn plus KVM
  instructions. You can also execute host scheduler and timekeeping
  processing.
  
  In practice, the time to execute that instruction sequence follows a
  bell-shaped normal distribution around the average (the right tail is
  slightly heavier due to host scheduler and timekeeping processing).
  
  You want to advance the timer by the rightmost bucket; that way you
  guarantee the lowest possible latencies (which is the interest here).

(Lower latencies would likely be achieved by having a timer that issues
 posted interrupts from another CPU, and the guest set to busy idle.)

 That said, I don't see an advantage in automatic tuning for the use
 case this targets.

Thanks, it doesn't make much difference in the long RT setup checklist.


---
I was asking just because I consider programming to equal automation ...
If we know that we will always set this to the rightmost bucket anyway,
it could be done like this

  if ((s64)(delta = guest_tsc - tsc_deadline) > 0)
tsc_deadline_delta += delta;
  ...
  advance_ns = kvm_tsc_to_ns(tsc_deadline_delta);

instead of a script that runs a test and sets the variable.
(On the other hand, it would probably have to be more complicated to
 reach the same level of flexibility.)


[patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-16 Thread Marcelo Tosatti
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.

This allows achieving low latencies in cyclictest (or any scenario 
which requires strict timing regarding timer expiration).

Reduces average cyclictest latency from 12us to 8us
on Core i5 desktop.

Note: this option requires tuning to find the appropriate value 
for a particular hardware/guest combination. One method is to measure the 
average delay between apic_timer_fn and VM-entry. 
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.

Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

Index: kvm/arch/x86/kvm/lapic.c
===
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
 {
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+   struct kvm_timer *ktimer = &apic->lapic_timer;
 
/*
 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
 
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+   if (apic_lvtt_tscdeadline(apic))
+   ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+   if (kvm_apic_hw_enabled(apic)) {
+   int vec = reg & APIC_VECTOR_MASK;
+
+   if (kvm_x86_ops->test_posted_interrupt)
+   return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+   else {
+   if (apic_test_vector(vec, apic->regs + APIC_ISR))
+   return true;
+   }
+   }
+   return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u64 guest_tsc, tsc_deadline;
+
+   if (!kvm_vcpu_has_lapic(vcpu))
+   return;
+
+   if (apic->lapic_timer.expired_tscdeadline == 0)
+   return;
+
+   if (!lapic_timer_int_injected(vcpu))
+   return;
+
+   tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+   apic->lapic_timer.expired_tscdeadline = 0;
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+   while (guest_tsc < tsc_deadline) {
+   int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+   __delay(delay);
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+   }
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
 
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1192,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+   ktime_t expire;
struct kvm_vcpu *vcpu = apic-vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1207,10 @@ static void start_apic_timer(struct kvm_
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+   expire = ktime_add_ns(now, ns);
+   expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
-   ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
 
Index: kvm/arch/x86/kvm/lapic.h
===
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+   u64 expired_tscdeadline;
atomic_t pending;   /* accumulated triggered timers 
*/
 };
 
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/x86.c

Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-16 Thread Paolo Bonzini
On 15/12/2014 23:06, Marcelo Tosatti wrote:
 For the hrtimer which emulates the tscdeadline timer in the guest,
 add an option to advance expiration, and busy spin on VM-entry waiting
 for the actual expiration time to elapse.
 
 This allows achieving low latencies in cyclictest (or any scenario 
 which requires strict timing regarding timer expiration).
 
 Reduces average cyclictest latency from 12us to 8us
 on Core i5 desktop.
 
 Note: this option requires tuning to find the appropriate value 
 for a particular hardware/guest combination. One method is to measure the 
 average delay between apic_timer_fn and VM-entry. 
 Another method is to start with 1000ns, and increase the value
 in say 500ns increments until avg cyclictest numbers stop decreasing.
 
 Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
 
 Index: kvm/arch/x86/kvm/lapic.c
 ===
 --- kvm.orig/arch/x86/kvm/lapic.c
 +++ kvm/arch/x86/kvm/lapic.c
 @@ -33,6 +33,7 @@
  #include <asm/page.h>
  #include <asm/current.h>
  #include <asm/apicdef.h>
 +#include <asm/delay.h>
  #include <linux/atomic.h>
  #include <linux/jump_label.h>
  #include "kvm_cache_regs.h"
 @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
  {
   struct kvm_vcpu *vcpu = apic->vcpu;
   wait_queue_head_t *q = &vcpu->wq;
 + struct kvm_timer *ktimer = &apic->lapic_timer;
  
   /*
* Note: KVM_REQ_PENDING_TIMER is implicitly checked in
 @@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
  
   if (waitqueue_active(q))
   wake_up_interruptible(q);
 +
 + if (apic_lvtt_tscdeadline(apic))
 + ktimer->expired_tscdeadline = ktimer->tscdeadline;
 +}
 +
 +/*
 + * On APICv, this test will cause a busy wait
 + * during a higher-priority task.
 + */
 +
 +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 +{
 + struct kvm_lapic *apic = vcpu->arch.apic;
 + u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
 +
 + if (kvm_apic_hw_enabled(apic)) {
 + int vec = reg & APIC_VECTOR_MASK;
 +
 + if (kvm_x86_ops->test_posted_interrupt)
 + return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
 + else {
 + if (apic_test_vector(vec, apic->regs + APIC_ISR))
 + return true;
 + }
 + }
 + return false;
 +}
 +
 +void wait_lapic_expire(struct kvm_vcpu *vcpu)
 +{
 + struct kvm_lapic *apic = vcpu->arch.apic;
 + u64 guest_tsc, tsc_deadline;
 +
 + if (!kvm_vcpu_has_lapic(vcpu))
 + return;
 +
 + if (apic->lapic_timer.expired_tscdeadline == 0)
 + return;
 +
 + if (!lapic_timer_int_injected(vcpu))
 + return;

By the time we get here, I think, if expired_tscdeadline != 0 we're sure
that the interrupt has been injected.  It may be in IRR rather than ISR,
but at least on APICv the last test should be redundant.

So perhaps you can get rid of patch 1 and check
kvm_apic_vid_enabled(vcpu->kvm):

if (k_a_v_e(vcpu->kvm))
return true;
if (apic_test_vector(vec, apic->regs + APIC_ISR))
return true;

Does this sound correct?

Paolo



 + tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 + apic->lapic_timer.expired_tscdeadline = 0;
 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 +
 + while (guest_tsc < tsc_deadline) {
 + int delay = min(tsc_deadline - guest_tsc, 1000ULL);
 +
 + __delay(delay);
 + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
 + }
  }
  
  static void start_apic_timer(struct kvm_lapic *apic)
  {
   ktime_t now;
 +
  atomic_set(&apic->lapic_timer.pending, 0);
  
   if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
 @@ -1137,6 +1192,7 @@ static void start_apic_timer(struct kvm_
   /* lapic timer in tsc deadline mode */
  u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
   u64 ns = 0;
 + ktime_t expire;
   struct kvm_vcpu *vcpu = apic-vcpu;
  unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
   unsigned long flags;
 @@ -1151,8 +1207,10 @@ static void start_apic_timer(struct kvm_
  if (likely(tscdeadline > guest_tsc)) {
  ns = (tscdeadline - guest_tsc) * 1000000ULL;
  do_div(ns, this_tsc_khz);
 + expire = ktime_add_ns(now, ns);
 + expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
  hrtimer_start(&apic->lapic_timer.timer,
 - ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
 +   expire, HRTIMER_MODE_ABS);
   } else
   apic_timer_expired(apic);
  
 Index: kvm/arch/x86/kvm/lapic.h
 ===
 --- 

Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-16 Thread Marcelo Tosatti
On Tue, Dec 16, 2014 at 03:34:22PM +0100, Paolo Bonzini wrote:
 On 15/12/2014 23:06, Marcelo Tosatti wrote:
  For the hrtimer which emulates the tscdeadline timer in the guest,
  add an option to advance expiration, and busy spin on VM-entry waiting
  for the actual expiration time to elapse.
  
  This allows achieving low latencies in cyclictest (or any scenario 
  which requires strict timing regarding timer expiration).
  
  Reduces average cyclictest latency from 12us to 8us
  on Core i5 desktop.
  
  Note: this option requires tuning to find the appropriate value 
  for a particular hardware/guest combination. One method is to measure the 
  average delay between apic_timer_fn and VM-entry. 
  Another method is to start with 1000ns, and increase the value
  in say 500ns increments until avg cyclictest numbers stop decreasing.
  
  Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>
  
  Index: kvm/arch/x86/kvm/lapic.c
  ===
  --- kvm.orig/arch/x86/kvm/lapic.c
  +++ kvm/arch/x86/kvm/lapic.c
  @@ -33,6 +33,7 @@
   #include <asm/page.h>
   #include <asm/current.h>
   #include <asm/apicdef.h>
  +#include <asm/delay.h>
   #include <linux/atomic.h>
   #include <linux/jump_label.h>
   #include "kvm_cache_regs.h"
  @@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
   {
  struct kvm_vcpu *vcpu = apic->vcpu;
  wait_queue_head_t *q = &vcpu->wq;
  +   struct kvm_timer *ktimer = &apic->lapic_timer;
   
  /*
   * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
  @@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
   
  if (waitqueue_active(q))
  wake_up_interruptible(q);
  +
  +   if (apic_lvtt_tscdeadline(apic))
  +   ktimer->expired_tscdeadline = ktimer->tscdeadline;
  +}
  +
  +/*
  + * On APICv, this test will cause a busy wait
  + * during a higher-priority task.
  + */
  +
  +static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
  +{
  +   struct kvm_lapic *apic = vcpu->arch.apic;
  +   u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
  +
  +   if (kvm_apic_hw_enabled(apic)) {
  +   int vec = reg & APIC_VECTOR_MASK;
  +
  +   if (kvm_x86_ops->test_posted_interrupt)
  +   return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
  +   else {
  +   if (apic_test_vector(vec, apic->regs + APIC_ISR))
  +   return true;
  +   }
  +   }
  +   return false;
  +}
  +
  +void wait_lapic_expire(struct kvm_vcpu *vcpu)
  +{
  +   struct kvm_lapic *apic = vcpu->arch.apic;
  +   u64 guest_tsc, tsc_deadline;
  +
  +   if (!kvm_vcpu_has_lapic(vcpu))
  +   return;
  +
  +   if (apic->lapic_timer.expired_tscdeadline == 0)
  +   return;
  +
  +   if (!lapic_timer_int_injected(vcpu))
  +   return;
 
 By the time we get here, I think, if expired_tscdeadline != 0 we're sure
 that the interrupt has been injected.  It may be in IRR rather than ISR,
 but at least on APICv the last test should be redundant.
 
 So perhaps you can get rid of patch 1 and check
  kvm_apic_vid_enabled(vcpu->kvm):
 
    if (k_a_v_e(vcpu->kvm))
    return true;
    if (apic_test_vector(vec, apic->regs + APIC_ISR))
    return true;
 
 Does this sound correct?

* expired_tscdeadline != 0.
* APIC timer interrupt delivery masked at LVTT register.

Implies expired_tscdeadline != 0 and interrupt not injected.
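
In code terms, the sequence is: the guest arms the TSC deadline, then
masks LVTT before it fires; apic_timer_expired() still records
expired_tscdeadline, but the masked interrupt is dropped on delivery
(the existing delivery path honors the LVTT mask bit).  That is exactly
what the explicit test in wait_lapic_expire() catches:

	if (!lapic_timer_int_injected(vcpu))
		return;	/* timer fired but was masked: skip the busy wait */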



Re: [patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-16 Thread Paolo Bonzini


On 16/12/2014 16:13, Marcelo Tosatti wrote:
  So perhaps you can get rid of patch 1 and check
  kvm_apic_vid_enabled(vcpu->kvm):
  
 if (k_a_v_e(vcpu->kvm))
 return true;
 if (apic_test_vector(vec, apic->regs + APIC_ISR))
 return true;
  
  Does this sound correct?
 * expired_tscdeadline != 0.
 * APIC timer interrupt delivery masked at LVTT register.
 
 Implies expired_tscdeadline != 0 and interrupt not injected.

Good point.

Paolo


[patch 2/3] KVM: x86: add option to advance tscdeadline hrtimer expiration

2014-12-15 Thread Marcelo Tosatti
For the hrtimer which emulates the tscdeadline timer in the guest,
add an option to advance expiration, and busy spin on VM-entry waiting
for the actual expiration time to elapse.

This allows achieving low latencies in cyclictest (or any scenario 
which requires strict timing regarding timer expiration).

Reduces average cyclictest latency from 12us to 8us
on Core i5 desktop.

Note: this option requires tuning to find the appropriate value 
for a particular hardware/guest combination. One method is to measure the 
average delay between apic_timer_fn and VM-entry. 
Another method is to start with 1000ns, and increase the value
in say 500ns increments until avg cyclictest numbers stop decreasing.

Signed-off-by: Marcelo Tosatti <mtosa...@redhat.com>

Index: kvm/arch/x86/kvm/lapic.c
===
--- kvm.orig/arch/x86/kvm/lapic.c
+++ kvm/arch/x86/kvm/lapic.c
@@ -33,6 +33,7 @@
 #include <asm/page.h>
 #include <asm/current.h>
 #include <asm/apicdef.h>
+#include <asm/delay.h>
 #include <linux/atomic.h>
 #include <linux/jump_label.h>
 #include "kvm_cache_regs.h"
@@ -1073,6 +1074,7 @@ static void apic_timer_expired(struct kv
 {
struct kvm_vcpu *vcpu = apic->vcpu;
wait_queue_head_t *q = &vcpu->wq;
+   struct kvm_timer *ktimer = &apic->lapic_timer;
 
/*
 * Note: KVM_REQ_PENDING_TIMER is implicitly checked in
@@ -1087,11 +1089,64 @@ static void apic_timer_expired(struct kv
 
if (waitqueue_active(q))
wake_up_interruptible(q);
+
+   if (apic_lvtt_tscdeadline(apic))
+   ktimer->expired_tscdeadline = ktimer->tscdeadline;
+}
+
+/*
+ * On APICv, this test will cause a busy wait
+ * during a higher-priority task.
+ */
+
+static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u32 reg = kvm_apic_get_reg(apic, APIC_LVTT);
+
+   if (kvm_apic_hw_enabled(apic)) {
+   int vec = reg & APIC_VECTOR_MASK;
+
+   if (kvm_x86_ops->test_posted_interrupt)
+   return kvm_x86_ops->test_posted_interrupt(vcpu, vec);
+   else {
+   if (apic_test_vector(vec, apic->regs + APIC_ISR))
+   return true;
+   }
+   }
+   return false;
+}
+
+void wait_lapic_expire(struct kvm_vcpu *vcpu)
+{
+   struct kvm_lapic *apic = vcpu->arch.apic;
+   u64 guest_tsc, tsc_deadline;
+
+   if (!kvm_vcpu_has_lapic(vcpu))
+   return;
+
+   if (apic->lapic_timer.expired_tscdeadline == 0)
+   return;
+
+   if (!lapic_timer_int_injected(vcpu))
+   return;
+
+   tsc_deadline = apic->lapic_timer.expired_tscdeadline;
+   apic->lapic_timer.expired_tscdeadline = 0;
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+
+   while (guest_tsc < tsc_deadline) {
+   int delay = min(tsc_deadline - guest_tsc, 1000ULL);
+
+   __delay(delay);
+   guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+   }
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
ktime_t now;
+
atomic_set(&apic->lapic_timer.pending, 0);
 
if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) {
@@ -1137,6 +1192,7 @@ static void start_apic_timer(struct kvm_
/* lapic timer in tsc deadline mode */
u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
u64 ns = 0;
+   ktime_t expire;
struct kvm_vcpu *vcpu = apic-vcpu;
unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
unsigned long flags;
@@ -1151,8 +1207,10 @@ static void start_apic_timer(struct kvm_
if (likely(tscdeadline > guest_tsc)) {
ns = (tscdeadline - guest_tsc) * 1000000ULL;
do_div(ns, this_tsc_khz);
+   expire = ktime_add_ns(now, ns);
+   expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
hrtimer_start(&apic->lapic_timer.timer,
-   ktime_add_ns(now, ns), HRTIMER_MODE_ABS);
+ expire, HRTIMER_MODE_ABS);
} else
apic_timer_expired(apic);
 
Index: kvm/arch/x86/kvm/lapic.h
===
--- kvm.orig/arch/x86/kvm/lapic.h
+++ kvm/arch/x86/kvm/lapic.h
@@ -14,6 +14,7 @@ struct kvm_timer {
u32 timer_mode;
u32 timer_mode_mask;
u64 tscdeadline;
+   u64 expired_tscdeadline;
atomic_t pending;   /* accumulated triggered timers 
*/
 };
 
@@ -170,4 +171,6 @@ static inline bool kvm_apic_has_events(s
 
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
+void wait_lapic_expire(struct kvm_vcpu *vcpu);
+
 #endif
Index: kvm/arch/x86/kvm/x86.c
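
(Summarizing the handshake the patch introduces, as a paraphrase of the
code above:

	hrtimer armed at deadline_ns - lapic_timer_advance_ns
	  -> apic_timer_fn() -> apic_timer_expired()
	       records expired_tscdeadline = tscdeadline (tscdeadline mode)
	  -> vCPU wakeup, KVM prepares VM-entry
	  -> wait_lapic_expire() consumes and clears expired_tscdeadline,
	     then busy spins until the guest TSC reaches the original deadline
	  -> VM-entry lands, ideally, right at the guest's deadline.)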