[PATCH 0/2] Paravirt loops per jiffy calculation

2008-07-22 Thread Glauber Costa
Hey,

Over the last weeks, there has been some discussion regarding paravirt
lpj calculation for kvm. Here's my shot at that.

KVM hypervisor already put the right value in our pvclock structures,
as part of the xen compatibility efforts.

So the first patch moves the respective xen code to pvclock.c (since
we'll be doing the same calculation anyway), while the second, implements
functions for kvm that makes use of it.

Turns out that only implementing a pv get_tsc_khz is not enough, since
it will only do the right thing for cpu0, which is not what we desire.

So we set preset_lpj. Recall this code is run before arch parameter setup,
so if we pass lpj=xx in the command line, it'll still work.

Have a good (paravirtual) time!


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[PATCH 1/2] factor out cpu_khz to common code

2008-07-22 Thread Glauber Costa
KVM intends to use paravirt code to calibrate khz. Xen
current code will do just fine. So as a first step, factor out
code to pvclock.c.

Signed-off-by: Glauber Costa [EMAIL PROTECTED]
---
 arch/x86/kernel/pvclock.c |   12 
 arch/x86/xen/time.c   |   11 ++-
 include/asm-x86/pvclock.h |1 +
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a..1c54b5f 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct 
pvclock_shadow_time *dst,
return dst-version;
 }
 
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
+{
+   u64 tsc_khz = 100ULL  32;
+
+   do_div(tsc_khz, src-tsc_to_system_mul);
+   if (src-tsc_shift  0)
+   tsc_khz = -src-tsc_shift;
+   else
+   tsc_khz = src-tsc_shift;
+   return tsc_khz;
+}
+
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 {
struct pvclock_shadow_time shadow;
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b774..1eb88fe 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -200,17 +200,10 @@ unsigned long long xen_sched_clock(void)
 /* Get the TSC speed from Xen */
 unsigned long xen_tsc_khz(void)
 {
-   u64 xen_khz = 100ULL  32;
-   const struct pvclock_vcpu_time_info *info =
+   struct pvclock_vcpu_time_info *info =
HYPERVISOR_shared_info-vcpu_info[0].time;
 
-   do_div(xen_khz, info-tsc_to_system_mul);
-   if (info-tsc_shift  0)
-   xen_khz = -info-tsc_shift;
-   else
-   xen_khz = info-tsc_shift;
-
-   return xen_khz;
+   return pvclock_tsc_khz(info);
 }
 
 static cycle_t xen_clocksource_read(void)
diff --git a/include/asm-x86/pvclock.h b/include/asm-x86/pvclock.h
index 85b1bba..28783bc 100644
--- a/include/asm-x86/pvclock.h
+++ b/include/asm-x86/pvclock.h
@@ -6,6 +6,7 @@
 
 /* some helper functions for xen and kvm pv clock sources */
 cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
+unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
 void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
struct pvclock_vcpu_time_info *vcpu,
struct timespec *ts);
-- 
1.5.5.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


[PATCH 2/2] use paravirt function to calculate cpu khz

2008-07-22 Thread Glauber Costa
We're currently facing timing problems in guests that do
calibration under heavy load, and then the load vanishes.
This means we'll have a much lower lpj than we actually should,
and delays end up taking less time than they should, which is a
nasty bug.

Solution is to pass on the lpj value from host to guest, and have it
preset.

Signed-off-by: Glauber Costa [EMAIL PROTECTED]
---
 arch/x86/kernel/kvmclock.c |   30 ++
 1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def0..84db1bb 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
return ret;
 }
 
+/*
+ * If we don't do that, there is the possibility that the guest
+ * will calibrate under heavy load - thus, getting a lower lpj -
+ * and execute the delays themselves without load. This is wrong,
+ * because no delay loop can finish beforehand.
+ * Any heuristics is subject to fail, because ultimately, a large
+ * poll of guests can be running and trouble each other. So we preset
+ * lpj here
+ */
+static unsigned long kvm_get_tsc_khz(void)
+{
+   return preset_lpj;
+}
+
+static void kvm_get_preset_lpj(void)
+{
+   struct pvclock_vcpu_time_info *src;
+   unsigned long khz;
+   u64 lpj;
+
+   src = per_cpu(hv_clock, 0);
+   khz = pvclock_tsc_khz(src);
+
+lpj = ((u64)khz * 1000);
+do_div(lpj, HZ);
+   preset_lpj = lpj;
+}
+
 static struct clocksource kvm_clock = {
.name = kvm-clock,
.read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
pv_time_ops.get_wallclock = kvm_get_wallclock;
pv_time_ops.set_wallclock = kvm_set_wallclock;
pv_time_ops.sched_clock = kvm_clock_read;
+   pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
 #ifdef CONFIG_X86_LOCAL_APIC
pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
 #endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
 #ifdef CONFIG_KEXEC
machine_ops.crash_shutdown  = kvm_crash_shutdown;
 #endif
+   kvm_get_preset_lpj();
clocksource_register(kvm_clock);
}
 }
-- 
1.5.5.1

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH] tun: Fix/rewrite packet filtering logic

2008-07-22 Thread Jeff Garzik
Max Krasnyansky wrote:
 Please see the following thread to get some context on this
   http://marc.info/?l=linux-netdevm=121564433018903w=2
 
 Basically the issue is that current multi-cast filtering stuff in
 the TUN/TAP driver is seriously broken.
 Original patch went in without proper review and ACK. It was broken and
 confusing to start with and subsequent patches broke it completely.
 To give you an idea of what's broken here are some of the issues:
 
 - Very confusing comments throughout the code that imply that the
 character device is a network interface in its own right, and that packets
 are passed between the two nics. Which is completely wrong.
 
 - Wrong set of ioctls is used for setting up filters. They look like
 shortcuts for manipulating state of the tun/tap network interface but
 in reality manipulate the state of the TX filter.
 
 - ioctls that were originally used for setting address of the the TX filter
 got fixed and now set the address of the network interface itself. Which
 made filter totaly useless.
 
 - Filtering is done too late. Instead of filtering early on, to avoid
 unnecessary wakeups, filtering is done in the read() call.
 
 The list goes on and on :)
 
 So the patch cleans all that up. It introduces simple and clean interface for
 setting up TX filters (TUNSETTXFILTER + tun_filter spec) and does filtering
 before enqueuing the packets.
 
 TX filtering is useful in the scenarios where TAP is part of a bridge, in
 which case it gets all broadcast, multicast and potentially other packets when
 the bridge is learning. So for example Ethernet tunnelling app may want to
 setup TX filters to avoid tunnelling multicast traffic. QEMU and other
 hypervisors can push RX filtering that is currently done in the guest into the
 host context therefore saving wakeups and unnecessary data transfer.
 
 This is on top of the latest and greatest :). Assuming virt folks are ok with
 the API this should go into 2.6.27.
 
 Signed-off-by: Max Krasnyansky [EMAIL PROTECTED]
 ---
  drivers/net/tun.c  |  316 
 +++-
  include/linux/if_tun.h |   24 +++-
  2 files changed, 174 insertions(+), 166 deletions(-)
 
 diff --git a/drivers/net/tun.c b/drivers/net/tun.c
 index 2693f88..901551c 100644
 --- a/drivers/net/tun.c
 +++ b/drivers/net/tun.c
 @@ -18,15 +18,11 @@
  /*
   *  Changes:
   *
 - *  Brian Braunstein [EMAIL PROTECTED] 2007/03/23
 - *Fixed hw address handling.  Now net_device.dev_addr is kept consistent
 - *with tun.dev_addr when the address is set by this module.
 - *
   *  Mike Kershaw [EMAIL PROTECTED] 2005/08/14
   *Add TUNSETLINK ioctl to set the link encapsulation
   *
   *  Mark Smith [EMAIL PROTECTED]
 - *   Use random_ether_addr() for tap MAC address.
 + *Use random_ether_addr() for tap MAC address.
   *
   *  Harald Roelle [EMAIL PROTECTED]  2004/04/20
   *Fixes in packet dropping, queue length setting and queue wakeup.
 @@ -83,9 +79,16 @@ static int debug;
  #define DBG1( a... )
  #endif
  
 +#define FLT_EXACT_COUNT 8
 +struct tap_filter {
 + unsigned intcount;/* Number of addrs. Zero means disabled */
 + u32 mask[2];  /* Mask of the hashed addrs */
 + unsigned char   addr[FLT_EXACT_COUNT][ETH_ALEN];
 +};
 +
  struct tun_struct {
   struct list_headlist;
 - unsigned long   flags;
 + unsigned intflags;
   int attached;
   uid_t   owner;
   gid_t   group;
 @@ -94,19 +97,119 @@ struct tun_struct {
   struct sk_buff_head readq;
  
   struct net_device   *dev;
 + struct fasync_struct*fasync;
  
 - struct fasync_struct*fasync;
 -
 - unsigned long if_flags;
 - u8 dev_addr[ETH_ALEN];
 - u32 chr_filter[2];
 - u32 net_filter[2];
 + struct tap_filter   txflt;
  
  #ifdef TUN_DEBUG
   int debug;
  #endif
  };
  
 +/* TAP filterting */
 +static void addr_hash_set(u32 *mask, const u8 *addr)
 +{
 + int n = ether_crc(ETH_ALEN, addr)  26;
 + mask[n  5] |= (1  (n  31));
 +}
 +
 +static unsigned int addr_hash_test(const u32 *mask, const u8 *addr)
 +{
 + int n = ether_crc(ETH_ALEN, addr)  26;
 + return mask[n  5]  (1  (n  31));
 +}
 +
 +static int update_filter(struct tap_filter *filter, void __user *arg)
 +{
 + struct { u8 u[ETH_ALEN]; } *addr;
 + struct tun_filter uf;
 + int err, alen, n, nexact;
 +
 + if (copy_from_user(uf, arg, sizeof(uf)))
 + return -EFAULT;
 +
 + if (!uf.count) {
 + /* Disabled */
 + filter-count = 0;
 + return 0;
 + }
 +
 + alen = ETH_ALEN * uf.count;
 + addr = kmalloc(alen, GFP_KERNEL);
 + if (!addr)
 + return -ENOMEM;
 +
 + if (copy_from_user(addr, arg + sizeof(uf), alen)) {
 + err = -EFAULT;
 + goto done;
 + }
 +
 + /* The filter is updated without holding any locks. 

Re: [PATCH] tun: Fix/rewrite packet filtering logic

2008-07-22 Thread David Miller
From: Jeff Garzik [EMAIL PROTECTED]
Date: Tue, 22 Jul 2008 19:41:47 -0400

 looks mostly OK, but stuff like the above should be
 
   (void __user *) arg
 
 Did you check this with sparse (Documentation/sparse.txt)?

Jeff, I already added this particular patch to the tree
a week or so ago.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization


Re: [PATCH] tun: Fix/rewrite packet filtering logic

2008-07-22 Thread Jeff Garzik
David Miller wrote:
 From: Jeff Garzik [EMAIL PROTECTED]
 Date: Tue, 22 Jul 2008 19:41:47 -0400
 
 looks mostly OK, but stuff like the above should be

  (void __user *) arg

 Did you check this with sparse (Documentation/sparse.txt)?
 
 Jeff, I already added this particular patch to the tree
 a week or so ago.

Yeah, later on in my queue were the fixes.

Jeff



___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linux-foundation.org/mailman/listinfo/virtualization